Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git (synced 2024-12-25 02:44:18 +00:00)
Merge branch 'visualizations-update' of gitlab.ub.uni-bielefeld.de:sfb1288inf/nopaque into visualizations-update
commit 22b43a689f
@@ -19,6 +19,9 @@ def reset():
     for corpus in [x for x in Corpus.query.all() if x.status in status]:
         print(f'Resetting corpus {corpus}')
         shutil.rmtree(os.path.join(corpus.path, 'cwb'), ignore_errors=True)
+        os.mkdir(os.path.join(corpus.path, 'cwb'))
+        os.mkdir(os.path.join(corpus.path, 'cwb', 'data'))
+        os.mkdir(os.path.join(corpus.path, 'cwb', 'registry'))
         corpus.status = CorpusStatus.UNPREPARED
         corpus.num_analysis_sessions = 0
     db.session.commit()
@@ -7,6 +7,7 @@ from threading import Lock
 from app import db, hashids, socketio
 from app.decorators import socketio_login_required
 from app.models import Corpus, CorpusStatus
+import math


 '''
@@ -83,7 +84,7 @@ def connect(auth):
         socketio.sleep(3)
         retry_counter -= 1
         db.session.refresh(corpus)
-    cqi_client = CQiClient(f'cqpserver_{corpus_id}')
+    cqi_client = CQiClient(f'cqpserver_{corpus_id}', timeout=math.inf)
     session['cqi_over_sio'] = {
         'corpus_id': corpus_id,
         'cqi_client': cqi_client,
@@ -1,9 +1,14 @@
 from collections import Counter
 from cqi import CQiClient
-from cqi.models.corpora import Corpus
-from cqi.status import StatusOk
+from cqi.models.corpora import Corpus as CQiCorpus
+from cqi.models.attributes import (
+    PositionalAttribute as CQiPositionalAttribute,
+    StructuralAttribute as CQiStructuralAttribute
+)
+from cqi.status import StatusOk as CQiStatusOk
 from flask import session
 from typing import Dict, List
+import gzip
 import json
 import math
 import os
@@ -22,43 +27,28 @@ CQI_EXTENSION_FUNCTION_NAMES: List[str] = [
 ]


-def ext_corpus_update_db(corpus: str):
+def ext_corpus_update_db(corpus: str) -> CQiStatusOk:
     db_corpus = Corpus.query.get(session['cqi_over_sio']['corpus_id'])
     cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
-    cqi_corpus = cqi_client.corpora.get(corpus)
+    cqi_corpus: CQiCorpus = cqi_client.corpora.get(corpus)
     db_corpus.num_tokens = cqi_corpus.size
     db.session.commit()
-    return StatusOk()
+    return CQiStatusOk()


 def ext_corpus_static_data(corpus: str) -> Dict:
     db_corpus = Corpus.query.get(session['cqi_over_sio']['corpus_id'])
-    static_corpus_data_file = os.path.join(db_corpus.path, 'cwb', 'static.json')
+    static_corpus_data_file = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
     if os.path.exists(static_corpus_data_file):
-        with open(static_corpus_data_file, 'r') as f:
-            return json.load(f)
+        with open(static_corpus_data_file, 'rb') as f:
+            return f.read()
     cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
-    cqi_corpus = cqi_client.corpora.get(corpus)
-    ##########################################################################
-    # A faster way to get cpos boundaries for smaller s_attrs #
-    ##########################################################################
-    # cqi_corpus.query('Last', '<s> []* </s>;')
-    # cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
-    # print(cqi_subcorpus.size)
-    # first_match = 0
-    # last_match = cqi_subcorpus.attrs['size'] - 1
-    # match_boundaries = zip(
-    # list(range(first_match, last_match + 1)),
-    # cqi_subcorpus.dump(cqi_subcorpus.attrs['fields']['match'], first_match, last_match),
-    # cqi_subcorpus.dump(cqi_subcorpus.attrs['fields']['matchend'], first_match, last_match)
-    # )
-    # for x in match_boundaries:
-    # print(x)
-    cqi_p_attrs = {
+    cqi_corpus: CQiCorpus = cqi_client.corpora.get(corpus)
+    cqi_p_attrs: Dict[str, CQiPositionalAttribute] = {
         p_attr.name: p_attr
         for p_attr in cqi_corpus.positional_attributes.list()
     }
-    cqi_s_attrs = {
+    cqi_s_attrs: Dict[str, CQiStructuralAttribute] = {
         s_attr.name: s_attr
         for s_attr in cqi_corpus.structural_attributes.list()
     }
@@ -75,73 +65,121 @@ def ext_corpus_static_data(corpus: str) -> Dict:
         'values': {'p_attrs': {}, 's_attrs': {}}
     }
     for p_attr in cqi_p_attrs.values():
-        static_corpus_data['corpus']['freqs'][p_attr.name] = dict(
-            zip(
-                range(0, p_attr.lexicon_size),
-                p_attr.freqs_by_ids(list(range(0, p_attr.lexicon_size)))
-            )
-        )
+        static_corpus_data['corpus']['freqs'][p_attr.name] = {}
+        chunk_size = 10000
+        p_attr_id_list = list(range(p_attr.lexicon_size))
+        chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
+        del p_attr_id_list
+        for chunk in chunks:
+            # print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
+            static_corpus_data['corpus']['freqs'][p_attr.name].update(
+                dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
+            )
+        del chunks
-        static_corpus_data['p_attrs'][p_attr.name] = dict(
-            zip(
-                range(0, cqi_corpus.size),
-                p_attr.ids_by_cpos(list(range(0, cqi_corpus.size)))
-            )
-        )
+        static_corpus_data['p_attrs'][p_attr.name] = {}
+        cpos_list = list(range(cqi_corpus.size))
+        chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
+        del cpos_list
+        for chunk in chunks:
+            # print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
+            static_corpus_data['p_attrs'][p_attr.name].update(
+                dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
+            )
+        del chunks
-        static_corpus_data['values']['p_attrs'][p_attr.name] = dict(
-            zip(
-                range(0, p_attr.lexicon_size),
-                p_attr.values_by_ids(list(range(0, p_attr.lexicon_size)))
-            )
-        )
+        static_corpus_data['values']['p_attrs'][p_attr.name] = {}
+        p_attr_id_list = list(range(p_attr.lexicon_size))
+        chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
+        del p_attr_id_list
+        for chunk in chunks:
+            # print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
+            static_corpus_data['values']['p_attrs'][p_attr.name].update(
+                dict(zip(chunk, p_attr.values_by_ids(chunk)))
+            )
+        del chunks
     for s_attr in cqi_s_attrs.values():
         if s_attr.has_values:
             continue
         static_corpus_data['corpus']['counts'][s_attr.name] = s_attr.size
         static_corpus_data['s_attrs'][s_attr.name] = {'lexicon': {}, 'values': None}
         static_corpus_data['values']['s_attrs'][s_attr.name] = {}
-        if s_attr.name in ['s', 'ent']:
-            cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
-            cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
-            first_match = 0
-            last_match = cqi_subcorpus.size - 1
-            match_boundaries = zip(
-                range(first_match, last_match + 1),
-                cqi_subcorpus.dump(cqi_subcorpus.fields['match'], first_match, last_match),
-                cqi_subcorpus.dump(cqi_subcorpus.fields['matchend'], first_match, last_match)
-            )
-            for id, lbound, rbound in match_boundaries:
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
-            cqi_subcorpus.drop()
+        ##########################################################################
+        # A faster way to get cpos boundaries for smaller s_attrs #
+        ##########################################################################
+        # if s_attr.name in ['s', 'ent']:
+        # cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
+        # cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
+        # first_match = 0
+        # last_match = cqi_subcorpus.size - 1
+        # match_boundaries = zip(
+        # range(first_match, last_match + 1),
+        # cqi_subcorpus.dump(cqi_subcorpus.fields['match'], first_match, last_match),
+        # cqi_subcorpus.dump(cqi_subcorpus.fields['matchend'], first_match, last_match)
+        # )
+        # for id, lbound, rbound in match_boundaries:
+        # static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
+        # static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+        # static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
+        # static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
+        # cqi_subcorpus.drop()
         for id in range(0, s_attr.size):
-            if s_attr.name not in ['s', 'ent']:
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
-                lbound, rbound = s_attr.cpos_by_id(id)
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
-            if s_attr.name not in ['text', 's']:
-                continue
-            cpos_range = range(lbound, rbound + 1)
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len({x for x in cqi_s_attrs['ent'].ids_by_cpos(list(cpos_range)) if x != -1})
+            # print(f's_attrs.{s_attr.name}.lexicon.{id}')
+            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
+                'bounds': None,
+                'counts': None,
+                'freqs': None
+            }
             if s_attr.name != 'text':
                 continue
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len({x for x in cqi_s_attrs['s'].ids_by_cpos(list(cpos_range)) if x != -1})
+            lbound, rbound = s_attr.cpos_by_id(id)
+            # print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
+            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+            # print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
+            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
+            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
+            cpos_list = list(range(lbound, rbound + 1))
+            chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
+            del cpos_list
+            ent_ids = set()
+            for chunk in chunks:
+                # print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
+                ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
+            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
+            del ent_ids
+            s_ids = set()
+            for chunk in chunks:
+                # print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
+                s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
+            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
+            del s_ids
+            # print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
             static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
             for p_attr in cqi_p_attrs.values():
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr.ids_by_cpos(list(cpos_range))))
+                p_attr_ids = []
+                for chunk in chunks:
+                    # print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
+                    p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
+                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
+                del p_attr_ids
+            del chunks
         sub_s_attrs = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
-        s_attr_value_names = [
+        s_attr_value_names: List[str] = [
             sub_s_attr.name[(len(s_attr.name) + 1):]
             for sub_s_attr in sub_s_attrs
         ]
-        sub_s_attr_values = [
-            sub_s_attr.values_by_ids(list(range(0, s_attr.size)))
-            for sub_s_attr in sub_s_attrs
-        ]
+        s_attr_id_list = list(range(s_attr.size))
+        chunks = [s_attr_id_list[i:i+chunk_size] for i in range(0, len(s_attr_id_list), chunk_size)]
+        del s_attr_id_list
+        sub_s_attr_values = []
+        for sub_s_attr in sub_s_attrs:
+            tmp = []
+            for chunk in chunks:
+                tmp.extend(sub_s_attr.values_by_ids(chunk))
+            sub_s_attr_values.append(tmp)
+            del tmp
+        del chunks
+        # print(f's_attrs.{s_attr.name}.values')
         static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
+        # print(f'values.s_attrs.{s_attr.name}')
         static_corpus_data['values']['s_attrs'][s_attr.name] = {
             s_attr_id: {
                 s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
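Note: the hunk above replaces each single bulk CQi request (freqs_by_ids, ids_by_cpos, values_by_ids over the full id or cpos range) with requests over fixed-size chunks of 10000 ids whose partial results are merged. A minimal sketch of that pattern, assuming a hypothetical fetch_by_ids callable as a stand-in for the CQi attribute methods:

    from typing import Callable, Dict, List

    def fetch_in_chunks(
        ids: List[int],
        fetch_by_ids: Callable[[List[int]], List],  # hypothetical stand-in, e.g. p_attr.freqs_by_ids
        chunk_size: int = 10000
    ) -> Dict[int, object]:
        # Split the id list into chunks of at most chunk_size and merge the
        # per-chunk results into a single id -> value mapping.
        result = {}
        for i in range(0, len(ids), chunk_size):
            chunk = ids[i:i + chunk_size]
            result.update(dict(zip(chunk, fetch_by_ids(chunk))))
        return result

    # e.g. fetch_in_chunks(list(range(25000)), lambda chunk: [x * 2 for x in chunk])

Keeping each request bounded avoids sending one huge id list per CQi call; the trade-off is more round trips.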
@@ -150,9 +188,12 @@ def ext_corpus_static_data(corpus: str) -> Dict:
                 )
             } for s_attr_id_idx, s_attr_id in enumerate(range(0, s_attr.size))
         }
-    with open(static_corpus_data_file, 'w') as f:
+        del sub_s_attr_values
+    with gzip.open(static_corpus_data_file, 'wt') as f:
         json.dump(static_corpus_data, f)
-    return static_corpus_data
+    del static_corpus_data
+    with open(static_corpus_data_file, 'rb') as f:
+        return f.read()


 def ext_corpus_paginate_corpus(
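Note: with this change ext_corpus_static_data caches the computed data as gzip-compressed JSON (static.json.gz) and always returns the raw compressed bytes instead of a parsed dict; the browser inflates and parses them (see the pako hunk further below). A rough sketch of that write/read round trip, using a placeholder path and payload:

    import gzip
    import json
    import os

    def cached_static_data(path: str, data: dict) -> bytes:
        # Write the dict as gzip-compressed JSON on the first call ...
        if not os.path.exists(path):
            with gzip.open(path, 'wt') as f:
                json.dump(data, f)
        # ... afterwards serve the compressed file content as-is.
        with open(path, 'rb') as f:
            return f.read()

    # payload = cached_static_data('/tmp/static.json.gz', {'corpus': {'counts': {}}})
    # json.loads(gzip.decompress(payload))  # the client does the same via pako.inflate + JSON.parse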
@@ -28,19 +28,19 @@ def _create_build_corpus_service(corpus):
     ''' ## Command ## '''
     command = ['bash', '-c']
     command.append(
-        f'mkdir /corpora/data/nopaque_{corpus.id}'
+        f'mkdir /corpora/data/nopaque-{corpus.hashid.lower()}'
         ' && '
         'cwb-encode'
         ' -c utf8'
-        f' -d /corpora/data/nopaque_{corpus.id}'
+        f' -d /corpora/data/nopaque-{corpus.hashid.lower()}'
         ' -f /root/files/corpus.vrt'
-        f' -R /usr/local/share/cwb/registry/nopaque_{corpus.id}'
+        f' -R /usr/local/share/cwb/registry/nopaque-{corpus.hashid.lower()}'
         ' -P pos -P lemma -P simple_pos'
         ' -S ent:0+type -S s:0'
         ' -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title'
         ' -xsB -9'
         ' && '
-        f'cwb-make -V NOPAQUE_{corpus.id}'
+        f'cwb-make -V NOPAQUE-{corpus.hashid.upper()}'
     )
     ''' ## Constraints ## '''
     constraints = ['node.role==worker']
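Note: the build command above now derives the CWB data directory, registry path and corpus name from the corpus hashid instead of the numeric database id. A small illustration of the three names, assuming a hypothetical hashid value:

    hashid = 'AbCd1'  # hypothetical hashid, only for illustration

    data_dir = f'/corpora/data/nopaque-{hashid.lower()}'
    registry_entry = f'/usr/local/share/cwb/registry/nopaque-{hashid.lower()}'
    cwb_corpus_name = f'NOPAQUE-{hashid.upper()}'

    # data_dir         -> /corpora/data/nopaque-abcd1
    # registry_entry   -> /usr/local/share/cwb/registry/nopaque-abcd1
    # cwb_corpus_name  -> NOPAQUE-ABCD1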
@@ -149,11 +149,15 @@ def _create_cqpserver_container(corpus):
     ''' ### Corpus data volume ### '''
     data_volume_source = os.path.join(corpus.path, 'cwb', 'data')
     data_volume_target = '/corpora/data'
+    # data_volume_source = os.path.join(corpus.path, 'cwb', 'data', f'nopaque_{corpus.id}')
+    # data_volume_target = f'/corpora/data/nopaque_{corpus.hashid.lower()}'
     data_volume = f'{data_volume_source}:{data_volume_target}:rw'
     volumes.append(data_volume)
     ''' ### Corpus registry volume ### '''
     registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry')
     registry_volume_target = '/usr/local/share/cwb/registry'
+    # registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry', f'nopaque_{corpus.id}')
+    # registry_volume_target = f'/usr/local/share/cwb/registry/nopaque_{corpus.hashid.lower()}'
     registry_volume = f'{registry_volume_source}:{registry_volume_target}:rw'
     volumes.append(registry_volume)
     # Check if a cqpserver container already exists. If this is the case,
@@ -1607,9 +1607,14 @@ class Corpus(HashidMixin, db.Model):
         return corpus

     def build(self):
+        build_dir = os.path.join(self.path, 'cwb')
+        shutil.rmtree(build_dir, ignore_errors=True)
+        os.mkdir(build_dir)
+        os.mkdir(os.path.join(build_dir, 'data'))
+        os.mkdir(os.path.join(build_dir, 'registry'))
         corpus_element = ET.fromstring('<corpus>\n</corpus>')
         for corpus_file in self.files:
-            normalized_vrt_path = os.path.join(self.path, 'cwb', f'{corpus_file.id}.norm.vrt')
+            normalized_vrt_path = os.path.join(build_dir, f'{corpus_file.id}.norm.vrt')
             try:
                 normalize_vrt_file(corpus_file.path, normalized_vrt_path)
             except:
@@ -1636,7 +1641,7 @@ class Corpus(HashidMixin, db.Model):
             # corpus_element.insert(1, text_element)
             corpus_element.append(text_element)
         ET.ElementTree(corpus_element).write(
-            os.path.join(self.path, 'cwb', 'corpus.vrt'),
+            os.path.join(build_dir, 'corpus.vrt'),
             encoding='utf-8'
         )
         self.status = CorpusStatus.SUBMITTED
@@ -138,7 +138,15 @@ cqi.models.corpora.CorpusCollection = class CorpusCollection extends cqi.models.
     /************************************************************************
      * Custom additions for nopaque *
      ************************************************************************/
-    returnValue.static_data = await this.client.api.ext_corpus_static_data(corpusName);
+    // returnValue.static_data = await this.client.api.ext_corpus_static_data(corpusName);
+    let tmp = await this.client.api.ext_corpus_static_data(corpusName);
+    console.log(tmp);
+    let inflated = pako.inflate(tmp);
+    console.log(inflated);
+    let decoder = new TextDecoder('utf-8');
+    console.log(decoder);
+    let decoded = decoder.decode(inflated);
+    returnValue.static_data = JSON.parse(decoded);
     return returnValue;
   }

@@ -2,6 +2,7 @@
 <script src="https://cdnjs.cloudflare.com/ajax/libs/list.js/2.3.1/list.min.js" integrity="sha512-93wYgwrIFL+b+P3RvYxi/WUFRXXUDSLCT2JQk9zhVGXuS2mHl2axj6d+R6pP+gcU5isMHRj1u0oYE/mWyt/RjA==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.5.4/socket.io.min.js" integrity="sha512-HTENHrkQ/P0NGDFd5nk6ibVtCkcM7jhr2c7GyvXp5O+4X6O5cQO9AhqFzM+MdeBivsX7Hoys2J7pp2wdgMpCvw==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/plotly.js/2.24.2/plotly.min.js" integrity="sha512-dAXqGCq94D0kgLSPnfvd/pZpCMoJQpGj2S2XQmFQ9Ay1+96kbjss02ISEh+TBNXMggGg/1qoMcOHcxg+Op/Jmw==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/pako/2.1.0/pako_inflate.min.js" integrity="sha512-mlnC6JeOvg9V4vBpWMxGKscsCdScB6yvGVCeFF2plnQMRmwH69s9F8SHPbC0oirqfePmRBhqx2s3Bx7WIvHfWg==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>

 {%- assets
     filters='rjsmin',