Add compression to static corpus data, use chunked computation, hide read corpus ids in corpus analysis

This commit is contained in:
Patrick Jentsch
2023-07-06 13:02:22 +02:00
parent 413b6111df
commit a9973e9c8e
7 changed files with 148 additions and 84 deletions

View File

@ -28,19 +28,19 @@ def _create_build_corpus_service(corpus):
''' ## Command ## '''
command = ['bash', '-c']
command.append(
f'mkdir /corpora/data/nopaque_{corpus.id}'
f'mkdir /corpora/data/nopaque-{corpus.hashid.lower()}'
' && '
'cwb-encode'
' -c utf8'
f' -d /corpora/data/nopaque_{corpus.id}'
f' -d /corpora/data/nopaque-{corpus.hashid.lower()}'
' -f /root/files/corpus.vrt'
f' -R /usr/local/share/cwb/registry/nopaque_{corpus.id}'
f' -R /usr/local/share/cwb/registry/nopaque-{corpus.hashid.lower()}'
' -P pos -P lemma -P simple_pos'
' -S ent:0+type -S s:0'
' -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title'
' -xsB -9'
' && '
f'cwb-make -V NOPAQUE_{corpus.id}'
f'cwb-make -V NOPAQUE-{corpus.hashid.upper()}'
)
''' ## Constraints ## '''
constraints = ['node.role==worker']
@ -149,11 +149,15 @@ def _create_cqpserver_container(corpus):
''' ### Corpus data volume ### '''
data_volume_source = os.path.join(corpus.path, 'cwb', 'data')
data_volume_target = '/corpora/data'
# data_volume_source = os.path.join(corpus.path, 'cwb', 'data', f'nopaque_{corpus.id}')
# data_volume_target = f'/corpora/data/nopaque_{corpus.hashid.lower()}'
data_volume = f'{data_volume_source}:{data_volume_target}:rw'
volumes.append(data_volume)
''' ### Corpus registry volume ### '''
registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry')
registry_volume_target = '/usr/local/share/cwb/registry'
# registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry', f'nopaque_{corpus.id}')
# registry_volume_target = f'/usr/local/share/cwb/registry/nopaque_{corpus.hashid.lower()}'
registry_volume = f'{registry_volume_source}:{registry_volume_target}:rw'
volumes.append(registry_volume)
# Check if a cqpserver container already exists. If this is the case,