Use new cqi version. No chunking needed anymore

Patrick Jentsch 2023-09-08 11:12:43 +02:00
parent aad347caa0
commit 9200837e63
4 changed files with 109 additions and 137 deletions
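For context, a minimal sketch of the pattern this commit removes, using the attribute calls that appear in the diff below (the surrounding setup is illustrative, not taken from the repository): with the old cqi version, large id lists were split into fixed-size chunks to keep individual CQi requests small; with cqi >= 0.1.7 the whole range goes through in one call.

# Sketch only: `p_attr` stands for a cqi positional attribute as used in
# the diff below; chunk_size and the loop mirror the code being removed.

# Before: split id lists into chunks of 10000.
chunk_size = 10000
p_attr_id_list = list(range(p_attr.lexicon_size))
chunks = [p_attr_id_list[i:i+chunk_size]
          for i in range(0, len(p_attr_id_list), chunk_size)]
freqs = {}
for chunk in chunks:
    freqs.update(dict(zip(chunk, p_attr.freqs_by_ids(chunk))))

# After (cqi >= 0.1.7): one call over the whole lexicon.
freqs = p_attr.freqs_by_ids(list(range(p_attr.lexicon_size)))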

View File

@@ -121,10 +121,7 @@ class CQiNamespace(Namespace):
             socketio.sleep(3)
             retry_counter -= 1
             db.session.refresh(db_corpus)
-        cqi_client: CQiClient = CQiClient(
-            f'cqpserver_{db_corpus_id}',
-            timeout=float('inf')
-        )
+        cqi_client: CQiClient = CQiClient(f'cqpserver_{db_corpus_id}')
         session['cqi_over_sio'] = {
             'cqi_client': cqi_client,
             'cqi_client_lock': Lock(),

View File

@@ -1,6 +1,7 @@
 from collections import Counter
 from cqi import CQiClient
 from cqi.models.corpora import Corpus as CQiCorpus
+from cqi.models.subcorpora import Subcorpus as CQiSubcorpus
 from cqi.models.attributes import (
     PositionalAttribute as CQiPositionalAttribute,
     StructuralAttribute as CQiStructuralAttribute
@@ -40,161 +41,132 @@ def ext_corpus_update_db(corpus: str) -> CQiStatusOk:
 def ext_corpus_static_data(corpus: str) -> Dict:
     db_corpus_id: int = session['cqi_over_sio']['db_corpus_id']
     db_corpus: Corpus = Corpus.query.get(db_corpus_id)
-    cache_file_path: str = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
-    if os.path.exists(cache_file_path):
-        with open(cache_file_path, 'rb') as f:
+
+    static_data_file_path: str = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
+    if os.path.exists(static_data_file_path):
+        with open(static_data_file_path, 'rb') as f:
             return f.read()
+
     cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
     cqi_corpus: CQiCorpus = cqi_client.corpora.get(corpus)
-    cqi_p_attrs: Dict[str, CQiPositionalAttribute] = {
-        p_attr.name: p_attr
-        for p_attr in cqi_corpus.positional_attributes.list()
-    }
-    cqi_s_attrs: Dict[str, CQiStructuralAttribute] = {
-        s_attr.name: s_attr
-        for s_attr in cqi_corpus.structural_attributes.list()
-    }
-    static_corpus_data = {
+    cqi_p_attrs: List[CQiPositionalAttribute] = cqi_corpus.positional_attributes.list()
+    cqi_s_attrs: List[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list()
+
+    static_data = {
         'corpus': {
             'bounds': [0, cqi_corpus.size - 1],
-            'counts': {
-                'token': cqi_corpus.size
-            },
             'freqs': {}
         },
         'p_attrs': {},
         's_attrs': {},
         'values': {'p_attrs': {}, 's_attrs': {}}
     }
-    for p_attr in cqi_p_attrs.values():
-        static_corpus_data['corpus']['freqs'][p_attr.name] = {}
-        chunk_size = 10000
-        p_attr_id_list = list(range(p_attr.lexicon_size))
-        chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
+
+    for p_attr in cqi_p_attrs:
+        print(f'corpus.freqs.{p_attr.name}')
+        static_data['corpus']['freqs'][p_attr.name] = []
+        p_attr_id_list: List[int] = list(range(p_attr.lexicon_size))
+        static_data['corpus']['freqs'][p_attr.name].extend(p_attr.freqs_by_ids(p_attr_id_list))
         del p_attr_id_list
-        for chunk in chunks:
-            # print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
-            static_corpus_data['corpus']['freqs'][p_attr.name].update(
-                dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
-            )
-        del chunks
-        static_corpus_data['p_attrs'][p_attr.name] = {}
-        cpos_list = list(range(cqi_corpus.size))
-        chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
+
+        print(f'p_attrs.{p_attr.name}')
+        static_data['p_attrs'][p_attr.name] = []
+        cpos_list: List[int] = list(range(cqi_corpus.size))
+        static_data['p_attrs'][p_attr.name].extend(p_attr.ids_by_cpos(cpos_list))
         del cpos_list
-        for chunk in chunks:
-            # print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
-            static_corpus_data['p_attrs'][p_attr.name].update(
-                dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
-            )
-        del chunks
-        static_corpus_data['values']['p_attrs'][p_attr.name] = {}
-        p_attr_id_list = list(range(p_attr.lexicon_size))
-        chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
+
+        print(f'values.p_attrs.{p_attr.name}')
+        static_data['values']['p_attrs'][p_attr.name] = []
+        p_attr_id_list: List[int] = list(range(p_attr.lexicon_size))
+        static_data['values']['p_attrs'][p_attr.name].extend(p_attr.values_by_ids(p_attr_id_list))
         del p_attr_id_list
-        for chunk in chunks:
-            # print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
-            static_corpus_data['values']['p_attrs'][p_attr.name].update(
-                dict(zip(chunk, p_attr.values_by_ids(chunk)))
-            )
-        del chunks
-    for s_attr in cqi_s_attrs.values():
+
+    for s_attr in cqi_s_attrs:
         if s_attr.has_values:
             continue
-        static_corpus_data['corpus']['counts'][s_attr.name] = s_attr.size
-        static_corpus_data['s_attrs'][s_attr.name] = {'lexicon': {}, 'values': None}
-        static_corpus_data['values']['s_attrs'][s_attr.name] = {}
-        ##########################################################################
-        # A faster way to get cpos boundaries for smaller s_attrs                #
-        ##########################################################################
-        # if s_attr.name in ['s', 'ent']:
-        #     cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
-        #     cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
-        #     first_match = 0
-        #     last_match = cqi_subcorpus.size - 1
-        #     match_boundaries = zip(
-        #         range(first_match, last_match + 1),
-        #         cqi_subcorpus.dump(cqi_subcorpus.fields['match'], first_match, last_match),
-        #         cqi_subcorpus.dump(cqi_subcorpus.fields['matchend'], first_match, last_match)
-        #     )
-        #     for id, lbound, rbound in match_boundaries:
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
-        #     cqi_subcorpus.drop()
-        for id in range(0, s_attr.size):
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
-                'bounds': None,
-                'counts': None,
-                'freqs': None
-            }
-            if s_attr.name != 'text':
-                continue
-            lbound, rbound = s_attr.cpos_by_id(id)
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
-            cpos_list = list(range(lbound, rbound + 1))
-            chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
-            del cpos_list
-            ent_ids = set()
-            for chunk in chunks:
-                # print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
-                ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
-            del ent_ids
-            s_ids = set()
-            for chunk in chunks:
-                # print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
-                s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
-            del s_ids
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
-            for p_attr in cqi_p_attrs.values():
-                p_attr_ids = []
-                for chunk in chunks:
-                    # print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
-                    p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
+
+        static_data['s_attrs'][s_attr.name] = {'lexicon': [], 'values': None}
+
+        if s_attr.name in ['s', 'ent']:
+            ##############################################################
+            # A faster way to get cpos boundaries for smaller s_attrs    #
+            # Note: Needs more testing, don't use it in production       #
+            ##############################################################
+            cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
+            cqi_subcorpus: CQiSubcorpus = cqi_corpus.subcorpora.get('Last')
+            first_match: int = 0
+            last_match: int = cqi_subcorpus.size - 1
+            match_boundaries = zip(
+                range(first_match, last_match + 1),
+                cqi_subcorpus.dump(
+                    cqi_subcorpus.fields['match'],
+                    first_match,
+                    last_match
+                ),
+                cqi_subcorpus.dump(
+                    cqi_subcorpus.fields['matchend'],
+                    first_match,
+                    last_match
+                )
+            )
+            cqi_subcorpus.drop()
+            del cqi_subcorpus, first_match, last_match
+            for id, lbound, rbound in match_boundaries:
+                static_data['s_attrs'][s_attr.name]['lexicon'].append({})
+                print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
+                static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+            del match_boundaries
+
+        if s_attr.name != 'text':
+            continue
+
+        for id in range(0, s_attr.size):
+            static_data['s_attrs'][s_attr.name]['lexicon'].append({})
+            # This is a very slow operation, thats why we only use it for
+            # the text attribute
+            lbound, rbound = s_attr.cpos_by_id(id)
+            print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
+            static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+            static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
+            cpos_list: List[int] = list(range(lbound, rbound + 1))
+            for p_attr in cqi_p_attrs:
+                p_attr_ids: List[int] = []
+                p_attr_ids.extend(p_attr.ids_by_cpos(cpos_list))
+                print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs.{p_attr.name}')
+                static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
                 del p_attr_ids
-            del chunks
-        sub_s_attrs = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
-        s_attr_value_names: List[str] = [
+            del cpos_list
+
+        sub_s_attrs: List[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
+        print(f's_attrs.{s_attr.name}.values')
+        static_data['s_attrs'][s_attr.name]['values'] = [
             sub_s_attr.name[(len(s_attr.name) + 1):]
             for sub_s_attr in sub_s_attrs
         ]
-        s_attr_id_list = list(range(s_attr.size))
-        chunks = [s_attr_id_list[i:i+chunk_size] for i in range(0, len(s_attr_id_list), chunk_size)]
-        del s_attr_id_list
-        sub_s_attr_values = []
+        s_attr_id_list: List[int] = list(range(s_attr.size))
+        sub_s_attr_values: List[str] = []
         for sub_s_attr in sub_s_attrs:
             tmp = []
-            for chunk in chunks:
-                tmp.extend(sub_s_attr.values_by_ids(chunk))
+            tmp.extend(sub_s_attr.values_by_ids(s_attr_id_list))
             sub_s_attr_values.append(tmp)
             del tmp
-        del chunks
-        # print(f's_attrs.{s_attr.name}.values')
-        static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
-        # print(f'values.s_attrs.{s_attr.name}')
-        static_corpus_data['values']['s_attrs'][s_attr.name] = {
-            s_attr_id: {
-                s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
+        del s_attr_id_list
+        print(f'values.s_attrs.{s_attr.name}')
+        static_data['values']['s_attrs'][s_attr.name] = [
+            {
+                s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id]
                 for s_attr_value_name_idx, s_attr_value_name in enumerate(
-                    static_corpus_data['s_attrs'][s_attr.name]['values']
+                    static_data['s_attrs'][s_attr.name]['values']
                 )
-            } for s_attr_id_idx, s_attr_id in enumerate(range(0, s_attr.size))
-        }
+            } for s_attr_id in range(0, s_attr.size)
+        ]
         del sub_s_attr_values
-    with gzip.open(cache_file_path, 'wt') as f:
-        json.dump(static_corpus_data, f)
-    del static_corpus_data
-    with open(cache_file_path, 'rb') as f:
+
+    print('Saving static data to file')
+    with gzip.open(static_data_file_path, 'wt') as f:
+        json.dump(static_data, f)
+    del static_data
+
+    print('Sending static data to client')
+    with open(static_data_file_path, 'rb') as f:
         return f.read()
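The rewrite also changes the shape of the cached static.json.gz: per-attribute data is now stored in id-indexed lists instead of dicts keyed by id, and the 'counts' block is gone. Roughly, and with illustrative values only (a sketch assembled from the assignments above, not a dump of a real corpus):

# Illustrative shape of static.json.gz after this commit; values are made
# up and lists are truncated. Id-indexed lists replace dicts keyed by id.
static_data = {
    'corpus': {
        'bounds': [0, 11211],              # inclusive cpos range
        'freqs': {'word': [273, 154]}      # freqs['word'][lexicon_id]
    },
    'p_attrs': {'word': [17, 4]},          # p_attrs['word'][cpos] -> lexicon id
    's_attrs': {
        's': {'lexicon': [{'bounds': [0, 25]}], 'values': None},
        'text': {
            'lexicon': [
                {'bounds': [0, 620], 'freqs': {'word': {17: 3, 4: 1}}}
            ],
            'values': ['title', 'publishing_year']
        }
    },
    'values': {
        'p_attrs': {'word': ['the', 'of']},    # values['word'][lexicon_id]
        's_attrs': {'text': [{'title': 'A', 'publishing_year': '1848'}]}
    }
}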

View File

@@ -93,8 +93,8 @@ class CorpusAnalysisStaticVisualization {
   renderGeneralCorpusInfo() {
     let corpusData = this.data.corpus.o.staticData;
-    document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token;
-    document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s;
+    document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.bounds[1] - corpusData.corpus.bounds[0];
+    document.querySelector('.corpus-num-s').innerHTML = corpusData.s_attrs.s.lexicon.length;
     document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length;
     document.querySelector('.corpus-num-unique-lemmas').innerHTML = Object.entries(corpusData.corpus.freqs.lemma).length;
     document.querySelector('.corpus-num-unique-pos').innerHTML = Object.entries(corpusData.corpus.freqs.pos).length;
@@ -111,8 +111,11 @@ class CorpusAnalysisStaticVisualization {
       let resource = {
         title: corpusData.values.s_attrs.text[i].title,
         publishing_year: corpusData.values.s_attrs.text[i].publishing_year,
-        num_tokens: corpusData.s_attrs.text.lexicon[i].counts.token,
-        num_sentences: corpusData.s_attrs.text.lexicon[i].counts.s,
+        // num_sentences: corpusData.s_attrs.text.lexicon[i].counts.s,
+        num_tokens: corpusData.s_attrs.text.lexicon[i].bounds[1] - corpusData.s_attrs.text.lexicon[i].bounds[0],
+        num_sentences: corpusData.s_attrs.s.lexicon.filter((s) => {
+          return s.bounds[0] >= corpusData.s_attrs.text.lexicon[i].bounds[0] && s.bounds[1] <= corpusData.s_attrs.text.lexicon[i].bounds[1];
+        }).length,
         num_unique_words: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.word).length,
         num_unique_lemmas: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.lemma).length,
         num_unique_pos: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.pos).length,
@@ -125,7 +128,7 @@ class CorpusAnalysisStaticVisualization {
     corpusTextInfoList.add(textData);

     let textCountChipElement = document.querySelector('.text-count-chip');
-    textCountChipElement.innerHTML = `Text count: ${corpusData.corpus.counts.text}`;
+    textCountChipElement.innerHTML = `Text count: ${corpusData.s_attrs.text.lexicon.length}`;
   }

   renderTextProportionsGraphic() {
@@ -198,7 +201,7 @@ class CorpusAnalysisStaticVisualization {
       default:
         graphData = [
           {
-            values: texts.map(text => text[1].counts.token),
+            values: texts.map(text => text[1].bounds[1] - text[1].bounds[0]),
             labels: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`),
             type: graphtype
           }
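With the precomputed 'counts' gone, the frontend derives these numbers on the fly, as the hunks above show. The same derivations as a Python sketch against the structure written by ext_corpus_static_data (note that bounds are inclusive cpos positions, so a token count derived from them is rbound - lbound + 1):

# Sketch; assumes the static_data structure produced by the backend above.
def text_token_count(static_data, i):
    # bounds are inclusive, hence the + 1
    lbound, rbound = static_data['s_attrs']['text']['lexicon'][i]['bounds']
    return rbound - lbound + 1

def text_sentence_count(static_data, i):
    # count sentence spans lying entirely within the text's bounds,
    # mirroring the filter() in the JavaScript above
    lbound, rbound = static_data['s_attrs']['text']['lexicon'][i]['bounds']
    return sum(
        1 for s in static_data['s_attrs']['s']['lexicon']
        if s['bounds'][0] >= lbound and s['bounds'][1] <= rbound
    )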

View File

@@ -1,5 +1,5 @@
 apifairy
-cqi>=0.1.6
+cqi>=0.1.7
 dnspython==2.2.1
 docker
 eventlet