diff --git a/app/corpora/cqi_over_sio/__init__.py b/app/corpora/cqi_over_sio/__init__.py
index 9a0f478d..888bfaae 100644
--- a/app/corpora/cqi_over_sio/__init__.py
+++ b/app/corpora/cqi_over_sio/__init__.py
@@ -121,10 +121,7 @@ class CQiNamespace(Namespace):
             socketio.sleep(3)
             retry_counter -= 1
             db.session.refresh(db_corpus)
-        cqi_client: CQiClient = CQiClient(
-            f'cqpserver_{db_corpus_id}',
-            timeout=float('inf')
-        )
+        cqi_client: CQiClient = CQiClient(f'cqpserver_{db_corpus_id}')
         session['cqi_over_sio'] = {
             'cqi_client': cqi_client,
             'cqi_client_lock': Lock(),
diff --git a/app/corpora/cqi_over_sio/extensions.py b/app/corpora/cqi_over_sio/extensions.py
index 70ee5d97..6748b963 100644
--- a/app/corpora/cqi_over_sio/extensions.py
+++ b/app/corpora/cqi_over_sio/extensions.py
@@ -1,6 +1,7 @@
 from collections import Counter
 from cqi import CQiClient
 from cqi.models.corpora import Corpus as CQiCorpus
+from cqi.models.subcorpora import Subcorpus as CQiSubcorpus
 from cqi.models.attributes import (
     PositionalAttribute as CQiPositionalAttribute,
     StructuralAttribute as CQiStructuralAttribute
@@ -40,161 +41,132 @@ def ext_corpus_update_db(corpus: str) -> CQiStatusOk:
 def ext_corpus_static_data(corpus: str) -> Dict:
     db_corpus_id: int = session['cqi_over_sio']['db_corpus_id']
     db_corpus: Corpus = Corpus.query.get(db_corpus_id)
-    cache_file_path: str = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
-    if os.path.exists(cache_file_path):
-        with open(cache_file_path, 'rb') as f:
+
+    static_data_file_path: str = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
+    if os.path.exists(static_data_file_path):
+        with open(static_data_file_path, 'rb') as f:
             return f.read()
+
     cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
     cqi_corpus: CQiCorpus = cqi_client.corpora.get(corpus)
-    cqi_p_attrs: Dict[str, CQiPositionalAttribute] = {
-        p_attr.name: p_attr
-        for p_attr in cqi_corpus.positional_attributes.list()
-    }
-    cqi_s_attrs: Dict[str, CQiStructuralAttribute] = {
-        s_attr.name: s_attr
-        for s_attr in cqi_corpus.structural_attributes.list()
-    }
-    static_corpus_data = {
+    cqi_p_attrs: List[CQiPositionalAttribute] = cqi_corpus.positional_attributes.list()
+    cqi_s_attrs: List[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list()
+
+    static_data = {
         'corpus': {
             'bounds': [0, cqi_corpus.size - 1],
-            'counts': {
-                'token': cqi_corpus.size
-            },
             'freqs': {}
         },
         'p_attrs': {},
         's_attrs': {},
         'values': {'p_attrs': {}, 's_attrs': {}}
     }
-    for p_attr in cqi_p_attrs.values():
-        static_corpus_data['corpus']['freqs'][p_attr.name] = {}
-        chunk_size = 10000
-        p_attr_id_list = list(range(p_attr.lexicon_size))
-        chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
+
+    for p_attr in cqi_p_attrs:
+        print(f'corpus.freqs.{p_attr.name}')
+        static_data['corpus']['freqs'][p_attr.name] = []
+        p_attr_id_list: List[int] = list(range(p_attr.lexicon_size))
+        static_data['corpus']['freqs'][p_attr.name].extend(p_attr.freqs_by_ids(p_attr_id_list))
         del p_attr_id_list
-        for chunk in chunks:
-            # print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
-            static_corpus_data['corpus']['freqs'][p_attr.name].update(
-                dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
-            )
-        del chunks
-        static_corpus_data['p_attrs'][p_attr.name] = {}
-        cpos_list = list(range(cqi_corpus.size))
-        chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
+
+        print(f'p_attrs.{p_attr.name}')
+        static_data['p_attrs'][p_attr.name] = []
+        cpos_list: List[int] = list(range(cqi_corpus.size))
+        static_data['p_attrs'][p_attr.name].extend(p_attr.ids_by_cpos(cpos_list))
         del cpos_list
-        for chunk in chunks:
-            # print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
-            static_corpus_data['p_attrs'][p_attr.name].update(
-                dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
-            )
-        del chunks
-        static_corpus_data['values']['p_attrs'][p_attr.name] = {}
-        p_attr_id_list = list(range(p_attr.lexicon_size))
-        chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
+
+        print(f'values.p_attrs.{p_attr.name}')
+        static_data['values']['p_attrs'][p_attr.name] = []
+        p_attr_id_list: List[int] = list(range(p_attr.lexicon_size))
+        static_data['values']['p_attrs'][p_attr.name].extend(p_attr.values_by_ids(p_attr_id_list))
         del p_attr_id_list
-        for chunk in chunks:
-            # print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
-            static_corpus_data['values']['p_attrs'][p_attr.name].update(
-                dict(zip(chunk, p_attr.values_by_ids(chunk)))
-            )
-        del chunks
-    for s_attr in cqi_s_attrs.values():
+
+    for s_attr in cqi_s_attrs:
         if s_attr.has_values:
             continue
-        static_corpus_data['corpus']['counts'][s_attr.name] = s_attr.size
-        static_corpus_data['s_attrs'][s_attr.name] = {'lexicon': {}, 'values': None}
-        static_corpus_data['values']['s_attrs'][s_attr.name] = {}
-        ##########################################################################
-        # A faster way to get cpos boundaries for smaller s_attrs                #
-        ##########################################################################
-        # if s_attr.name in ['s', 'ent']:
-        #     cqi_corpus.query('Last', f'<{s_attr.name}> []* ;')
-        #     cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
-        #     first_match = 0
-        #     last_match = cqi_subcorpus.size - 1
-        #     match_boundaries = zip(
-        #         range(first_match, last_match + 1),
-        #         cqi_subcorpus.dump(cqi_subcorpus.fields['match'], first_match, last_match),
-        #         cqi_subcorpus.dump(cqi_subcorpus.fields['matchend'], first_match, last_match)
-        #     )
-        #     for id, lbound, rbound in match_boundaries:
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
-        #         static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
-        #     cqi_subcorpus.drop()
+
+        static_data['s_attrs'][s_attr.name] = {'lexicon': [], 'values': None}
+
+        if s_attr.name in ['s', 'ent']:
+            ##############################################################
+            # A faster way to get cpos boundaries for smaller s_attrs    #
+            # Note: Needs more testing, don't use it in production       #
+            ##############################################################
+            cqi_corpus.query('Last', f'<{s_attr.name}> []* ;')
+            cqi_subcorpus: CQiSubcorpus = cqi_corpus.subcorpora.get('Last')
+            first_match: int = 0
+            last_match: int = cqi_subcorpus.size - 1
+            match_boundaries = zip(
                range(first_match, last_match + 1),
                cqi_subcorpus.dump(
                    cqi_subcorpus.fields['match'],
                    first_match,
                    last_match
                ),
                cqi_subcorpus.dump(
                    cqi_subcorpus.fields['matchend'],
                    first_match,
                    last_match
                )
            )
+            cqi_subcorpus.drop()
+            del cqi_subcorpus, first_match, last_match
+            for id, lbound, rbound in match_boundaries:
+                static_data['s_attrs'][s_attr.name]['lexicon'].append({})
+                print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
+                static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+            del match_boundaries
+
+        if s_attr.name != 'text':
+            continue
+
         for id in range(0, s_attr.size):
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
-                'bounds': None,
-                'counts': None,
-                'freqs': None
-            }
-            if s_attr.name != 'text':
-                continue
+            static_data['s_attrs'][s_attr.name]['lexicon'].append({})
+            # This is a very slow operation, that's why we only use it for
+            # the text attribute
             lbound, rbound = s_attr.cpos_by_id(id)
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
-            cpos_list = list(range(lbound, rbound + 1))
-            chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
-            del cpos_list
-            ent_ids = set()
-            for chunk in chunks:
-                # print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
-                ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
-            del ent_ids
-            s_ids = set()
-            for chunk in chunks:
-                # print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
-                s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
-            del s_ids
-            # print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
-            for p_attr in cqi_p_attrs.values():
-                p_attr_ids = []
-                for chunk in chunks:
-                    # print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
-                    p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
-                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
+            print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
+            static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+            static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
+            cpos_list: List[int] = list(range(lbound, rbound + 1))
+            for p_attr in cqi_p_attrs:
+                p_attr_ids: List[int] = []
+                p_attr_ids.extend(p_attr.ids_by_cpos(cpos_list))
+                print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs.{p_attr.name}')
+                static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
                 del p_attr_ids
-            del chunks
-        sub_s_attrs = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
-        s_attr_value_names: List[str] = [
+            del cpos_list
+
+        sub_s_attrs: List[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
+        print(f's_attrs.{s_attr.name}.values')
+        static_data['s_attrs'][s_attr.name]['values'] = [
             sub_s_attr.name[(len(s_attr.name) + 1):]
             for sub_s_attr in sub_s_attrs
         ]
-        s_attr_id_list = list(range(s_attr.size))
-        chunks = [s_attr_id_list[i:i+chunk_size] for i in range(0, len(s_attr_id_list), chunk_size)]
-        del s_attr_id_list
-        sub_s_attr_values = []
+        s_attr_id_list: List[int] = list(range(s_attr.size))
+        sub_s_attr_values: List[List[str]] = []
         for sub_s_attr in sub_s_attrs:
             tmp = []
-            for chunk in chunks:
-                tmp.extend(sub_s_attr.values_by_ids(chunk))
+            tmp.extend(sub_s_attr.values_by_ids(s_attr_id_list))
             sub_s_attr_values.append(tmp)
             del tmp
-        del chunks
-        # print(f's_attrs.{s_attr.name}.values')
-        static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
-        # print(f'values.s_attrs.{s_attr.name}')
-        static_corpus_data['values']['s_attrs'][s_attr.name] = {
-            s_attr_id: {
-                s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
+        del s_attr_id_list
+        print(f'values.s_attrs.{s_attr.name}')
+        static_data['values']['s_attrs'][s_attr.name] = [
+            {
+                s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id]
                 for s_attr_value_name_idx, s_attr_value_name in enumerate(
-                    static_corpus_data['s_attrs'][s_attr.name]['values']
+                    static_data['s_attrs'][s_attr.name]['values']
                 )
-            } for s_attr_id_idx, s_attr_id in enumerate(range(0, s_attr.size))
-        }
+            } for s_attr_id in range(0, s_attr.size)
+        ]
         del sub_s_attr_values
-    with gzip.open(cache_file_path, 'wt') as f:
-        json.dump(static_corpus_data, f)
-    del static_corpus_data
-    with open(cache_file_path, 'rb') as f:
+
+    print('Saving static data to file')
+    with gzip.open(static_data_file_path, 'wt') as f:
+        json.dump(static_data, f)
+    del static_data
+    print('Sending static data to client')
+    with open(static_data_file_path, 'rb') as f:
         return f.read()
 
 
diff --git a/app/static/js/CorpusAnalysis/CorpusAnalysisStaticVisualization.js b/app/static/js/CorpusAnalysis/CorpusAnalysisStaticVisualization.js
index e51a3534..38c48c3c 100644
--- a/app/static/js/CorpusAnalysis/CorpusAnalysisStaticVisualization.js
+++ b/app/static/js/CorpusAnalysis/CorpusAnalysisStaticVisualization.js
@@ -93,8 +93,8 @@ class CorpusAnalysisStaticVisualization {
 
   renderGeneralCorpusInfo() {
     let corpusData = this.data.corpus.o.staticData;
-    document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token;
-    document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s;
+    document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.bounds[1] - corpusData.corpus.bounds[0] + 1;
+    document.querySelector('.corpus-num-s').innerHTML = corpusData.s_attrs.s.lexicon.length;
     document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length;
     document.querySelector('.corpus-num-unique-lemmas').innerHTML = Object.entries(corpusData.corpus.freqs.lemma).length;
     document.querySelector('.corpus-num-unique-pos').innerHTML = Object.entries(corpusData.corpus.freqs.pos).length;
@@ -111,8 +111,11 @@ class CorpusAnalysisStaticVisualization {
       let resource = {
         title: corpusData.values.s_attrs.text[i].title,
         publishing_year: corpusData.values.s_attrs.text[i].publishing_year,
-        num_tokens: corpusData.s_attrs.text.lexicon[i].counts.token,
-        num_sentences: corpusData.s_attrs.text.lexicon[i].counts.s,
+        // num_sentences: corpusData.s_attrs.text.lexicon[i].counts.s,
+        num_tokens: corpusData.s_attrs.text.lexicon[i].bounds[1] - corpusData.s_attrs.text.lexicon[i].bounds[0] + 1,
+        num_sentences: corpusData.s_attrs.s.lexicon.filter((s) => {
+          return s.bounds[0] >= corpusData.s_attrs.text.lexicon[i].bounds[0] && s.bounds[1] <= corpusData.s_attrs.text.lexicon[i].bounds[1];
+        }).length,
         num_unique_words: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.word).length,
         num_unique_lemmas: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.lemma).length,
         num_unique_pos: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.pos).length,
@@ -125,7 +128,7 @@ class CorpusAnalysisStaticVisualization {
     corpusTextInfoList.add(textData);
 
     let textCountChipElement = document.querySelector('.text-count-chip');
-    textCountChipElement.innerHTML = `Text count: ${corpusData.corpus.counts.text}`;
+    textCountChipElement.innerHTML = `Text count: ${corpusData.s_attrs.text.lexicon.length}`;
   }
 
   renderTextProportionsGraphic() {
@@ -198,7 +201,7 @@ class CorpusAnalysisStaticVisualization {
       default:
         graphData = [
           {
-            values: texts.map(text => text[1].counts.token),
+            values: texts.map(text => text[1].bounds[1] - text[1].bounds[0] + 1),
             labels: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`),
             type: graphtype
           }
diff --git a/requirements.txt b/requirements.txt
index 30206e28..360325e0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 apifairy
-cqi>=0.1.6
+cqi>=0.1.7
 dnspython==2.2.1
 docker
 eventlet
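
Note for reviewers: the cache written by ext_corpus_static_data above is plain gzip-compressed JSON (written via gzip.open(..., 'wt') plus json.dump and served back to the client as raw bytes), so it can be sanity-checked outside the app. A minimal inspection sketch, assuming Python 3; the path below is a placeholder, not a real fixture:

    import gzip
    import json

    # Placeholder path; substitute <corpus.path> with an actual corpus directory.
    static_data_file_path = '<corpus.path>/cwb/static.json.gz'

    # Mirror image of the write above: gzip.open(..., 'wt') + json.dump on the
    # way out, so 'rt' + json.load on the way back in.
    with gzip.open(static_data_file_path, 'rt') as f:
        static_data = json.load(f)

    # 'bounds' is an inclusive [first_cpos, last_cpos] pair, hence the + 1
    # when deriving a token count from it.
    lbound, rbound = static_data['corpus']['bounds']
    print('tokens:', rbound - lbound + 1)
    print('s_attrs:', list(static_data['s_attrs'].keys()))  # e.g. ['ent', 's', 'text']
    print('texts:', len(static_data['s_attrs']['text']['lexicon']))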