diff --git a/app/corpora/cqi_over_sio/extensions.py b/app/corpora/cqi_over_sio/extensions.py index 9f457c9b..903e51c3 100644 --- a/app/corpora/cqi_over_sio/extensions.py +++ b/app/corpora/cqi_over_sio/extensions.py @@ -12,7 +12,6 @@ import gzip import json import math import os -import shutil from app import db from app.models import Corpus from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus @@ -67,12 +66,12 @@ def ext_corpus_static_data(corpus: str) -> Dict: } for p_attr in cqi_p_attrs.values(): static_corpus_data['corpus']['freqs'][p_attr.name] = {} - chunk_size = 5000 + chunk_size = 10000 p_attr_id_list = list(range(p_attr.lexicon_size)) chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)] del p_attr_id_list for chunk in chunks: - print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}') + # print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}') static_corpus_data['corpus']['freqs'][p_attr.name].update( dict(zip(chunk, p_attr.freqs_by_ids(chunk))) ) @@ -82,7 +81,7 @@ def ext_corpus_static_data(corpus: str) -> Dict: chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)] del cpos_list for chunk in chunks: - print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}') + # print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}') static_corpus_data['p_attrs'][p_attr.name].update( dict(zip(chunk, p_attr.ids_by_cpos(chunk))) ) @@ -92,7 +91,7 @@ def ext_corpus_static_data(corpus: str) -> Dict: chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)] del p_attr_id_list for chunk in chunks: - print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}') + # print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}') static_corpus_data['values']['p_attrs'][p_attr.name].update( dict(zip(chunk, p_attr.values_by_ids(chunk))) ) @@ -123,7 +122,7 @@ def ext_corpus_static_data(corpus: str) -> Dict: # static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1 # cqi_subcorpus.drop() for id in range(0, s_attr.size): - print(f's_attrs.{s_attr.name}.lexicon.{id}') + # print(f's_attrs.{s_attr.name}.lexicon.{id}') static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = { 'bounds': None, 'counts': None, @@ -132,9 +131,9 @@ def ext_corpus_static_data(corpus: str) -> Dict: if s_attr.name != 'text': continue lbound, rbound = s_attr.cpos_by_id(id) - print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds') + # print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds') static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound] - print(f's_attrs.{s_attr.name}.lexicon.{id}.counts') + # print(f's_attrs.{s_attr.name}.lexicon.{id}.counts') static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {} static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1 cpos_list = list(range(lbound, rbound + 1)) @@ -142,22 +141,22 @@ def ext_corpus_static_data(corpus: str) -> Dict: del cpos_list ent_ids = set() for chunk in chunks: - print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}') + # print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}') ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1}) static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids) del ent_ids s_ids = set() for chunk in chunks: - print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}') + # print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}') s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1}) static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids) del s_ids - print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs') + # print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs') static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {} for p_attr in cqi_p_attrs.values(): p_attr_ids = [] for chunk in chunks: - print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}') + # print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}') p_attr_ids.extend(p_attr.ids_by_cpos(chunk)) static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids)) del p_attr_ids @@ -178,9 +177,9 @@ def ext_corpus_static_data(corpus: str) -> Dict: sub_s_attr_values.append(tmp) del tmp del chunks - print(f's_attrs.{s_attr.name}.values') + # print(f's_attrs.{s_attr.name}.values') static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names - print(f'values.s_attrs.{s_attr.name}') + # print(f'values.s_attrs.{s_attr.name}') static_corpus_data['values']['s_attrs'][s_attr.name] = { s_attr_id: { s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]