remove debug messages and increase chunk size in cqi

This commit is contained in:
Patrick Jentsch 2023-07-07 11:47:34 +02:00
parent a9973e9c8e
commit 688b96ffee

View File

@@ -12,7 +12,6 @@ import gzip
import json
import math
import os
import shutil
from app import db
from app.models import Corpus
from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus
@@ -67,12 +66,12 @@ def ext_corpus_static_data(corpus: str) -> Dict:
}
for p_attr in cqi_p_attrs.values():
static_corpus_data['corpus']['freqs'][p_attr.name] = {}
chunk_size = 5000
chunk_size = 10000
p_attr_id_list = list(range(p_attr.lexicon_size))
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
del p_attr_id_list
for chunk in chunks:
print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
# print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['corpus']['freqs'][p_attr.name].update(
dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
)
@@ -82,7 +81,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
del cpos_list
for chunk in chunks:
print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
# print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['p_attrs'][p_attr.name].update(
dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
)
@@ -92,7 +91,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
del p_attr_id_list
for chunk in chunks:
print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
# print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['values']['p_attrs'][p_attr.name].update(
dict(zip(chunk, p_attr.values_by_ids(chunk)))
)
@@ -123,7 +122,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
# static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
# cqi_subcorpus.drop()
for id in range(0, s_attr.size):
print(f's_attrs.{s_attr.name}.lexicon.{id}')
# print(f's_attrs.{s_attr.name}.lexicon.{id}')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
'bounds': None,
'counts': None,
@@ -132,9 +131,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
if s_attr.name != 'text':
continue
lbound, rbound = s_attr.cpos_by_id(id)
print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
# print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
# print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
cpos_list = list(range(lbound, rbound + 1))
@@ -142,22 +141,22 @@ def ext_corpus_static_data(corpus: str) -> Dict:
del cpos_list
ent_ids = set()
for chunk in chunks:
print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
# print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
del ent_ids
s_ids = set()
for chunk in chunks:
print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
# print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
del s_ids
print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
# print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
for p_attr in cqi_p_attrs.values():
p_attr_ids = []
for chunk in chunks:
print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
# print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
del p_attr_ids
@@ -178,9 +177,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
sub_s_attr_values.append(tmp)
del tmp
del chunks
print(f's_attrs.{s_attr.name}.values')
# print(f's_attrs.{s_attr.name}.values')
static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
print(f'values.s_attrs.{s_attr.name}')
# print(f'values.s_attrs.{s_attr.name}')
static_corpus_data['values']['s_attrs'][s_attr.name] = {
s_attr_id: {
s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]