remove debug messages and increase chunk size in cqi

This commit is contained in:
Patrick Jentsch 2023-07-07 11:47:34 +02:00
parent a9973e9c8e
commit 688b96ffee

View File

@@ -12,7 +12,6 @@ import gzip
import json import json
import math import math
import os import os
import shutil
from app import db from app import db
from app.models import Corpus from app.models import Corpus
from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus
@@ -67,12 +66,12 @@ def ext_corpus_static_data(corpus: str) -> Dict:
} }
for p_attr in cqi_p_attrs.values(): for p_attr in cqi_p_attrs.values():
static_corpus_data['corpus']['freqs'][p_attr.name] = {} static_corpus_data['corpus']['freqs'][p_attr.name] = {}
chunk_size = 5000 chunk_size = 10000
p_attr_id_list = list(range(p_attr.lexicon_size)) p_attr_id_list = list(range(p_attr.lexicon_size))
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)] chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
del p_attr_id_list del p_attr_id_list
for chunk in chunks: for chunk in chunks:
print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}') # print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['corpus']['freqs'][p_attr.name].update( static_corpus_data['corpus']['freqs'][p_attr.name].update(
dict(zip(chunk, p_attr.freqs_by_ids(chunk))) dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
) )
@@ -82,7 +81,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)] chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
del cpos_list del cpos_list
for chunk in chunks: for chunk in chunks:
print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}') # print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['p_attrs'][p_attr.name].update( static_corpus_data['p_attrs'][p_attr.name].update(
dict(zip(chunk, p_attr.ids_by_cpos(chunk))) dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
) )
@@ -92,7 +91,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)] chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
del p_attr_id_list del p_attr_id_list
for chunk in chunks: for chunk in chunks:
print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}') # print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['values']['p_attrs'][p_attr.name].update( static_corpus_data['values']['p_attrs'][p_attr.name].update(
dict(zip(chunk, p_attr.values_by_ids(chunk))) dict(zip(chunk, p_attr.values_by_ids(chunk)))
) )
@@ -123,7 +122,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
# static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1 # static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
# cqi_subcorpus.drop() # cqi_subcorpus.drop()
for id in range(0, s_attr.size): for id in range(0, s_attr.size):
print(f's_attrs.{s_attr.name}.lexicon.{id}') # print(f's_attrs.{s_attr.name}.lexicon.{id}')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = { static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
'bounds': None, 'bounds': None,
'counts': None, 'counts': None,
@@ -132,9 +131,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
if s_attr.name != 'text': if s_attr.name != 'text':
continue continue
lbound, rbound = s_attr.cpos_by_id(id) lbound, rbound = s_attr.cpos_by_id(id)
print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds') # print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound] static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
print(f's_attrs.{s_attr.name}.lexicon.{id}.counts') # print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {} static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1 static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
cpos_list = list(range(lbound, rbound + 1)) cpos_list = list(range(lbound, rbound + 1))
@@ -142,22 +141,22 @@ def ext_corpus_static_data(corpus: str) -> Dict:
del cpos_list del cpos_list
ent_ids = set() ent_ids = set()
for chunk in chunks: for chunk in chunks:
print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}') # print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1}) ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids) static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
del ent_ids del ent_ids
s_ids = set() s_ids = set()
for chunk in chunks: for chunk in chunks:
print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}') # print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1}) s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids) static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
del s_ids del s_ids
print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs') # print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {} static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
for p_attr in cqi_p_attrs.values(): for p_attr in cqi_p_attrs.values():
p_attr_ids = [] p_attr_ids = []
for chunk in chunks: for chunk in chunks:
print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}') # print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
p_attr_ids.extend(p_attr.ids_by_cpos(chunk)) p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids)) static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
del p_attr_ids del p_attr_ids
@@ -178,9 +177,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
sub_s_attr_values.append(tmp) sub_s_attr_values.append(tmp)
del tmp del tmp
del chunks del chunks
print(f's_attrs.{s_attr.name}.values') # print(f's_attrs.{s_attr.name}.values')
static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
print(f'values.s_attrs.{s_attr.name}') # print(f'values.s_attrs.{s_attr.name}')
static_corpus_data['values']['s_attrs'][s_attr.name] = { static_corpus_data['values']['s_attrs'][s_attr.name] = {
s_attr_id: { s_attr_id: {
s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx] s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]