mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-01-24 16:40:35 +00:00
remove debug messages and increase chunk size in cqi
This commit is contained in:
parent
a9973e9c8e
commit
688b96ffee
@ -12,7 +12,6 @@ import gzip
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
from app import db
|
||||
from app.models import Corpus
|
||||
from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus
|
||||
@ -67,12 +66,12 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
||||
}
|
||||
for p_attr in cqi_p_attrs.values():
|
||||
static_corpus_data['corpus']['freqs'][p_attr.name] = {}
|
||||
chunk_size = 5000
|
||||
chunk_size = 10000
|
||||
p_attr_id_list = list(range(p_attr.lexicon_size))
|
||||
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
|
||||
del p_attr_id_list
|
||||
for chunk in chunks:
|
||||
print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||
# print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||
static_corpus_data['corpus']['freqs'][p_attr.name].update(
|
||||
dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
|
||||
)
|
||||
@ -82,7 +81,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
||||
chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
|
||||
del cpos_list
|
||||
for chunk in chunks:
|
||||
print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||
# print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||
static_corpus_data['p_attrs'][p_attr.name].update(
|
||||
dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
|
||||
)
|
||||
@ -92,7 +91,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
||||
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
|
||||
del p_attr_id_list
|
||||
for chunk in chunks:
|
||||
print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||
# print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||
static_corpus_data['values']['p_attrs'][p_attr.name].update(
|
||||
dict(zip(chunk, p_attr.values_by_ids(chunk)))
|
||||
)
|
||||
@ -123,7 +122,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
||||
# static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
|
||||
# cqi_subcorpus.drop()
|
||||
for id in range(0, s_attr.size):
|
||||
print(f's_attrs.{s_attr.name}.lexicon.{id}')
|
||||
# print(f's_attrs.{s_attr.name}.lexicon.{id}')
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
|
||||
'bounds': None,
|
||||
'counts': None,
|
||||
@ -132,9 +131,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
||||
if s_attr.name != 'text':
|
||||
continue
|
||||
lbound, rbound = s_attr.cpos_by_id(id)
|
||||
print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
|
||||
# print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
|
||||
print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
|
||||
# print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
|
||||
cpos_list = list(range(lbound, rbound + 1))
|
||||
@ -142,22 +141,22 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
||||
del cpos_list
|
||||
ent_ids = set()
|
||||
for chunk in chunks:
|
||||
print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||
# print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||
ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
|
||||
del ent_ids
|
||||
s_ids = set()
|
||||
for chunk in chunks:
|
||||
print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||
# print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||
s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
|
||||
del s_ids
|
||||
print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
|
||||
# print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
|
||||
for p_attr in cqi_p_attrs.values():
|
||||
p_attr_ids = []
|
||||
for chunk in chunks:
|
||||
print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||
# print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||
p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
|
||||
del p_attr_ids
|
||||
@ -178,9 +177,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
||||
sub_s_attr_values.append(tmp)
|
||||
del tmp
|
||||
del chunks
|
||||
print(f's_attrs.{s_attr.name}.values')
|
||||
# print(f's_attrs.{s_attr.name}.values')
|
||||
static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
|
||||
print(f'values.s_attrs.{s_attr.name}')
|
||||
# print(f'values.s_attrs.{s_attr.name}')
|
||||
static_corpus_data['values']['s_attrs'][s_attr.name] = {
|
||||
s_attr_id: {
|
||||
s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
|
||||
|
Loading…
x
Reference in New Issue
Block a user