mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-12-25 10:54:18 +00:00
remove debug messages and increase chunk size in cqi
This commit is contained in:
parent
a9973e9c8e
commit
688b96ffee
@ -12,7 +12,6 @@ import gzip
|
|||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
import shutil
|
|
||||||
from app import db
|
from app import db
|
||||||
from app.models import Corpus
|
from app.models import Corpus
|
||||||
from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus
|
from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus
|
||||||
@ -67,12 +66,12 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
|||||||
}
|
}
|
||||||
for p_attr in cqi_p_attrs.values():
|
for p_attr in cqi_p_attrs.values():
|
||||||
static_corpus_data['corpus']['freqs'][p_attr.name] = {}
|
static_corpus_data['corpus']['freqs'][p_attr.name] = {}
|
||||||
chunk_size = 5000
|
chunk_size = 10000
|
||||||
p_attr_id_list = list(range(p_attr.lexicon_size))
|
p_attr_id_list = list(range(p_attr.lexicon_size))
|
||||||
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
|
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
|
||||||
del p_attr_id_list
|
del p_attr_id_list
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
# print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||||
static_corpus_data['corpus']['freqs'][p_attr.name].update(
|
static_corpus_data['corpus']['freqs'][p_attr.name].update(
|
||||||
dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
|
dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
|
||||||
)
|
)
|
||||||
@ -82,7 +81,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
|||||||
chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
|
chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
|
||||||
del cpos_list
|
del cpos_list
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
# print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||||
static_corpus_data['p_attrs'][p_attr.name].update(
|
static_corpus_data['p_attrs'][p_attr.name].update(
|
||||||
dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
|
dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
|
||||||
)
|
)
|
||||||
@ -92,7 +91,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
|||||||
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
|
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
|
||||||
del p_attr_id_list
|
del p_attr_id_list
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
# print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||||
static_corpus_data['values']['p_attrs'][p_attr.name].update(
|
static_corpus_data['values']['p_attrs'][p_attr.name].update(
|
||||||
dict(zip(chunk, p_attr.values_by_ids(chunk)))
|
dict(zip(chunk, p_attr.values_by_ids(chunk)))
|
||||||
)
|
)
|
||||||
@ -123,7 +122,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
|||||||
# static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
|
# static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
|
||||||
# cqi_subcorpus.drop()
|
# cqi_subcorpus.drop()
|
||||||
for id in range(0, s_attr.size):
|
for id in range(0, s_attr.size):
|
||||||
print(f's_attrs.{s_attr.name}.lexicon.{id}')
|
# print(f's_attrs.{s_attr.name}.lexicon.{id}')
|
||||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
|
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
|
||||||
'bounds': None,
|
'bounds': None,
|
||||||
'counts': None,
|
'counts': None,
|
||||||
@ -132,9 +131,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
|||||||
if s_attr.name != 'text':
|
if s_attr.name != 'text':
|
||||||
continue
|
continue
|
||||||
lbound, rbound = s_attr.cpos_by_id(id)
|
lbound, rbound = s_attr.cpos_by_id(id)
|
||||||
print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
|
# print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
|
||||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
|
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
|
||||||
print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
|
# print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
|
||||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
|
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
|
||||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
|
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
|
||||||
cpos_list = list(range(lbound, rbound + 1))
|
cpos_list = list(range(lbound, rbound + 1))
|
||||||
@ -142,22 +141,22 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
|||||||
del cpos_list
|
del cpos_list
|
||||||
ent_ids = set()
|
ent_ids = set()
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
# print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||||
ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
|
ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
|
||||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
|
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
|
||||||
del ent_ids
|
del ent_ids
|
||||||
s_ids = set()
|
s_ids = set()
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
# print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||||
s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
|
s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
|
||||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
|
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
|
||||||
del s_ids
|
del s_ids
|
||||||
print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
|
# print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
|
||||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
|
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
|
||||||
for p_attr in cqi_p_attrs.values():
|
for p_attr in cqi_p_attrs.values():
|
||||||
p_attr_ids = []
|
p_attr_ids = []
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
# print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||||
p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
|
p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
|
||||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
|
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
|
||||||
del p_attr_ids
|
del p_attr_ids
|
||||||
@ -178,9 +177,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
|
|||||||
sub_s_attr_values.append(tmp)
|
sub_s_attr_values.append(tmp)
|
||||||
del tmp
|
del tmp
|
||||||
del chunks
|
del chunks
|
||||||
print(f's_attrs.{s_attr.name}.values')
|
# print(f's_attrs.{s_attr.name}.values')
|
||||||
static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
|
static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
|
||||||
print(f'values.s_attrs.{s_attr.name}')
|
# print(f'values.s_attrs.{s_attr.name}')
|
||||||
static_corpus_data['values']['s_attrs'][s_attr.name] = {
|
static_corpus_data['values']['s_attrs'][s_attr.name] = {
|
||||||
s_attr_id: {
|
s_attr_id: {
|
||||||
s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
|
s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
|
||||||
|
Loading…
Reference in New Issue
Block a user