From 688b96ffeebca1c9d5043a06396e094cb198ac19 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Fri, 7 Jul 2023 11:47:34 +0200
Subject: [PATCH] comment out debug prints and increase chunk size in cqi_over_sio
---
app/corpora/cqi_over_sio/extensions.py | 27 +++++++++++++-------------
1 file changed, 13 insertions(+), 14 deletions(-)
diff --git a/app/corpora/cqi_over_sio/extensions.py b/app/corpora/cqi_over_sio/extensions.py
index 9f457c9b..903e51c3 100644
--- a/app/corpora/cqi_over_sio/extensions.py
+++ b/app/corpora/cqi_over_sio/extensions.py
@@ -12,7 +12,6 @@ import gzip
import json
import math
import os
-import shutil
from app import db
from app.models import Corpus
from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus
@@ -67,12 +66,12 @@ def ext_corpus_static_data(corpus: str) -> Dict:
}
for p_attr in cqi_p_attrs.values():
static_corpus_data['corpus']['freqs'][p_attr.name] = {}
- chunk_size = 5000
+ chunk_size = 10000
p_attr_id_list = list(range(p_attr.lexicon_size))
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
del p_attr_id_list
for chunk in chunks:
- print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
+ # print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['corpus']['freqs'][p_attr.name].update(
dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
)
@@ -82,7 +81,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
del cpos_list
for chunk in chunks:
- print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
+ # print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['p_attrs'][p_attr.name].update(
dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
)
@@ -92,7 +91,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
del p_attr_id_list
for chunk in chunks:
- print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
+ # print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['values']['p_attrs'][p_attr.name].update(
dict(zip(chunk, p_attr.values_by_ids(chunk)))
)
@@ -123,7 +122,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
# static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
# cqi_subcorpus.drop()
for id in range(0, s_attr.size):
- print(f's_attrs.{s_attr.name}.lexicon.{id}')
+ # print(f's_attrs.{s_attr.name}.lexicon.{id}')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
'bounds': None,
'counts': None,
@@ -132,9 +131,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
if s_attr.name != 'text':
continue
lbound, rbound = s_attr.cpos_by_id(id)
- print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
+ # print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
- print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
+ # print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
cpos_list = list(range(lbound, rbound + 1))
@@ -142,22 +141,22 @@ def ext_corpus_static_data(corpus: str) -> Dict:
del cpos_list
ent_ids = set()
for chunk in chunks:
- print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
+ # print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
del ent_ids
s_ids = set()
for chunk in chunks:
- print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
+ # print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
del s_ids
- print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
+ # print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
for p_attr in cqi_p_attrs.values():
p_attr_ids = []
for chunk in chunks:
- print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
+ # print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
del p_attr_ids
@@ -178,9 +177,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
sub_s_attr_values.append(tmp)
del tmp
del chunks
- print(f's_attrs.{s_attr.name}.values')
+ # print(f's_attrs.{s_attr.name}.values')
static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
- print(f'values.s_attrs.{s_attr.name}')
+ # print(f'values.s_attrs.{s_attr.name}')
static_corpus_data['values']['s_attrs'][s_attr.name] = {
s_attr_id: {
s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]