mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-01-24 16:40:35 +00:00
Use new cqi version. No chunking needed anymore
This commit is contained in:
parent
aad347caa0
commit
9200837e63
@ -121,10 +121,7 @@ class CQiNamespace(Namespace):
|
||||
socketio.sleep(3)
|
||||
retry_counter -= 1
|
||||
db.session.refresh(db_corpus)
|
||||
cqi_client: CQiClient = CQiClient(
|
||||
f'cqpserver_{db_corpus_id}',
|
||||
timeout=float('inf')
|
||||
)
|
||||
cqi_client: CQiClient = CQiClient(f'cqpserver_{db_corpus_id}')
|
||||
session['cqi_over_sio'] = {
|
||||
'cqi_client': cqi_client,
|
||||
'cqi_client_lock': Lock(),
|
||||
|
@ -1,6 +1,7 @@
|
||||
from collections import Counter
|
||||
from cqi import CQiClient
|
||||
from cqi.models.corpora import Corpus as CQiCorpus
|
||||
from cqi.models.subcorpora import Subcorpus as CQiSubcorpus
|
||||
from cqi.models.attributes import (
|
||||
PositionalAttribute as CQiPositionalAttribute,
|
||||
StructuralAttribute as CQiStructuralAttribute
|
||||
@ -40,161 +41,132 @@ def ext_corpus_update_db(corpus: str) -> CQiStatusOk:
|
||||
def ext_corpus_static_data(corpus: str) -> Dict:
|
||||
db_corpus_id: int = session['cqi_over_sio']['db_corpus_id']
|
||||
db_corpus: Corpus = Corpus.query.get(db_corpus_id)
|
||||
cache_file_path: str = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
|
||||
if os.path.exists(cache_file_path):
|
||||
with open(cache_file_path, 'rb') as f:
|
||||
|
||||
static_data_file_path: str = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
|
||||
if os.path.exists(static_data_file_path):
|
||||
with open(static_data_file_path, 'rb') as f:
|
||||
return f.read()
|
||||
|
||||
cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
|
||||
cqi_corpus: CQiCorpus = cqi_client.corpora.get(corpus)
|
||||
cqi_p_attrs: Dict[str, CQiPositionalAttribute] = {
|
||||
p_attr.name: p_attr
|
||||
for p_attr in cqi_corpus.positional_attributes.list()
|
||||
}
|
||||
cqi_s_attrs: Dict[str, CQiStructuralAttribute] = {
|
||||
s_attr.name: s_attr
|
||||
for s_attr in cqi_corpus.structural_attributes.list()
|
||||
}
|
||||
static_corpus_data = {
|
||||
cqi_p_attrs: List[CQiPositionalAttribute] = cqi_corpus.positional_attributes.list()
|
||||
cqi_s_attrs: List[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list()
|
||||
|
||||
static_data = {
|
||||
'corpus': {
|
||||
'bounds': [0, cqi_corpus.size - 1],
|
||||
'counts': {
|
||||
'token': cqi_corpus.size
|
||||
},
|
||||
'freqs': {}
|
||||
},
|
||||
'p_attrs': {},
|
||||
's_attrs': {},
|
||||
'values': {'p_attrs': {}, 's_attrs': {}}
|
||||
}
|
||||
for p_attr in cqi_p_attrs.values():
|
||||
static_corpus_data['corpus']['freqs'][p_attr.name] = {}
|
||||
chunk_size = 10000
|
||||
p_attr_id_list = list(range(p_attr.lexicon_size))
|
||||
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
|
||||
|
||||
for p_attr in cqi_p_attrs:
|
||||
print(f'corpus.freqs.{p_attr.name}')
|
||||
static_data['corpus']['freqs'][p_attr.name] = []
|
||||
p_attr_id_list: List[int] = list(range(p_attr.lexicon_size))
|
||||
static_data['corpus']['freqs'][p_attr.name].extend(p_attr.freqs_by_ids(p_attr_id_list))
|
||||
del p_attr_id_list
|
||||
for chunk in chunks:
|
||||
# print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||
static_corpus_data['corpus']['freqs'][p_attr.name].update(
|
||||
dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
|
||||
)
|
||||
del chunks
|
||||
static_corpus_data['p_attrs'][p_attr.name] = {}
|
||||
cpos_list = list(range(cqi_corpus.size))
|
||||
chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
|
||||
|
||||
print(f'p_attrs.{p_attr.name}')
|
||||
static_data['p_attrs'][p_attr.name] = []
|
||||
cpos_list: List[int] = list(range(cqi_corpus.size))
|
||||
static_data['p_attrs'][p_attr.name].extend(p_attr.ids_by_cpos(cpos_list))
|
||||
del cpos_list
|
||||
for chunk in chunks:
|
||||
# print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||
static_corpus_data['p_attrs'][p_attr.name].update(
|
||||
dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
|
||||
)
|
||||
del chunks
|
||||
static_corpus_data['values']['p_attrs'][p_attr.name] = {}
|
||||
p_attr_id_list = list(range(p_attr.lexicon_size))
|
||||
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
|
||||
|
||||
print(f'values.p_attrs.{p_attr.name}')
|
||||
static_data['values']['p_attrs'][p_attr.name] = []
|
||||
p_attr_id_list: List[int] = list(range(p_attr.lexicon_size))
|
||||
static_data['values']['p_attrs'][p_attr.name].extend(p_attr.values_by_ids(p_attr_id_list))
|
||||
del p_attr_id_list
|
||||
for chunk in chunks:
|
||||
# print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
|
||||
static_corpus_data['values']['p_attrs'][p_attr.name].update(
|
||||
dict(zip(chunk, p_attr.values_by_ids(chunk)))
|
||||
)
|
||||
del chunks
|
||||
for s_attr in cqi_s_attrs.values():
|
||||
|
||||
for s_attr in cqi_s_attrs:
|
||||
if s_attr.has_values:
|
||||
continue
|
||||
static_corpus_data['corpus']['counts'][s_attr.name] = s_attr.size
|
||||
static_corpus_data['s_attrs'][s_attr.name] = {'lexicon': {}, 'values': None}
|
||||
static_corpus_data['values']['s_attrs'][s_attr.name] = {}
|
||||
##########################################################################
|
||||
# A faster way to get cpos boundaries for smaller s_attrs #
|
||||
##########################################################################
|
||||
# if s_attr.name in ['s', 'ent']:
|
||||
# cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
|
||||
# cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
|
||||
# first_match = 0
|
||||
# last_match = cqi_subcorpus.size - 1
|
||||
# match_boundaries = zip(
|
||||
# range(first_match, last_match + 1),
|
||||
# cqi_subcorpus.dump(cqi_subcorpus.fields['match'], first_match, last_match),
|
||||
# cqi_subcorpus.dump(cqi_subcorpus.fields['matchend'], first_match, last_match)
|
||||
# )
|
||||
# for id, lbound, rbound in match_boundaries:
|
||||
# static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
|
||||
# static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
|
||||
# static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
|
||||
# static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
|
||||
# cqi_subcorpus.drop()
|
||||
|
||||
static_data['s_attrs'][s_attr.name] = {'lexicon': [], 'values': None}
|
||||
|
||||
if s_attr.name in ['s', 'ent']:
|
||||
##############################################################
|
||||
# A faster way to get cpos boundaries for smaller s_attrs #
|
||||
# Note: Needs more testing, don't use it in production #
|
||||
##############################################################
|
||||
cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
|
||||
cqi_subcorpus: CQiSubcorpus = cqi_corpus.subcorpora.get('Last')
|
||||
first_match: int = 0
|
||||
last_match: int = cqi_subcorpus.size - 1
|
||||
match_boundaries = zip(
|
||||
range(first_match, last_match + 1),
|
||||
cqi_subcorpus.dump(
|
||||
cqi_subcorpus.fields['match'],
|
||||
first_match,
|
||||
last_match
|
||||
),
|
||||
cqi_subcorpus.dump(
|
||||
cqi_subcorpus.fields['matchend'],
|
||||
first_match,
|
||||
last_match
|
||||
)
|
||||
)
|
||||
cqi_subcorpus.drop()
|
||||
del cqi_subcorpus, first_match, last_match
|
||||
for id, lbound, rbound in match_boundaries:
|
||||
static_data['s_attrs'][s_attr.name]['lexicon'].append({})
|
||||
print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
|
||||
static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
|
||||
del match_boundaries
|
||||
|
||||
if s_attr.name != 'text':
|
||||
continue
|
||||
|
||||
for id in range(0, s_attr.size):
|
||||
# print(f's_attrs.{s_attr.name}.lexicon.{id}')
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
|
||||
'bounds': None,
|
||||
'counts': None,
|
||||
'freqs': None
|
||||
}
|
||||
if s_attr.name != 'text':
|
||||
continue
|
||||
static_data['s_attrs'][s_attr.name]['lexicon'].append({})
|
||||
# This is a very slow operation, that's why we only use it for
|
||||
# the text attribute
|
||||
lbound, rbound = s_attr.cpos_by_id(id)
|
||||
# print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
|
||||
# print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
|
||||
cpos_list = list(range(lbound, rbound + 1))
|
||||
chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
|
||||
del cpos_list
|
||||
ent_ids = set()
|
||||
for chunk in chunks:
|
||||
# print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||
ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
|
||||
del ent_ids
|
||||
s_ids = set()
|
||||
for chunk in chunks:
|
||||
# print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||
s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
|
||||
del s_ids
|
||||
# print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
|
||||
for p_attr in cqi_p_attrs.values():
|
||||
p_attr_ids = []
|
||||
for chunk in chunks:
|
||||
# print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
|
||||
p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
|
||||
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
|
||||
print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
|
||||
static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
|
||||
static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
|
||||
cpos_list: List[int] = list(range(lbound, rbound + 1))
|
||||
for p_attr in cqi_p_attrs:
|
||||
p_attr_ids: List[int] = []
|
||||
p_attr_ids.extend(p_attr.ids_by_cpos(cpos_list))
|
||||
print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs.{p_attr.name}')
|
||||
static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
|
||||
del p_attr_ids
|
||||
del chunks
|
||||
sub_s_attrs = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
|
||||
s_attr_value_names: List[str] = [
|
||||
del cpos_list
|
||||
|
||||
sub_s_attrs: List[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
|
||||
print(f's_attrs.{s_attr.name}.values')
|
||||
static_data['s_attrs'][s_attr.name]['values'] = [
|
||||
sub_s_attr.name[(len(s_attr.name) + 1):]
|
||||
for sub_s_attr in sub_s_attrs
|
||||
]
|
||||
s_attr_id_list = list(range(s_attr.size))
|
||||
chunks = [s_attr_id_list[i:i+chunk_size] for i in range(0, len(s_attr_id_list), chunk_size)]
|
||||
del s_attr_id_list
|
||||
sub_s_attr_values = []
|
||||
s_attr_id_list: List[int] = list(range(s_attr.size))
|
||||
sub_s_attr_values: List[str] = []
|
||||
for sub_s_attr in sub_s_attrs:
|
||||
tmp = []
|
||||
for chunk in chunks:
|
||||
tmp.extend(sub_s_attr.values_by_ids(chunk))
|
||||
tmp.extend(sub_s_attr.values_by_ids(s_attr_id_list))
|
||||
sub_s_attr_values.append(tmp)
|
||||
del tmp
|
||||
del chunks
|
||||
# print(f's_attrs.{s_attr.name}.values')
|
||||
static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
|
||||
# print(f'values.s_attrs.{s_attr.name}')
|
||||
static_corpus_data['values']['s_attrs'][s_attr.name] = {
|
||||
s_attr_id: {
|
||||
s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
|
||||
del s_attr_id_list
|
||||
print(f'values.s_attrs.{s_attr.name}')
|
||||
static_data['values']['s_attrs'][s_attr.name] = [
|
||||
{
|
||||
s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id]
|
||||
for s_attr_value_name_idx, s_attr_value_name in enumerate(
|
||||
static_corpus_data['s_attrs'][s_attr.name]['values']
|
||||
static_data['s_attrs'][s_attr.name]['values']
|
||||
)
|
||||
} for s_attr_id_idx, s_attr_id in enumerate(range(0, s_attr.size))
|
||||
}
|
||||
} for s_attr_id in range(0, s_attr.size)
|
||||
]
|
||||
del sub_s_attr_values
|
||||
with gzip.open(cache_file_path, 'wt') as f:
|
||||
json.dump(static_corpus_data, f)
|
||||
del static_corpus_data
|
||||
with open(cache_file_path, 'rb') as f:
|
||||
print('Saving static data to file')
|
||||
with gzip.open(static_data_file_path, 'wt') as f:
|
||||
json.dump(static_data, f)
|
||||
del static_data
|
||||
print('Sending static data to client')
|
||||
with open(static_data_file_path, 'rb') as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
|
@ -93,8 +93,8 @@ class CorpusAnalysisStaticVisualization {
|
||||
|
||||
renderGeneralCorpusInfo() {
|
||||
let corpusData = this.data.corpus.o.staticData;
|
||||
document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token;
|
||||
document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s;
|
||||
document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.bounds[1] - corpusData.corpus.bounds[0];
|
||||
document.querySelector('.corpus-num-s').innerHTML = corpusData.s_attrs.s.lexicon.length;
|
||||
document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length;
|
||||
document.querySelector('.corpus-num-unique-lemmas').innerHTML = Object.entries(corpusData.corpus.freqs.lemma).length;
|
||||
document.querySelector('.corpus-num-unique-pos').innerHTML = Object.entries(corpusData.corpus.freqs.pos).length;
|
||||
@ -111,8 +111,11 @@ class CorpusAnalysisStaticVisualization {
|
||||
let resource = {
|
||||
title: corpusData.values.s_attrs.text[i].title,
|
||||
publishing_year: corpusData.values.s_attrs.text[i].publishing_year,
|
||||
num_tokens: corpusData.s_attrs.text.lexicon[i].counts.token,
|
||||
num_sentences: corpusData.s_attrs.text.lexicon[i].counts.s,
|
||||
// num_sentences: corpusData.s_attrs.text.lexicon[i].counts.s,
|
||||
num_tokens: corpusData.s_attrs.text.lexicon[i].bounds[1] - corpusData.s_attrs.text.lexicon[i].bounds[0],
|
||||
num_sentences: corpusData.s_attrs.s.lexicon.filter((s) => {
|
||||
return s.bounds[0] >= corpusData.s_attrs.text.lexicon[i].bounds[0] && s.bounds[1] <= corpusData.s_attrs.text.lexicon[i].bounds[1];
|
||||
}).length,
|
||||
num_unique_words: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.word).length,
|
||||
num_unique_lemmas: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.lemma).length,
|
||||
num_unique_pos: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.pos).length,
|
||||
@ -125,7 +128,7 @@ class CorpusAnalysisStaticVisualization {
|
||||
corpusTextInfoList.add(textData);
|
||||
|
||||
let textCountChipElement = document.querySelector('.text-count-chip');
|
||||
textCountChipElement.innerHTML = `Text count: ${corpusData.corpus.counts.text}`;
|
||||
textCountChipElement.innerHTML = `Text count: ${corpusData.s_attrs.text.lexicon.length}`;
|
||||
}
|
||||
|
||||
renderTextProportionsGraphic() {
|
||||
@ -198,7 +201,7 @@ class CorpusAnalysisStaticVisualization {
|
||||
default:
|
||||
graphData = [
|
||||
{
|
||||
values: texts.map(text => text[1].counts.token),
|
||||
values: texts.map(text => text[1].bounds[1] - text[1].bounds[0]),
|
||||
labels: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`),
|
||||
type: graphtype
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
apifairy
|
||||
cqi>=0.1.6
|
||||
cqi>=0.1.7
|
||||
dnspython==2.2.1
|
||||
docker
|
||||
eventlet
|
||||
|
Loading…
x
Reference in New Issue
Block a user