diff --git a/Dockerfile b/Dockerfile
index fe63463e..750fee93 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.8.10-slim-buster
+FROM python:3.11.5-slim-bookworm
LABEL authors="Patrick Jentsch
"
diff --git a/app/corpora/cqi_over_sio/__init__.py b/app/corpora/cqi_over_sio/__init__.py
index 9a0f478d..888bfaae 100644
--- a/app/corpora/cqi_over_sio/__init__.py
+++ b/app/corpora/cqi_over_sio/__init__.py
@@ -121,10 +121,7 @@ class CQiNamespace(Namespace):
socketio.sleep(3)
retry_counter -= 1
db.session.refresh(db_corpus)
- cqi_client: CQiClient = CQiClient(
- f'cqpserver_{db_corpus_id}',
- timeout=float('inf')
- )
+ cqi_client: CQiClient = CQiClient(f'cqpserver_{db_corpus_id}')
session['cqi_over_sio'] = {
'cqi_client': cqi_client,
'cqi_client_lock': Lock(),
diff --git a/app/corpora/cqi_over_sio/extensions.py b/app/corpora/cqi_over_sio/extensions.py
index 70ee5d97..6748b963 100644
--- a/app/corpora/cqi_over_sio/extensions.py
+++ b/app/corpora/cqi_over_sio/extensions.py
@@ -1,6 +1,7 @@
from collections import Counter
from cqi import CQiClient
from cqi.models.corpora import Corpus as CQiCorpus
+from cqi.models.subcorpora import Subcorpus as CQiSubcorpus
from cqi.models.attributes import (
PositionalAttribute as CQiPositionalAttribute,
StructuralAttribute as CQiStructuralAttribute
@@ -40,161 +41,132 @@ def ext_corpus_update_db(corpus: str) -> CQiStatusOk:
def ext_corpus_static_data(corpus: str) -> Dict:
db_corpus_id: int = session['cqi_over_sio']['db_corpus_id']
db_corpus: Corpus = Corpus.query.get(db_corpus_id)
- cache_file_path: str = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
- if os.path.exists(cache_file_path):
- with open(cache_file_path, 'rb') as f:
+
+ static_data_file_path: str = os.path.join(db_corpus.path, 'cwb', 'static.json.gz')
+ if os.path.exists(static_data_file_path):
+ with open(static_data_file_path, 'rb') as f:
return f.read()
+
cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
cqi_corpus: CQiCorpus = cqi_client.corpora.get(corpus)
- cqi_p_attrs: Dict[str, CQiPositionalAttribute] = {
- p_attr.name: p_attr
- for p_attr in cqi_corpus.positional_attributes.list()
- }
- cqi_s_attrs: Dict[str, CQiStructuralAttribute] = {
- s_attr.name: s_attr
- for s_attr in cqi_corpus.structural_attributes.list()
- }
- static_corpus_data = {
+ cqi_p_attrs: List[CQiPositionalAttribute] = cqi_corpus.positional_attributes.list()
+ cqi_s_attrs: List[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list()
+
+ static_data = {
'corpus': {
'bounds': [0, cqi_corpus.size - 1],
- 'counts': {
- 'token': cqi_corpus.size
- },
'freqs': {}
},
'p_attrs': {},
's_attrs': {},
'values': {'p_attrs': {}, 's_attrs': {}}
}
- for p_attr in cqi_p_attrs.values():
- static_corpus_data['corpus']['freqs'][p_attr.name] = {}
- chunk_size = 10000
- p_attr_id_list = list(range(p_attr.lexicon_size))
- chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
+
+ for p_attr in cqi_p_attrs:
+ print(f'corpus.freqs.{p_attr.name}')
+ static_data['corpus']['freqs'][p_attr.name] = []
+ p_attr_id_list: List[int] = list(range(p_attr.lexicon_size))
+ static_data['corpus']['freqs'][p_attr.name].extend(p_attr.freqs_by_ids(p_attr_id_list))
del p_attr_id_list
- for chunk in chunks:
- # print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
- static_corpus_data['corpus']['freqs'][p_attr.name].update(
- dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
- )
- del chunks
- static_corpus_data['p_attrs'][p_attr.name] = {}
- cpos_list = list(range(cqi_corpus.size))
- chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
+
+ print(f'p_attrs.{p_attr.name}')
+ static_data['p_attrs'][p_attr.name] = []
+ cpos_list: List[int] = list(range(cqi_corpus.size))
+ static_data['p_attrs'][p_attr.name].extend(p_attr.ids_by_cpos(cpos_list))
del cpos_list
- for chunk in chunks:
- # print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
- static_corpus_data['p_attrs'][p_attr.name].update(
- dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
- )
- del chunks
- static_corpus_data['values']['p_attrs'][p_attr.name] = {}
- p_attr_id_list = list(range(p_attr.lexicon_size))
- chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
+
+ print(f'values.p_attrs.{p_attr.name}')
+ static_data['values']['p_attrs'][p_attr.name] = []
+ p_attr_id_list: List[int] = list(range(p_attr.lexicon_size))
+ static_data['values']['p_attrs'][p_attr.name].extend(p_attr.values_by_ids(p_attr_id_list))
del p_attr_id_list
- for chunk in chunks:
- # print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
- static_corpus_data['values']['p_attrs'][p_attr.name].update(
- dict(zip(chunk, p_attr.values_by_ids(chunk)))
- )
- del chunks
- for s_attr in cqi_s_attrs.values():
+
+ for s_attr in cqi_s_attrs:
if s_attr.has_values:
continue
- static_corpus_data['corpus']['counts'][s_attr.name] = s_attr.size
- static_corpus_data['s_attrs'][s_attr.name] = {'lexicon': {}, 'values': None}
- static_corpus_data['values']['s_attrs'][s_attr.name] = {}
- ##########################################################################
- # A faster way to get cpos boundaries for smaller s_attrs #
- ##########################################################################
- # if s_attr.name in ['s', 'ent']:
- # cqi_corpus.query('Last', f'<{s_attr.name}> []* {s_attr.name}>;')
- # cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
- # first_match = 0
- # last_match = cqi_subcorpus.size - 1
- # match_boundaries = zip(
- # range(first_match, last_match + 1),
- # cqi_subcorpus.dump(cqi_subcorpus.fields['match'], first_match, last_match),
- # cqi_subcorpus.dump(cqi_subcorpus.fields['matchend'], first_match, last_match)
- # )
- # for id, lbound, rbound in match_boundaries:
- # static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
- # static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
- # static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
- # static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
- # cqi_subcorpus.drop()
+
+ static_data['s_attrs'][s_attr.name] = {'lexicon': [], 'values': None}
+
+ if s_attr.name in ['s', 'ent']:
+ ##############################################################
+ # A faster way to get cpos boundaries for smaller s_attrs #
+ # Note: Needs more testing, don't use it in production #
+ ##############################################################
+ cqi_corpus.query('Last', f'<{s_attr.name}> []* {s_attr.name}>;')
+ cqi_subcorpus: CQiSubcorpus = cqi_corpus.subcorpora.get('Last')
+ first_match: int = 0
+ last_match: int = cqi_subcorpus.size - 1
+ match_boundaries = zip(
+ range(first_match, last_match + 1),
+ cqi_subcorpus.dump(
+ cqi_subcorpus.fields['match'],
+ first_match,
+ last_match
+ ),
+ cqi_subcorpus.dump(
+ cqi_subcorpus.fields['matchend'],
+ first_match,
+ last_match
+ )
+ )
+ cqi_subcorpus.drop()
+ del cqi_subcorpus, first_match, last_match
+ for id, lbound, rbound in match_boundaries:
+ static_data['s_attrs'][s_attr.name]['lexicon'].append({})
+ print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
+ static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+ del match_boundaries
+
+ if s_attr.name != 'text':
+ continue
+
for id in range(0, s_attr.size):
- # print(f's_attrs.{s_attr.name}.lexicon.{id}')
- static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
- 'bounds': None,
- 'counts': None,
- 'freqs': None
- }
- if s_attr.name != 'text':
- continue
+ static_data['s_attrs'][s_attr.name]['lexicon'].append({})
+ # This is a very slow operation, that's why we only use it for
+ # the text attribute
lbound, rbound = s_attr.cpos_by_id(id)
- # print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
- static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
- # print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
- static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
- static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
- cpos_list = list(range(lbound, rbound + 1))
- chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
- del cpos_list
- ent_ids = set()
- for chunk in chunks:
- # print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
- ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
- static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
- del ent_ids
- s_ids = set()
- for chunk in chunks:
- # print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
- s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
- static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
- del s_ids
- # print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
- static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
- for p_attr in cqi_p_attrs.values():
- p_attr_ids = []
- for chunk in chunks:
- # print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
- p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
- static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
+ print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
+ static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+ static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
+ cpos_list: List[int] = list(range(lbound, rbound + 1))
+ for p_attr in cqi_p_attrs:
+ p_attr_ids: List[int] = []
+ p_attr_ids.extend(p_attr.ids_by_cpos(cpos_list))
+ print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs.{p_attr.name}')
+ static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
del p_attr_ids
- del chunks
- sub_s_attrs = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
- s_attr_value_names: List[str] = [
+ del cpos_list
+
+ sub_s_attrs: List[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
+ print(f's_attrs.{s_attr.name}.values')
+ static_data['s_attrs'][s_attr.name]['values'] = [
sub_s_attr.name[(len(s_attr.name) + 1):]
for sub_s_attr in sub_s_attrs
]
- s_attr_id_list = list(range(s_attr.size))
- chunks = [s_attr_id_list[i:i+chunk_size] for i in range(0, len(s_attr_id_list), chunk_size)]
- del s_attr_id_list
- sub_s_attr_values = []
+ s_attr_id_list: List[int] = list(range(s_attr.size))
+ sub_s_attr_values: List[str] = []
for sub_s_attr in sub_s_attrs:
tmp = []
- for chunk in chunks:
- tmp.extend(sub_s_attr.values_by_ids(chunk))
+ tmp.extend(sub_s_attr.values_by_ids(s_attr_id_list))
sub_s_attr_values.append(tmp)
del tmp
- del chunks
- # print(f's_attrs.{s_attr.name}.values')
- static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
- # print(f'values.s_attrs.{s_attr.name}')
- static_corpus_data['values']['s_attrs'][s_attr.name] = {
- s_attr_id: {
- s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
+ del s_attr_id_list
+ print(f'values.s_attrs.{s_attr.name}')
+ static_data['values']['s_attrs'][s_attr.name] = [
+ {
+ s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id]
for s_attr_value_name_idx, s_attr_value_name in enumerate(
- static_corpus_data['s_attrs'][s_attr.name]['values']
+ static_data['s_attrs'][s_attr.name]['values']
)
- } for s_attr_id_idx, s_attr_id in enumerate(range(0, s_attr.size))
- }
+ } for s_attr_id in range(0, s_attr.size)
+ ]
del sub_s_attr_values
- with gzip.open(cache_file_path, 'wt') as f:
- json.dump(static_corpus_data, f)
- del static_corpus_data
- with open(cache_file_path, 'rb') as f:
+ print('Saving static data to file')
+ with gzip.open(static_data_file_path, 'wt') as f:
+ json.dump(static_data, f)
+ del static_data
+ print('Sending static data to client')
+ with open(static_data_file_path, 'rb') as f:
return f.read()
diff --git a/app/static/js/CorpusAnalysis/CorpusAnalysisStaticVisualization.js b/app/static/js/CorpusAnalysis/CorpusAnalysisStaticVisualization.js
index e51a3534..38c48c3c 100644
--- a/app/static/js/CorpusAnalysis/CorpusAnalysisStaticVisualization.js
+++ b/app/static/js/CorpusAnalysis/CorpusAnalysisStaticVisualization.js
@@ -93,8 +93,8 @@ class CorpusAnalysisStaticVisualization {
renderGeneralCorpusInfo() {
let corpusData = this.data.corpus.o.staticData;
- document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token;
- document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s;
+ document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.bounds[1] - corpusData.corpus.bounds[0];
+ document.querySelector('.corpus-num-s').innerHTML = corpusData.s_attrs.s.lexicon.length;
document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length;
document.querySelector('.corpus-num-unique-lemmas').innerHTML = Object.entries(corpusData.corpus.freqs.lemma).length;
document.querySelector('.corpus-num-unique-pos').innerHTML = Object.entries(corpusData.corpus.freqs.pos).length;
@@ -111,8 +111,11 @@ class CorpusAnalysisStaticVisualization {
let resource = {
title: corpusData.values.s_attrs.text[i].title,
publishing_year: corpusData.values.s_attrs.text[i].publishing_year,
- num_tokens: corpusData.s_attrs.text.lexicon[i].counts.token,
- num_sentences: corpusData.s_attrs.text.lexicon[i].counts.s,
+ // num_sentences: corpusData.s_attrs.text.lexicon[i].counts.s,
+ num_tokens: corpusData.s_attrs.text.lexicon[i].bounds[1] - corpusData.s_attrs.text.lexicon[i].bounds[0],
+ num_sentences: corpusData.s_attrs.s.lexicon.filter((s) => {
+ return s.bounds[0] >= corpusData.s_attrs.text.lexicon[i].bounds[0] && s.bounds[1] <= corpusData.s_attrs.text.lexicon[i].bounds[1];
+ }).length,
num_unique_words: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.word).length,
num_unique_lemmas: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.lemma).length,
num_unique_pos: Object.entries(corpusData.s_attrs.text.lexicon[i].freqs.pos).length,
@@ -125,7 +128,7 @@ class CorpusAnalysisStaticVisualization {
corpusTextInfoList.add(textData);
let textCountChipElement = document.querySelector('.text-count-chip');
- textCountChipElement.innerHTML = `Text count: ${corpusData.corpus.counts.text}`;
+ textCountChipElement.innerHTML = `Text count: ${corpusData.s_attrs.text.lexicon.length}`;
}
renderTextProportionsGraphic() {
@@ -198,7 +201,7 @@ class CorpusAnalysisStaticVisualization {
default:
graphData = [
{
- values: texts.map(text => text[1].counts.token),
+ values: texts.map(text => text[1].bounds[1] - text[1].bounds[0]),
labels: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`),
type: graphtype
}
diff --git a/requirements.txt b/requirements.txt
index e7d8fc9d..360325e0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
apifairy
-cqi>=0.1.6
+cqi>=0.1.7
dnspython==2.2.1
docker
eventlet
@@ -11,7 +11,8 @@ Flask-Hashids>=1.0.1
Flask-HTTPAuth
Flask-Login
Flask-Mail
-flask-marshmallow==0.14.0
+Flask-Marshmallow==0.14.0
+Flask-Menu==0.7.2
Flask-Migrate
Flask-Paranoid
Flask-SocketIO