Implementation of visdata v2

This commit is contained in:
Patrick Jentsch 2023-06-16 17:35:54 +02:00
parent e6d8d72e52
commit 972f514e6b

View File

@ -49,62 +49,131 @@ def cqi_corpora_corpus_update_db(cqi_client: cqi.CQiClient, corpus_name: str):
@socketio_login_required
@cqi_over_socketio
def cqi_corpora_corpus_get_visualization_data(cqi_client: cqi.CQiClient, corpus_name: str):
    """Assemble the v2 visualization payload for a corpus.

    Collects corpus-wide token/structure counts and id-based frequency
    tables, a per-text lexicon with bounds, counts and id frequencies,
    the sub-attribute value names for the text/s/ent structural
    attributes, and id -> surface-value lookup tables for the
    positional attributes.

    :param cqi_client: connected CQi client (injected by the decorators)
    :param corpus_name: name of the corpus to inspect
    :return: ``{'code': 200, 'msg': 'OK', 'payload': <dict>}``
    """
    corpus = cqi_client.corpora.get(corpus_name)
    # Structural attributes describe spans; positional attributes are
    # per-token annotation layers.
    text = corpus.structural_attributes.get('text')
    s = corpus.structural_attributes.get('s')
    ent = corpus.structural_attributes.get('ent')
    word = corpus.positional_attributes.get('word')
    lemma = corpus.positional_attributes.get('lemma')
    pos = corpus.positional_attributes.get('pos')
    simple_pos = corpus.positional_attributes.get('simple_pos')
    # The same four layers are processed identically in three places
    # below; iterate this mapping instead of copy-pasting the pattern.
    p_attrs = {'word': word, 'lemma': lemma, 'pos': pos, 'simple_pos': simple_pos}

    payload = {}

    # Corpus level: a single lexicon entry (id 0) spanning the whole corpus.
    payload['corpus'] = {'lexicon': {}, 'values': []}
    payload['corpus']['lexicon'][0] = {
        'bounds': [0, corpus.size - 1],
        'counts': {
            'text': text.size,
            's': s.size,
            'ent': ent.size,
            'token': corpus.size
        },
        # Frequency of every positional-attribute id over the whole corpus.
        'freqs': {
            name: dict(
                zip(
                    range(0, attr.lexicon_size),
                    attr.freqs_by_ids(list(range(0, attr.lexicon_size)))
                )
            )
            for name, attr in p_attrs.items()
        }
    }

    # Text level: bounds, structure counts and id frequencies per text.
    payload['text'] = {'lexicon': {}, 'values': None}
    for text_id in range(0, text.size):
        text_lbound, text_rbound = text.cpos_by_id(text_id)
        # Materialize the cpos list once and reuse it for all six
        # attribute queries instead of rebuilding it per attribute.
        text_cpos_list = list(range(text_lbound, text_rbound + 1))
        text_s_ids = s.ids_by_cpos(text_cpos_list)
        text_ent_ids = ent.ids_by_cpos(text_cpos_list)
        payload['text']['lexicon'][text_id] = {
            'bounds': [text_lbound, text_rbound],
            'counts': {
                # ids_by_cpos returns one id per token position (-1 where
                # no structure covers the position), so distinct structures
                # must be counted through a set — a plain len() would count
                # every token inside each sentence/entity.
                's': len({x for x in text_s_ids if x != -1}),
                'ent': len({x for x in text_ent_ids if x != -1}),
                'token': text_rbound - text_lbound + 1
            },
            # Per-text frequency of each positional-attribute id.
            'freqs': {
                name: dict(Counter(attr.ids_by_cpos(text_cpos_list)))
                for name, attr in p_attrs.items()
            }
        }
    # Names of the sub-attributes attached to <text>, with the
    # "text_" prefix stripped (e.g. "text_author" -> "author").
    payload['text']['values'] = [
        sub_attr.name[(len(text.name) + 1):]
        for sub_attr in corpus.structural_attributes.list(filters={'part_of': text})
    ]

    # Sentence level: lexicon entries intentionally left empty for now
    # (bounds computation is commented out in this version).
    payload['s'] = {'lexicon': {}, 'values': None}
    for s_id in range(0, s.size):
        payload['s']['lexicon'][s_id] = {
            # 'bounds': s.cpos_by_id(s_id)
        }
    payload['s']['values'] = [
        sub_attr.name[(len(s.name) + 1):]
        for sub_attr in corpus.structural_attributes.list(filters={'part_of': s})
    ]

    # Entity level: same shape as the sentence level.
    payload['ent'] = {'lexicon': {}, 'values': None}
    for ent_id in range(0, ent.size):
        payload['ent']['lexicon'][ent_id] = {
            # 'bounds': ent.cpos_by_id(ent_id)
        }
    payload['ent']['values'] = [
        sub_attr.name[(len(ent.name) + 1):]
        for sub_attr in corpus.structural_attributes.list(filters={'part_of': ent})
    ]

    # Lookup tables: positional-attribute id -> surface value, so the
    # client can resolve the id-based freqs above. Structural lookups
    # are placeholders in this version.
    payload['lookups'] = {
        'corpus': {},
        'text': {},
        's': {},
        'ent': {},
        **{
            name: dict(
                zip(
                    range(0, attr.lexicon_size),
                    attr.values_by_ids(list(range(0, attr.lexicon_size)))
                )
            )
            for name, attr in p_attrs.items()
        }
    }
    return {'code': 200, 'msg': 'OK', 'payload': payload}