diff --git a/app/corpora/cqi_over_socketio/cqi_corpora_corpus.py b/app/corpora/cqi_over_socketio/cqi_corpora_corpus.py
index 79b1a800..fff6a0b3 100644
--- a/app/corpora/cqi_over_socketio/cqi_corpora_corpus.py
+++ b/app/corpora/cqi_over_socketio/cqi_corpora_corpus.py
@@ -49,62 +49,131 @@ def cqi_corpora_corpus_update_db(cqi_client: cqi.CQiClient, corpus_name: str):
 @socketio_login_required
 @cqi_over_socketio
 def cqi_corpora_corpus_get_visualization_data(cqi_client: cqi.CQiClient, corpus_name: str):
-    cqi_corpus = cqi_client.corpora.get(corpus_name)
+    corpus = cqi_client.corpora.get(corpus_name)
+    text = corpus.structural_attributes.get('text')
+    s = corpus.structural_attributes.get('s')
+    ent = corpus.structural_attributes.get('ent')
+    word = corpus.positional_attributes.get('word')
+    lemma = corpus.positional_attributes.get('lemma')
+    pos = corpus.positional_attributes.get('pos')
+    simple_pos = corpus.positional_attributes.get('simple_pos')
     payload = {}
-    payload['num_tokens'] = cqi_corpus.size
-    cqi_word_attr = cqi_corpus.positional_attributes.get('word')
-    payload['num_unique_words'] = cqi_word_attr.lexicon_size
-    payload['word_freqs'] = dict(zip(cqi_word_attr.values_by_ids(list(range(0, cqi_word_attr.lexicon_size))), cqi_word_attr.freqs_by_ids(list(range(0, cqi_word_attr.lexicon_size)))))
-    # payload['word_freqs'].sort(key=lambda a: a[1], reverse=True)
-    # payload['word_freqs'] = {k: v for k, v in payload['word_freqs']}
-    cqi_lemma_attr = cqi_corpus.positional_attributes.get('lemma')
-    payload['num_unique_lemmas'] = cqi_lemma_attr.lexicon_size
-    payload['lemma_freqs'] = dict(zip(cqi_lemma_attr.values_by_ids(list(range(0, cqi_lemma_attr.lexicon_size))), cqi_lemma_attr.freqs_by_ids(list(range(0, cqi_lemma_attr.lexicon_size)))))
-    # payload['lemma_freqs'].sort(key=lambda a: a[1], reverse=True)
-    # payload['lemma_freqs'] = {k: v for k, v in payload['lemma_freqs']}
-    cqi_s_attr = cqi_corpus.structural_attributes.get('s')
-    payload['num_sentences'] = cqi_s_attr.size
-    # assuming all tokens are in a sentence
-    payload['average_sentence_length'] = payload['num_tokens'] / payload['num_sentences'] if payload['num_sentences'] != 0 else 0
-    # payload['average_sentence_length'] = 0
-    # for s_id in range(0, cqi_s_attr.size):
-    #     s_lbound, s_rbound = cqi_s_attr.cpos_by_id(s_id)
-    #     payload['average_sentence_length'] += s_rbound - s_lbound + 1
-    # payload['average_sentence_length'] /= payload['num_sentences']
-    cqi_ent_type_attr = cqi_corpus.structural_attributes.get('ent_type')
-    payload['num_ent_types'] = cqi_ent_type_attr.size
-    payload['ent_type_freqs'] = dict(Counter(cqi_ent_type_attr.values_by_ids(list(range(0, cqi_ent_type_attr.size)))))
-    payload['num_unique_ent_types'] = len(payload['ent_type_freqs'])
-    payload['texts'] = []
-    cqi_text_attr = cqi_corpus.structural_attributes.get('text')
-    for text_id in range(0, cqi_text_attr.size):
-        text_lbound, text_rbound = cqi_text_attr.cpos_by_id(text_id)
-        text_cpos_list = list(range(text_lbound, text_rbound + 1))
-        text_payload = {}
-        text_payload['num_tokens'] = text_rbound - text_lbound + 1
-        text_word_ids = cqi_word_attr.ids_by_cpos(text_cpos_list)
-        print(text_word_ids)
-        text_payload['num_unique_words'] = len(set(text_word_ids))
-        text_payload['word_freqs'] = dict(Counter(cqi_word_attr.values_by_ids(text_word_ids)))
-        text_lemma_ids = cqi_lemma_attr.ids_by_cpos(text_cpos_list)
-        text_payload['num_unique_lemmas'] = len(set(text_lemma_ids))
-        text_payload['lemma_freqs'] = dict(Counter(cqi_word_attr.values_by_ids(text_lemma_ids)))
-        text_s_attr_ids = list(filter(lambda x: x != -1, cqi_s_attr.ids_by_cpos(text_cpos_list)))
-        text_payload['num_sentences'] = len(set(text_s_attr_ids))
-        # assuming all tokens are in a sentence
-        text_payload['average_sentence_length'] = text_payload['num_tokens'] / text_payload['num_sentences'] if text_payload['num_sentences'] != 0 else 0
-        # text_payload['average_sentence_length'] = 0
-        # for text_s_id in range(0, cqi_s_attr.size):
-        #     text_s_lbound, text_s_rbound = cqi_s_attr.cpos_by_id(text_s_id)
-        #     text_payload['average_sentence_length'] += text_s_rbound - text_s_lbound + 1
-        # text_payload['average_sentence_length'] /= text_payload['num_sentences']
-        text_ent_type_ids = list(filter(lambda x: x != -1, cqi_ent_type_attr.ids_by_cpos(text_cpos_list)))
-        text_payload['num_ent_types'] = len(set(text_ent_type_ids))
-        text_payload['ent_type_freqs'] = dict(Counter(cqi_ent_type_attr.values_by_ids(text_ent_type_ids)))
-        text_payload['num_unique_ent_types'] = len(text_payload['ent_type_freqs'])
-        for text_sub_attr in cqi_corpus.structural_attributes.list(filters={'part_of': cqi_text_attr}):
-            text_payload[text_sub_attr.name[(len(cqi_text_attr.name) + 1):]] = text_sub_attr.values_by_ids([text_id])[0]
-        payload['texts'].append(text_payload)
+    payload['corpus'] = {'lexicon': {}, 'values': []}
+    payload['corpus']['lexicon'][0] = {
+        'bounds': [0, corpus.size - 1],
+        'counts': {
+            'text': text.size,
+            's': s.size,
+            'ent': ent.size,
+            'token': corpus.size
+        },
+        'freqs': {
+            'word': dict(
+                zip(
+                    range(0, word.lexicon_size),
+                    word.freqs_by_ids(list(range(0, word.lexicon_size)))
+                )
+            ),
+            'lemma': dict(
+                zip(
+                    range(0, lemma.lexicon_size),
+                    lemma.freqs_by_ids(list(range(0, lemma.lexicon_size)))
+                )
+            ),
+            'pos': dict(
+                zip(
+                    range(0, pos.lexicon_size),
+                    pos.freqs_by_ids(list(range(0, pos.lexicon_size)))
+                )
+            ),
+            'simple_pos': dict(
+                zip(
+                    range(0, simple_pos.lexicon_size),
+                    simple_pos.freqs_by_ids(list(range(0, simple_pos.lexicon_size)))
+                )
+            )
+        }
+    }
+    payload['text'] = {'lexicon': {}, 'values': None}
+    for text_id in range(0, text.size):
+        text_lbound, text_rbound = text.cpos_by_id(text_id)
+        text_cpos_range = range(text_lbound, text_rbound + 1)
+        text_s_ids = s.ids_by_cpos(list(text_cpos_range))
+        text_ent_ids = ent.ids_by_cpos(list(text_cpos_range))
+        payload['text']['lexicon'][text_id] = {
+            'bounds': [text_lbound, text_rbound],
+            'counts': {
+                's': len([x for x in text_s_ids if x != -1]),
+                'ent': len([x for x in text_ent_ids if x != -1]),
+                'token': text_rbound - text_lbound + 1
+            },
+            'freqs': {
+                'word': dict(
+                    Counter(word.ids_by_cpos(list(text_cpos_range)))
+                ),
+                'lemma': dict(
+                    Counter(lemma.ids_by_cpos(list(text_cpos_range)))
+                ),
+                'pos': dict(
+                    Counter(pos.ids_by_cpos(list(text_cpos_range)))
+                ),
+                'simple_pos': dict(
+                    Counter(simple_pos.ids_by_cpos(list(text_cpos_range)))
+                )
+            }
+        }
+    payload['text']['values'] = [
+        sub_attr.name[(len(text.name) + 1):]
+        for sub_attr in corpus.structural_attributes.list(filters={'part_of': text})
+    ]
+    payload['s'] = {'lexicon': {}, 'values': None}
+    for s_id in range(0, s.size):
+        payload['s']['lexicon'][s_id] = {
+            # 'bounds': s.cpos_by_id(s_id)
+        }
+    payload['s']['values'] = [
+        sub_attr.name[(len(s.name) + 1):]
+        for sub_attr in corpus.structural_attributes.list(filters={'part_of': s})
+    ]
+    payload['ent'] = {'lexicon': {}, 'values': None}
+    for ent_id in range(0, ent.size):
+        payload['ent']['lexicon'][ent_id] = {
+            # 'bounds': ent.cpos_by_id(ent_id)
+        }
+    payload['ent']['values'] = [
+        sub_attr.name[(len(ent.name) + 1):]
+        for sub_attr in corpus.structural_attributes.list(filters={'part_of': ent})
+    ]
+    payload['lookups'] = {
+        'corpus': {},
+        'text': {},
+        's': {},
+        'ent': {},
+        'word': dict(
+            zip(
+                range(0, word.lexicon_size),
+                word.values_by_ids(list(range(0, word.lexicon_size)))
+            )
+        ),
+        'lemma': dict(
+            zip(
+                range(0, lemma.lexicon_size),
+                lemma.values_by_ids(list(range(0, lemma.lexicon_size)))
+            )
+        ),
+        'pos': dict(
+            zip(
+                range(0, pos.lexicon_size),
+                pos.values_by_ids(list(range(0, pos.lexicon_size)))
+            )
+        ),
+        'simple_pos': dict(
+            zip(
+                range(0, simple_pos.lexicon_size),
+                simple_pos.values_by_ids(list(range(0, simple_pos.lexicon_size)))
+            )
+        )
+    }
     # print(payload)
     return {'code': 200, 'msg': 'OK', 'payload': payload}
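
Note on the new payload shape: this change replaces the old flat keys (num_tokens, word_freqs, texts, ...) with one sub-object per attribute level, and the frequency maps are now keyed by attribute id instead of by surface string. A rough sketch of the returned structure, with made-up illustrative ids, counts, and strings (the 'values' entries list sub-attribute names; 'author' and 'title' are hypothetical examples, the real names depend on the corpus):

    payload = {
        'corpus': {
            'lexicon': {
                0: {
                    'bounds': [0, 11739],  # first and last corpus position
                    'counts': {'text': 2, 's': 589, 'ent': 204, 'token': 11740},
                    'freqs': {
                        'word': {0: 342, 1: 127},  # word id -> frequency
                        'lemma': {0: 351, 1: 130},
                        'pos': {0: 1843, 1: 904},
                        'simple_pos': {0: 2710, 1: 1526}
                    }
                }
            },
            'values': []
        },
        'text': {
            'lexicon': {
                0: {
                    'bounds': [0, 5869],
                    'counts': {'s': 295, 'ent': 102, 'token': 5870},
                    'freqs': {'word': {0: 171}, 'lemma': {0: 176}, 'pos': {0: 922}, 'simple_pos': {0: 1355}}
                }
            },
            'values': ['author', 'title']  # hypothetical text sub-attributes
        },
        's': {'lexicon': {0: {}, 1: {}}, 'values': []},    # per-sentence entries are empty placeholders for now
        'ent': {'lexicon': {0: {}, 1: {}}, 'values': []},  # per-entity entries are empty placeholders for now
        'lookups': {
            'corpus': {}, 'text': {}, 's': {}, 'ent': {},
            'word': {0: 'the', 1: 'of'},  # id -> value tables for resolving the freqs above
            'lemma': {0: 'the', 1: 'of'},
            'pos': {0: 'DT', 1: 'IN'},
            'simple_pos': {0: 'NOUN', 1: 'VERB'}
        }
    }

Id-keyed freqs plus a single id -> value table per positional attribute mean each distinct string crosses the wire once instead of being repeated as a dict key in every frequency map, which should keep the payload noticeably smaller for large corpora.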
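Because the freqs maps are id-keyed, a consumer has to join them against payload['lookups'] to get human-readable frequencies. A minimal sketch of that join (resolve_freqs and its parameters are hypothetical names, not part of this change):

    from collections import Counter

    def resolve_freqs(payload, text_id, p_attr='word', top_n=10):
        # id -> frequency map for one text, as built in the loop above
        id_freqs = payload['text']['lexicon'][text_id]['freqs'][p_attr]
        # id -> value table for the same positional attribute
        lookup = payload['lookups'][p_attr]
        # join the two maps and return the top_n (value, frequency) pairs
        counter = Counter({lookup[attr_id]: freq for attr_id, freq in id_freqs.items()})
        return counter.most_common(top_n)

One caveat: if the payload is JSON-serialized on its way to the browser, the integer keys in freqs and lookups arrive as strings, so a JavaScript consumer would join on string keys instead.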
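The derived statistics the old code precomputed are no longer part of the payload, but they follow directly from counts; e.g. the former average_sentence_length (tokens per sentence, under the removed code's own "assuming all tokens are in a sentence" caveat) works for both the corpus-level and the text-level lexicon entries:

    def average_sentence_length(lexicon_entry):
        # lexicon_entry: one entry of payload['corpus']['lexicon'] or payload['text']['lexicon']
        counts = lexicon_entry['counts']
        return counts['token'] / counts['s'] if counts['s'] != 0 else 0

Likewise, the old num_unique_words for a text is simply len(lexicon_entry['freqs']['word']).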