From e6d8d72e52f16033990c327c7b005627199bf01c Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Wed, 14 Jun 2023 14:50:04 +0200
Subject: [PATCH] Add visualization data method to cqi over socketio

Add the 'cqi.corpora.corpus.get_visualization_data' Socket.IO event. Its
handler builds a payload with corpus-wide counts and frequency tables
(tokens, words, lemmas, sentences, ent_type) and one entry per text with
the same figures plus the values of the text's sub-attributes. Per-text
lemma frequencies are resolved through the lemma attribute.

CQiCorpus.getVisualizationData() emits the event from the JS client and
resolves with the payload on code 200, rejecting otherwise. The call in
CorpusAnalysisApp is still commented out.

Read the corpus size via cqi_corpus.size instead of attrs['size'].
---
 .../cqi_over_socketio/cqi_corpora_corpus.py   | 77 +++++++++++++++++--
 app/static/js/CorpusAnalysis/CQiClient.js     | 14 ++++
 .../js/CorpusAnalysis/CorpusAnalysisApp.js    |  9 +++
 3 files changed, 95 insertions(+), 5 deletions(-)

diff --git a/app/corpora/cqi_over_socketio/cqi_corpora_corpus.py b/app/corpora/cqi_over_socketio/cqi_corpora_corpus.py
index 9a976dd7..79b1a800 100644
--- a/app/corpora/cqi_over_socketio/cqi_corpora_corpus.py
+++ b/app/corpora/cqi_over_socketio/cqi_corpora_corpus.py
@@ -1,6 +1,8 @@
+from collections import Counter
 from flask import session
 import cqi
 import math
+import random
 from app import db, socketio
 from app.decorators import socketio_login_required
 from app.models import Corpus
@@ -38,10 +40,75 @@ def cqi_corpora_corpus_query(cqi_client: cqi.CQiClient, corpus_name: str, subcor
 @cqi_over_socketio
 def cqi_corpora_corpus_update_db(cqi_client: cqi.CQiClient, corpus_name: str):
     corpus = Corpus.query.get(session['d']['corpus_id'])
-    corpus.num_tokens = cqi_client.corpora.get(corpus_name).attrs['size']
+    cqi_corpus = cqi_client.corpora.get(corpus_name)
+    corpus.num_tokens = cqi_corpus.size
     db.session.commit()
 
 
+@socketio.on('cqi.corpora.corpus.get_visualization_data', namespace=ns)
+@socketio_login_required
+@cqi_over_socketio
+def cqi_corpora_corpus_get_visualization_data(cqi_client: cqi.CQiClient, corpus_name: str):
+    """Collect corpus-wide and per-text statistics as the response payload."""
+    cqi_corpus = cqi_client.corpora.get(corpus_name)
+    payload = {}
+    payload['num_tokens'] = cqi_corpus.size
+    cqi_word_attr = cqi_corpus.positional_attributes.get('word')
+    payload['num_unique_words'] = cqi_word_attr.lexicon_size
+    payload['word_freqs'] = dict(zip(cqi_word_attr.values_by_ids(list(range(0, cqi_word_attr.lexicon_size))), cqi_word_attr.freqs_by_ids(list(range(0, cqi_word_attr.lexicon_size)))))
+    # payload['word_freqs'].sort(key=lambda a: a[1], reverse=True)
+    # payload['word_freqs'] = {k: v for k, v in payload['word_freqs']}
+    cqi_lemma_attr = cqi_corpus.positional_attributes.get('lemma')
+    payload['num_unique_lemmas'] = cqi_lemma_attr.lexicon_size
+    payload['lemma_freqs'] = dict(zip(cqi_lemma_attr.values_by_ids(list(range(0, cqi_lemma_attr.lexicon_size))), cqi_lemma_attr.freqs_by_ids(list(range(0, cqi_lemma_attr.lexicon_size)))))
+    # payload['lemma_freqs'].sort(key=lambda a: a[1], reverse=True)
+    # payload['lemma_freqs'] = {k: v for k, v in payload['lemma_freqs']}
+    cqi_s_attr = cqi_corpus.structural_attributes.get('s')
+    payload['num_sentences'] = cqi_s_attr.size
+    # assuming all tokens are in a sentence
+    payload['average_sentence_length'] = payload['num_tokens'] / payload['num_sentences'] if payload['num_sentences'] != 0 else 0
+    # payload['average_sentence_length'] = 0
+    # for s_id in range(0, cqi_s_attr.size):
+    #     s_lbound, s_rbound = cqi_s_attr.cpos_by_id(s_id)
+    #     payload['average_sentence_length'] += s_rbound - s_lbound + 1
+    # payload['average_sentence_length'] /= payload['num_sentences']
+    cqi_ent_type_attr = cqi_corpus.structural_attributes.get('ent_type')
+    payload['num_ent_types'] = cqi_ent_type_attr.size
+    payload['ent_type_freqs'] = dict(Counter(cqi_ent_type_attr.values_by_ids(list(range(0, cqi_ent_type_attr.size)))))
+    payload['num_unique_ent_types'] = len(payload['ent_type_freqs'])
+    payload['texts'] = []
+    cqi_text_attr = cqi_corpus.structural_attributes.get('text')
+    for text_id in range(0, cqi_text_attr.size):
+        text_lbound, text_rbound = cqi_text_attr.cpos_by_id(text_id)
+        text_cpos_list = list(range(text_lbound, text_rbound + 1))
+        text_payload = {}
+        text_payload['num_tokens'] = text_rbound - text_lbound + 1
+        text_word_ids = cqi_word_attr.ids_by_cpos(text_cpos_list)
+        text_payload['num_unique_words'] = len(set(text_word_ids))
+        text_payload['word_freqs'] = dict(Counter(cqi_word_attr.values_by_ids(text_word_ids)))
+        text_lemma_ids = cqi_lemma_attr.ids_by_cpos(text_cpos_list)
+        text_payload['num_unique_lemmas'] = len(set(text_lemma_ids))
+        text_payload['lemma_freqs'] = dict(Counter(cqi_lemma_attr.values_by_ids(text_lemma_ids)))
+        text_s_attr_ids = list(filter(lambda x: x != -1, cqi_s_attr.ids_by_cpos(text_cpos_list)))
+        text_payload['num_sentences'] = len(set(text_s_attr_ids))
+        # assuming all tokens are in a sentence
+        text_payload['average_sentence_length'] = text_payload['num_tokens'] / text_payload['num_sentences'] if text_payload['num_sentences'] != 0 else 0
+        # text_payload['average_sentence_length'] = 0
+        # for text_s_id in range(0, cqi_s_attr.size):
+        #     text_s_lbound, text_s_rbound = cqi_s_attr.cpos_by_id(text_s_id)
+        #     text_payload['average_sentence_length'] += text_s_rbound - text_s_lbound + 1
+        # text_payload['average_sentence_length'] /= text_payload['num_sentences']
+        text_ent_type_ids = list(filter(lambda x: x != -1, cqi_ent_type_attr.ids_by_cpos(text_cpos_list)))
+        text_payload['num_ent_types'] = len(set(text_ent_type_ids))
+        text_payload['ent_type_freqs'] = dict(Counter(cqi_ent_type_attr.values_by_ids(text_ent_type_ids)))
+        text_payload['num_unique_ent_types'] = len(text_payload['ent_type_freqs'])
+        for text_sub_attr in cqi_corpus.structural_attributes.list(filters={'part_of': cqi_text_attr}):
+            text_payload[text_sub_attr.name[(len(cqi_text_attr.name) + 1):]] = text_sub_attr.values_by_ids([text_id])[0]
+        payload['texts'].append(text_payload)
+    # print(payload)
+    return {'code': 200, 'msg': 'OK', 'payload': payload}
+
+
 @socketio.on('cqi.corpora.corpus.paginate', namespace=ns)
 @socketio_login_required
 @cqi_over_socketio
@@ -52,13 +119,13 @@ def cqi_corpora_corpus_paginate(cqi_client: cqi.CQiClient, corpus_name: str, pag
         per_page < 1
         or page < 1
         or (
-            cqi_corpus.attrs['size'] > 0
-            and page > math.ceil(cqi_corpus.attrs['size'] / per_page)
+            cqi_corpus.size > 0
+            and page > math.ceil(cqi_corpus.size / per_page)
         )
     ):
         return {'code': 416, 'msg': 'Range Not Satisfiable'}
     first_cpos = (page - 1) * per_page
-    last_cpos = min(cqi_corpus.attrs['size'], first_cpos + per_page)
+    last_cpos = min(cqi_corpus.size, first_cpos + per_page)
     cpos_list = [*range(first_cpos, last_cpos)]
     lookups = lookups_by_cpos(cqi_corpus, cpos_list)
     payload = {}
@@ -67,7 +134,7 @@ def cqi_corpora_corpus_paginate(cqi_client: cqi.CQiClient, corpus_name: str, pag
     # the lookups for the items
     payload['lookups'] = lookups
     # the total number of items matching the query
-    payload['total'] = cqi_corpus.attrs['size']
+    payload['total'] = cqi_corpus.size
     # the number of items to be displayed on a page.
     payload['per_page'] = per_page
     # The total number of pages
diff --git a/app/static/js/CorpusAnalysis/CQiClient.js b/app/static/js/CorpusAnalysis/CQiClient.js
index fcc0c87d..5baa59e8 100644
--- a/app/static/js/CorpusAnalysis/CQiClient.js
+++ b/app/static/js/CorpusAnalysis/CQiClient.js
@@ -98,6 +98,20 @@ class CQiCorpus {
     this.subcorpora = new CQiSubcorpusCollection(this.socket, this);
   }
 
+  getVisualizationData() {
+    return new Promise((resolve, reject) => {
+      const args = {corpus_name: this.name};
+
+      this.socket.emit('cqi.corpora.corpus.get_visualization_data', args, response => {
+        if (response.code === 200) {
+          resolve(response.payload);
+        } else {
+          reject(response);
+        }
+      });
+    });
+  }
+
   getCorpusData() {
     return new Promise((resolve, reject) => {
       const dummyData = {
diff --git a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js
index cb012730..c955117b 100644
--- a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js
+++ b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js
@@ -34,6 +34,15 @@ class CorpusAnalysisApp {
       .then(
         cQiCorpus => {
           this.data.corpus = {o: cQiCorpus};
+          // this.data.corpus.o.getVisualizationData()
+          //   .then(
+          //     (visualizationData) => {
+          //       console.log(visualizationData);
+          //       this.renderGeneralCorpusInfo(visualizationData);
+          //       this.renderTextInfoList(visualizationData);
+          //       this.renderTextProportionsGraphic(visualizationData);
+          //     }
+          //   );
           this.data.corpus.o.getCorpusData()
             .then(corpusData => {
               this.renderGeneralCorpusInfo(corpusData);