mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-04 04:12:45 +00:00 
			
		
		
		
	Implementation of visdata v2
This commit is contained in:
		@@ -49,62 +49,131 @@ def cqi_corpora_corpus_update_db(cqi_client: cqi.CQiClient, corpus_name: str):
 | 
			
		||||
@socketio_login_required
@cqi_over_socketio
def cqi_corpora_corpus_get_visualization_data(cqi_client: cqi.CQiClient, corpus_name: str):
    """Collect the statistics used to visualize a corpus.

    The payload carries two representations side by side:

    * flat keys (``num_tokens``, ``word_freqs``, ``texts``, ...) whose
      frequency tables are keyed by the attribute *strings*;
    * nested ``corpus`` / ``text`` / ``s`` / ``ent`` entries whose frequency
      tables are keyed by lexicon *ids*, plus ``lookups`` that map those ids
      back to strings.

    NOTE(review): the flat keys look like an earlier payload format that the
    nested one is meant to replace - confirm with the consumers before
    dropping either.

    Args:
        cqi_client: Connected CQi client used to query the corpus.
        corpus_name: Name of the corpus to inspect.

    Returns:
        ``{'code': 200, 'msg': 'OK', 'payload': payload}``.
    """
    corpus = cqi_client.corpora.get(corpus_name)
    text = corpus.structural_attributes.get('text')
    s = corpus.structural_attributes.get('s')
    ent = corpus.structural_attributes.get('ent')
    ent_type = corpus.structural_attributes.get('ent_type')
    p_attrs = {
        name: corpus.positional_attributes.get(name)
        for name in ('word', 'lemma', 'pos', 'simple_pos')
    }

    # Fetch each lexicon (strings and frequencies) exactly once; every table
    # below is derived from these lists instead of re-querying the server.
    lexicon_values = {}
    lexicon_freqs = {}
    for name, p_attr in p_attrs.items():
        lexicon_ids = list(range(p_attr.lexicon_size))
        lexicon_values[name] = p_attr.values_by_ids(lexicon_ids)
        lexicon_freqs[name] = p_attr.freqs_by_ids(lexicon_ids)

    # One value per ``ent_type`` region, indexed by region id.
    ent_type_values = ent_type.values_by_ids(list(range(ent_type.size)))
    ent_type_freqs = dict(Counter(ent_type_values))

    # The sub attributes of ``text`` do not change between texts, so they are
    # listed once instead of once per loop iteration.
    text_sub_attrs = list(corpus.structural_attributes.list(filters={'part_of': text}))
    text_prefix_len = len(text.name) + 1

    num_tokens = corpus.size
    num_sentences = s.size

    payload = {
        'num_tokens': num_tokens,
        'num_unique_words': p_attrs['word'].lexicon_size,
        'word_freqs': dict(zip(lexicon_values['word'], lexicon_freqs['word'])),
        'num_unique_lemmas': p_attrs['lemma'].lexicon_size,
        'lemma_freqs': dict(zip(lexicon_values['lemma'], lexicon_freqs['lemma'])),
        'num_sentences': num_sentences,
        # Assumes every token belongs to a sentence.
        'average_sentence_length': num_tokens / num_sentences if num_sentences else 0,
        'num_ent_types': ent_type.size,
        'ent_type_freqs': ent_type_freqs,
        'num_unique_ent_types': len(ent_type_freqs),
        'texts': [],
    }

    text_lexicon = {}
    for text_id in range(text.size):
        text_lbound, text_rbound = text.cpos_by_id(text_id)
        text_cpos_list = list(range(text_lbound, text_rbound + 1))
        text_num_tokens = text_rbound - text_lbound + 1

        # Lexicon id -> number of occurrences inside this text.
        id_freqs = {
            name: Counter(p_attr.ids_by_cpos(text_cpos_list))
            for name, p_attr in p_attrs.items()
        }
        text_s_ids = _covering_region_ids(s, text_cpos_list)
        text_ent_ids = _covering_region_ids(ent, text_cpos_list)
        text_ent_type_ids = _covering_region_ids(ent_type, text_cpos_list)
        # Count every entity region once, matching the corpus-level table
        # (a multi-token entity must not be counted once per token).
        text_ent_type_freqs = dict(
            Counter(ent_type_values[region_id] for region_id in sorted(text_ent_type_ids))
        )
        text_num_sentences = len(text_s_ids)

        text_payload = {
            'num_tokens': text_num_tokens,
            'num_unique_words': len(id_freqs['word']),
            'word_freqs': {
                lexicon_values['word'][word_id]: freq
                for word_id, freq in id_freqs['word'].items()
            },
            'num_unique_lemmas': len(id_freqs['lemma']),
            # Lemma ids are resolved against the lemma lexicon, not the word
            # lexicon.
            'lemma_freqs': {
                lexicon_values['lemma'][lemma_id]: freq
                for lemma_id, freq in id_freqs['lemma'].items()
            },
            'num_sentences': text_num_sentences,
            # Assumes every token belongs to a sentence.
            'average_sentence_length': (
                text_num_tokens / text_num_sentences if text_num_sentences else 0
            ),
            'num_ent_types': len(text_ent_type_ids),
            'ent_type_freqs': text_ent_type_freqs,
            'num_unique_ent_types': len(text_ent_type_freqs),
        }
        # Metadata stored in the sub attributes of ``text``; the key is the
        # attribute name with the leading "<text name>_" stripped. Written
        # last so it keeps overriding a statistic of the same name.
        for text_sub_attr in text_sub_attrs:
            key = text_sub_attr.name[text_prefix_len:]
            text_payload[key] = text_sub_attr.values_by_ids([text_id])[0]
        payload['texts'].append(text_payload)

        text_lexicon[text_id] = {
            'bounds': [text_lbound, text_rbound],
            'counts': {
                # Distinct regions, consistent with ``s.size`` / ``ent.size``
                # reported for the whole corpus.
                's': len(text_s_ids),
                'ent': len(text_ent_ids),
                'token': text_num_tokens
            },
            'freqs': {name: dict(freqs) for name, freqs in id_freqs.items()}
        }

    payload['corpus'] = {
        'lexicon': {
            0: {
                'bounds': [0, num_tokens - 1],
                'counts': {
                    'text': text.size,
                    's': num_sentences,
                    'ent': ent.size,
                    'token': num_tokens
                },
                'freqs': {
                    name: dict(enumerate(freqs))
                    for name, freqs in lexicon_freqs.items()
                }
            }
        },
        'values': []
    }
    payload['text'] = {
        'lexicon': text_lexicon,
        'values': [sub_attr.name[text_prefix_len:] for sub_attr in text_sub_attrs]
    }
    # Sentence and entity entries are placeholders for now: one empty dict
    # per region id.
    payload['s'] = {
        'lexicon': {s_id: {} for s_id in range(num_sentences)},
        'values': _sub_attr_names(corpus, s)
    }
    payload['ent'] = {
        'lexicon': {ent_id: {} for ent_id in range(ent.size)},
        'values': _sub_attr_names(corpus, ent)
    }
    payload['lookups'] = {
        'corpus': {},
        'text': {},
        's': {},
        'ent': {},
        **{name: dict(enumerate(values)) for name, values in lexicon_values.items()}
    }
    return {'code': 200, 'msg': 'OK', 'payload': payload}


def _covering_region_ids(s_attr, cpos_list):
    """Return the distinct region ids *s_attr* reports for *cpos_list*.

    Entries equal to -1 are dropped; the callers treat them as positions that
    are not inside any region of the attribute.
    """
    return {region_id for region_id in s_attr.ids_by_cpos(cpos_list) if region_id != -1}


def _sub_attr_names(corpus, s_attr):
    """Return the names of the sub attributes of *s_attr* without its prefix."""
    prefix_len = len(s_attr.name) + 1
    return [
        sub_attr.name[prefix_len:]
        for sub_attr in corpus.structural_attributes.list(filters={'part_of': s_attr})
    ]
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user