From dbd580b3c037645560bd7c9bccb932a3280d85df Mon Sep 17 00:00:00 2001 From: Stephan Porada Date: Thu, 28 Nov 2019 14:14:56 +0100 Subject: [PATCH] Get results with wrapper 3.0 --- app/corpora/CQiWrapper/CQiWrapper.py | 122 ++++++++++--------- app/corpora/events.py | 47 +------ app/templates/corpora/analyse_corpus.html.j2 | 9 +- 3 files changed, 76 insertions(+), 102 deletions(-) diff --git a/app/corpora/CQiWrapper/CQiWrapper.py b/app/corpora/CQiWrapper/CQiWrapper.py index d8b133ac..ea0acff9 100644 --- a/app/corpora/CQiWrapper/CQiWrapper.py +++ b/app/corpora/CQiWrapper/CQiWrapper.py @@ -1,6 +1,6 @@ from .CQiClient import CQiClient from .CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND -import re +import time from app import logger # only works if imported into opaque web app @@ -94,6 +94,7 @@ class CQiWrapper(CQiClient): + result_subcorpus_name) self.SUBCORPUS_NAMES.append(self.result_subcorpus) self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus) + print('Nr of all matches is:', self.nr_matches) # logger.warning('Nr of all matches is: {}'.format(self.nr_matches)) def show_subcorpora(self): @@ -104,7 +105,8 @@ class CQiWrapper(CQiClient): def show_query_results(self, context_len=10, - result_len=1000): + result_len=1000, + result_offset=0): """ Show query results @@ -131,14 +133,16 @@ class CQiWrapper(CQiClient): # match_boundries shows the start and end cpos of one match as a # pair of cpositions # [(1355, 1357), (1477, 1479)] Example for two boundry pairs + offset_start = 0 + (result_offset + 1) if result_offset != 0 else result_offset + offset_end = self.nr_matches + result_offset match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus, CONST_FIELD_MATCH, - 0, - self.nr_matches - 1), + offset_start, + offset_end), self.cqp_dump_subcorpus(self.result_subcorpus, CONST_FIELD_MATCHEND, - 0, - self.nr_matches - 1)) + offset_start, + offset_end)) # Generate all cpos between match boundries including start and end boundries. # Also generate cpos for left and right context. 
@@ -152,7 +156,7 @@ class CQiWrapper(CQiClient): lc = {'lc': lc_cpos} match_cpos = list(range(start, end + 1)) match = {'hit': match_cpos} - rc_cpos = list(range(end + 1, min([self.corpus_max_len, end + self.context_len + 1]))) + rc_cpos = list(range(end, min([self.corpus_max_len, end + self.context_len]))) rc = {'rc': rc_cpos} lc.update(match) lc.update(rc) @@ -161,81 +165,87 @@ class CQiWrapper(CQiClient): # print(all_matches) # print(all_cpos) - # Get all sentences IDs for all above collected cpos in all_cpos - s_ids = self.cl_cpos2struc('CORPUS.s', all_cpos) # CHANGE to CORPUS.s will always be like this in nopaque # Get all cpos for all sneteces boundries - s_lookup = {} - for s_id in set(s_ids): - s_start, s_end = self.cl_struc2cpos('CORPUS.s', s_id) # CHANGE to CORPUS.s will always be like this in nopaque - # print(s_start, s_end) - s_cpos = range(s_start, s_end) - s_lookup.update({s_id: list(s_cpos)}) - # print(list(s_cpos)) - all_cpos.extend(s_cpos) + # s_lookup = {} + # for s_id in set(s_ids): + # s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id) + # # CHANGE to UTOPIEN.s will always be like this in nopaque + # s_cpos = range(s_start, s_end) + # s_lookup.update({s_id: list(s_cpos)}) + # # print(list(s_cpos)) + # all_cpos.extend(s_cpos) + t0 = time.time() all_cpos = list(set(all_cpos)) # get rid of cpos duplicates + t1 = time.time() + t_total = t1 - t0 + print('TIME FOR ALL CPOS:', t_total) + print('CPOS SUM:', len(all_cpos)) # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for # all cpos entries in all_cpos_list # Also saves these informations into self.results dict + t6 = time.time() all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos) + t7 = time.time() + t_final = t7 - t6 + print('GOT ALL RESULTS IN:', t_final) self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos, - 's_lookup': s_lookup, 'text_lookup': text_lookup} + 'text_lookup': text_lookup} return self.results - # print(self.results) def get_cpos_infos(self, all_cpos): ''' Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for all cpos entries specified in the parameter all_cpos. ''' + # Get all positional attribute informations cpos_infos = {} for p_attr_key in self.attr_strings['positional_attrs'].keys(): match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos) cpos_infos[p_attr_key] = match_strs - tmp_s_info = [] - tmp_text_info = [] - text_lookup = {} - tmp_dict = {} + # Get all strucutural attribute informations + tmp_info = {} + structs_to_check = [] for struct_attr_key in self.attr_strings['struct_attrs'].keys(): - check = self.attr_strings['struct_attrs'][struct_attr_key] - if check == 'CORPUS.s': - struct_ids = self.cl_cpos2struc(check, all_cpos) + key = self.attr_strings['struct_attrs'][struct_attr_key] + has_value = self.corpus_structural_attribute_has_values(key) + struct_ids = self.cl_cpos2struc(key, all_cpos) + if has_value is False: # Get IDs of strucutural elements without values (this means get IDs of XML tags. 
Struct elements only have values if they are XML attributes) + tmp_info[struct_attr_key] = [] for id in struct_ids: - tmp_s_info.append({struct_attr_key: id}) - elif check == 'CORPUS.text': - struct_ids = self.cl_cpos2struc(check, all_cpos) - for id in struct_ids: - tmp_text_info.append({struct_attr_key: id}) + tmp_info[struct_attr_key].append(id) else: - struct_ids = struct_ids = self.cl_cpos2struc(check, all_cpos) - struct_values = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_ids) - for value in struct_values: - for id in struct_ids: - tmp_dict.update({id: {struct_attr_key: value}}) - print(tmp_dict) - print(text_lookup) + structs_to_check.append({key: struct_attr_key}) + struct_attr_values = list(tmp_info.values()) + struct_attr_keys = list(tmp_info.keys()) - # struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][struct_attr_key], all_cpos) - # has_value = self.corpus_structural_attribute_has_values(self.attr_strings['struct_attrs'][struct_attr_key]) - # if has_value: - # match_strs = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_entry) - # elif self.attr_strings['struct_attrs'][struct_attr_key] == 'CORPUS.s': - # pass - # else: - # match_strs = [None for i in struct_entry] - # cpos_infos[struct_attr_key] = zip(struct_entry, match_strs) - tmp_list = [] - attr_key_list = [] + # Build textlookup dictionary + text_lookup_ids = list(set(struct_attr_values[0])) # First is always one text + text_lookup = {} + for d in structs_to_check: + s_key, s_value = zip(*d.items()) + s_value = s_value[0].split('_')[1] + struct_values = self.cl_struc2str(s_key[0], text_lookup_ids) + zipped = dict(zip(text_lookup_ids, struct_values)) + for zip_key, zip_value in zipped.items(): + check = text_lookup.get(zip_key) + if check is None: + text_lookup[zip_key] = {s_value: zip_value} + else: + text_lookup[zip_key].update({s_value: zip_value}) + + # zip keys and values together + attr_values_list = [] + attr_keys_list = [] for key in cpos_infos.keys(): - tmp_list.append(cpos_infos[key]) - attr_key_list.append(key) - joined_cpos_infos = zip(all_cpos, *tmp_list) + attr_values_list.append(cpos_infos[key]) + attr_keys_list.append(key) + attr_keys_list.extend(struct_attr_keys) + attr_values_list.extend(struct_attr_values) + joined_cpos_infos = zip(all_cpos, *attr_values_list) dict_cpos_infos = {} for info in joined_cpos_infos: - dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:])) - for key, s_id, text_id in zip(dict_cpos_infos.keys(), tmp_s_info, tmp_text_info): - dict_cpos_infos[key].update(s_id) - dict_cpos_infos[key].update(text_id) + dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:])) return dict_cpos_infos, text_lookup diff --git a/app/corpora/events.py b/app/corpora/events.py index 265285b5..8d1580c3 100644 --- a/app/corpora/events.py +++ b/app/corpora/events.py @@ -4,10 +4,6 @@ from app.models import Corpus from flask import current_app, request from flask_login import current_user, login_required from .CQiWrapper.CQiWrapper import CQiWrapper -import sys -import gzip -import zlib -import json ''' ' A dictionary containing lists of, with corpus ids associated, Socket.IO @@ -47,46 +43,13 @@ def corpus_analysis(message): room=request.sid) return """ Prepare and execute a query """ - corpus = 'CORPUS' + corpus_name = 'CORPUS' query = (message['query']) - query_subcorpus = 'Results' - client.cqp_query(corpus, query_subcorpus, query) + client.select_corpus(corpus_name) + client.query_subcorpus(query) + results = 
client.show_query_results(result_len=int(message['hits_per_page']), context_len=int(message['context'])) - data = {'matches': [], 'cpos_lookup': {}, 'text_loopup': {}} - - """ Evaluate query results """ - match_corpus = '{}:{}'.format(corpus, query_subcorpus) - match_num = min(int(message['hits_per_page']), client.cqp_subcorpus_size(match_corpus)) - match_boundaries = zip(client.cqp_dump_subcorpus(match_corpus, - 0x10, - 0, match_num - 1), - client.cqp_dump_subcorpus(match_corpus, - 0x11, - 0, match_num - 1)) - context = 15 - corpus_len = 10000 - for match_start, match_end in match_boundaries: - data['matches'].append({'lc': list(range(max(0, match_start - int(message['context'])), match_start)), - 'hit': list(range(match_start, match_end + 1)), - 'rc': list(range(match_end + 1, min(corpus_len, match_end + 1 + int(message['context']))))}) - cpos_list = [] - for match in data['matches']: - cpos_list += match['lc'] + match['hit'] + match['rc'] - cpos_list = list(set(cpos_list)) - lemma_list = client.cl_cpos2str('{}.lemma'.format(corpus), cpos_list) - pos_list = client.cl_cpos2str('{}.pos'.format(corpus), cpos_list) - simple_pos_list = client.cl_cpos2str('{}.simple_pos'.format(corpus), cpos_list) - s_id_list = client.cl_cpos2struc('{}.s'.format(corpus), cpos_list) - text_id_list = client.cl_cpos2struc('{}.text'.format(corpus), cpos_list) - word_list = client.cl_cpos2str('{}.word'.format(corpus), cpos_list) - for cpos, lemma, pos, simple_pos, s_id, text_id, word in zip(cpos_list, lemma_list, pos_list, simple_pos_list, s_id_list, text_id_list, word_list): - data['cpos_lookup'][cpos] = {'lemma': lemma, 'pos': pos, 'simple_pos': simple_pos, 's_id': s_id, 'text_id': text_id, 'word': word} - text_author_list = client.cl_struc2str('{}.text_author'.format(corpus), text_id_list) - text_publishing_year_list = client.cl_struc2str('{}.text_publishing_year'.format(corpus), text_id_list) - text_title_list = client.cl_struc2str('{}.text_title'.format(corpus), text_id_list) - for text_id, text_author, text_publishing_year, text_title in zip(text_id_list, text_author_list, text_publishing_year_list, text_title_list): - data['text_loopup'][text_id] = {'author': text_author, 'publishing_year': text_publishing_year, 'title': text_title} - socketio.emit('corpus_analysis', data, room=request.sid) + socketio.emit('corpus_analysis', results, room=request.sid) def corpus_analysis_session_handler(app, corpus_id, session_id): diff --git a/app/templates/corpora/analyse_corpus.html.j2 b/app/templates/corpora/analyse_corpus.html.j2 index e26f6e52..5db09786 100644 --- a/app/templates/corpora/analyse_corpus.html.j2 +++ b/app/templates/corpora/analyse_corpus.html.j2 @@ -182,6 +182,7 @@ }); socket.on("corpus_analysis", function(message) { + console.log(message); var matchElement; var matchTextTitlesElement; var matchLeftContextElement; @@ -234,7 +235,7 @@ matchHitElement.append(tokenElement); matchHitElement.append(document.createTextNode(" ")); tokenElements.push(tokenElement); - textTitles.add(result["text_loopup"][token["text_id"]]["title"]); + textTitles.add(result["text_lookup"][token["text"]]["title"]); } matchTextTitlesElement.innerText = [...textTitles].join(","); matchElement.append(matchHitElement); @@ -274,9 +275,9 @@ simple_pos: ${token["simple_pos"]} - Title: ${result["text_loopup"][token["text_id"]]["title"]}
- Author: ${result["text_loopup"][token["text_id"]]["title"]}<br>
- Publishing year: ${result["text_loopup"][token["text_id"]]["publishing_year"]}
+ Title: ${result["text_lookup"][token["text"]]["title"]}<br>
+ Author: ${result["text_lookup"][token["text"]]["title"]}<br>
+ Publishing year: ${result["text_lookup"][token["text"]]["publishing_year"]}
 `,
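
Usage note (illustration only, not part of the patch): the sketch below shows the query flow this change establishes, mirroring the new handler in app/corpora/events.py. It assumes an already connected CQiWrapper instance (client construction and connection are outside the hunks above) and only uses calls that appear in this diff: select_corpus(), query_subcorpus() and show_query_results() with result_len, context_len and result_offset. The default argument values are placeholders.

    def run_query(client, query, hits_per_page=30, context=10, offset=0):
        # client: an already connected CQiWrapper instance (setup not shown in this diff)
        client.select_corpus('CORPUS')   # corpus name is hard-coded in events.py for now
        client.query_subcorpus(query)    # runs the CQP query; the wrapper stores the match count
        results = client.show_query_results(result_len=hits_per_page,
                                             context_len=context,
                                             result_offset=offset)
        # results carries the keys 'matches', 'cpos_lookup' and 'text_lookup',
        # the payload events.py emits to the browser as the 'corpus_analysis' event.
        return results

In events.py the same three calls run with hits_per_page and context taken from the incoming Socket.IO message; because show_query_results() slices the already stored result subcorpus, paging through matches only needs a different result_offset rather than a new query.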