From ffed8592c87350185847d077a785ede454eb187f Mon Sep 17 00:00:00 2001 From: Stephan Porada Date: Mon, 11 Nov 2019 15:35:37 +0100 Subject: [PATCH] Add new CQiWrapper --- app/corpora/CQiWrapper/CQiWrapper.py | 164 +++++++++++++++------------ app/corpora/events.py | 8 +- 2 files changed, 97 insertions(+), 75 deletions(-) diff --git a/app/corpora/CQiWrapper/CQiWrapper.py b/app/corpora/CQiWrapper/CQiWrapper.py index 4739a655..dbe425e7 100644 --- a/app/corpora/CQiWrapper/CQiWrapper.py +++ b/app/corpora/CQiWrapper/CQiWrapper.py @@ -1,7 +1,6 @@ from .CQiClient import CQiClient import multiprocessing import collections -import socket class CQiWrapper(CQiClient): @@ -33,21 +32,41 @@ class CQiWrapper(CQiClient): """ self.ctrl_connect(self.username, self.password) - def create_attribute_strings(self, corpus_name): - self.word_str = corpus_name + '.word' - self.lemma_str = corpus_name + '.lemma' - self.pos_str = corpus_name + '.pos' - self.sem_str = corpus_name + '.sem' - self.entry_str = corpus_name + '.entry' - self.entry_author_str = self.entry_str + '_author' - self.entry_title_str = self.entry_str + '_title' - self.attributes = [self.word_str, - self.lemma_str, - self.pos_str, - self.sem_str, - self.entry_str, - self.entry_author_str, - self.entry_title_str] + def create_attribute_strings(self): + p_attrs = self.corpus_positional_attributes(self.corpus_name) + struct_attrs = self.corpus_structural_attributes(self.corpus_name) + self.meta_struct_element = struct_attrs[0] + print(p_attrs) + print(struct_attrs) + self.attr_strings = {} + self.attr_strings['positional_attrs'] = {} + self.attr_strings['struct_attrs'] = {} + for p_attr in p_attrs: + self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name + + '.' + + p_attr) + for struct_attr in struct_attrs[:-1]: + self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name + + '.' + + struct_attr) + # self.word_str = corpus_name + '.word' + # self.lemma_str = corpus_name + '.lemma' + # self.pos_str = corpus_name + '.pos' + # self.sem_str = corpus_name + '.sem' + # self.entry_str = corpus_name + '.entry' + # self.entry_author_str = self.entry_str + '_author' + # self.entry_title_str = self.entry_str + '_title' + # self.attributes = [self.word_str, + # self.lemma_str, + # self.pos_str, + # self.sem_str, + # self.entry_str, + # self.entry_author_str, + # self.entry_title_str] + # print(self.attributes) + + def set_corpus_name(self, corpus_name): + self.corpus_name = corpus_name def disconnect(self): """ @@ -58,7 +77,7 @@ class CQiWrapper(CQiClient): self.ctrl_bye() self.connection.close() - def query_subcorpus(self, corpus_name, result_subcorpus_name, query): + def query_subcorpus(self, result_subcorpus_name, query): """ Create subcorpus @@ -66,13 +85,12 @@ class CQiWrapper(CQiClient): positions for that query. Keyword arguments: - corpus_name -- name of the corpus the query will be used on result_subcorpus_name -- user set name of the subcorpus which holds all cpos match positions, produced by the query query -- query written in cqp query language """ - self.cqp_query(corpus_name, result_subcorpus_name, query) - self.result_subcorpus_ns = (corpus_name + self.cqp_query(self.corpus_name, result_subcorpus_name, query) + self.result_subcorpus_ns = (self.corpus_name + ':' + result_subcorpus_name) self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns) @@ -80,11 +98,9 @@ class CQiWrapper(CQiClient): print('Nr of all matches is:', self.nr_matches) def show_subcorpora(self): - print('Known subcorpora:', self.SUBCORPUS_NAMES) - return self.SUBCORPUS_NAMES + return self.cqp_list_subcorpora(self.corpus_name) def show_results(self, - corpus_name, result_start_count=0, result_max_count=50, context_len=10,): @@ -116,7 +132,6 @@ class CQiWrapper(CQiClient): ]) Keyword arguments: - corpus_name -- name of the parent corpus the subcorpus is part of result_start_count -- start position of the dumped subcorpus. (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50 matches 50 to 100 will be shown. @@ -126,8 +141,7 @@ class CQiWrapper(CQiClient): shown (default 10) """ self.context_len = context_len - word_str = corpus_name + '.word' - self.corpus_max_len = self.cl_attribute_size(word_str) + self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word']) if self.nr_matches == 0: print('Query resulted in 0 matches.') else: @@ -157,7 +171,7 @@ class CQiWrapper(CQiClient): match = multiprocessing.Process(target=self.__get_matches, args=(i, index_pair, - corpus_name, + self.corpus_name, return_dict)) matches.append(match) match.start() @@ -167,7 +181,25 @@ class CQiWrapper(CQiClient): ordered_results = collections.OrderedDict() for key in sorted(return_dict.keys()): ordered_results[key] = return_dict[key] - print('ORDERED_RESULTS', ordered_results) + return ordered_results + + def get_cpos_info(self, cpos, session): + match_dict = {} + for attr_dict in self.attr_strings: + # print(self.attr_strings[attr_dict]) + if attr_dict == 'positional_attrs': + for p_attr_key in self.attr_strings[attr_dict].keys(): + # print(p_attr_key) + match_str = session.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], range(cpos[0], cpos[1])) + match_dict[p_attr_key] = match_str + elif attr_dict == 'struct_attrs': + for struct_attr_key in self.attr_strings[attr_dict].keys(): + # print(struct_attr_key) + struct_entry = session.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element], + range(cpos[0], cpos[1])) + match_str = session.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry) + match_dict[struct_attr_key] = set(match_str) + return match_dict def __get_matches(self, i, index_pair, corpus_name, return_dict): """ @@ -183,58 +215,46 @@ class CQiWrapper(CQiClient): return_dict -- dictionary created with manager.dict() that holds the extracted strings tags etc. """ - print('START:', index_pair[0]) - print('END:', index_pair[1]) - print('=============================') + # print('START:', index_pair[0]) + # print('END:', index_pair[1]) + # print('=============================') + index_pair = [index_pair[0], index_pair[1] + 1] tmp_session = CQiWrapper(username=self.username, password=self.password, host=self.host, port=self.port) tmp_session.connect() - tokens = tmp_session.cl_cpos2str(self.word_str, - range(index_pair[0], - index_pair[1] + 1)) - lemmas = tmp_session.cl_cpos2str(self.lemma_str, - range(index_pair[0], - index_pair[1] + 1)) - pos_tags = tmp_session.cl_cpos2str(self.pos_str, - range(index_pair[0], - index_pair[1] + 1)) - sem_tags = tmp_session.cl_cpos2str(self.sem_str, - range(index_pair[0], - index_pair[1] + 1)) - struc_entry = tmp_session.cl_cpos2struc(self.entry_str, - range(index_pair[0], - index_pair[1] + 1)) + match = self.get_cpos_info(index_pair, tmp_session) + # tokens = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'], + # range(index_pair[0], + # index_pair[1] + 1)) + # lemmas = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['lemma'], + # range(index_pair[0], + # index_pair[1] + 1)) + # pos_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['pos'], + # range(index_pair[0], + # index_pair[1] + 1)) + # sem_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['sem'], + # range(index_pair[0], + # index_pair[1] + 1)) + # struc_entry = tmp_session.cl_cpos2struc(self.attr_strings['struct_attrs']['entry'], + # range(index_pair[0], + # index_pair[1] + 1)) before_index = max([0, index_pair[0] - self.context_len]) after_index = min([self.corpus_max_len, index_pair[1] + self.context_len]) - context_before = tmp_session.cl_cpos2str(self.word_str, + context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'], range(before_index, index_pair[0])) - context_after = tmp_session.cl_cpos2str(self.word_str, + context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'], range(index_pair[1] + 1, after_index + 1)) - entry_titles = tmp_session.cl_struc2str(self.entry_title_str, - struc_entry) - entry_authors = tmp_session.cl_struc2str(self.entry_author_str, - struc_entry) - return_dict[i] = {'tokens': tokens, - 'lemmas': lemmas, - 'pos_tags': pos_tags, - 'sem_tags': sem_tags, - 'context_before': context_before, - 'context_after': context_after, - 'entry_title': entry_titles[0], - 'entry_author': entry_authors[0], - 'cpos_start': index_pair[0], - 'cpos_end': index_pair[1]} + # entry_titles = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_title'], + # struc_entry) + # entry_authors = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_author'], + # struc_entry) + tmp_dict = {'context_before': context_before, + 'context_after': context_after, + 'cpos_start': index_pair[0], + 'cpos_end': index_pair[1]} + match.update(tmp_dict) + return_dict[i] = match tmp_session.disconnect() - - def get_cpos_info(self, cpos): - match_dict = collections.OrderedDict() - for attribute in self.attributes: - if '.entry' not in attribute: - match_str = self.cl_cpos2str(attribute, range(cpos[0], cpos[1])) - match_dict[attribute] = match_str - else: - continue - print(match_dict) diff --git a/app/corpora/events.py b/app/corpora/events.py index 601cd654..e84b2847 100644 --- a/app/corpora/events.py +++ b/app/corpora/events.py @@ -38,9 +38,11 @@ def recv_query(message): corpus_name = 'CORPUS' result_subcorpus_name = 'Query-results' # should be set by the user somehow query = message['query'] - analysis_client.create_attribute_strings(corpus_name) - analysis_client.query_subcorpus(corpus_name, result_subcorpus_name, query) - analysis_client.show_results(corpus_name) + analysis_client.set_corpus_name(corpus_name) + analysis_client.create_attribute_strings() + analysis_client.query_subcorpus(result_subcorpus_name, query) + results = analysis_client.show_results() + logger.warning('Query results: {}'.format(str(results))) def observe_corpus_analysis_connection(app, corpus_id, session_id):