From 5fdd67ebf28a6c4d0a2de624e29da0c68b35a50d Mon Sep 17 00:00:00 2001 From: Stephan Porada Date: Mon, 18 Nov 2019 14:24:13 +0100 Subject: [PATCH] Add new CQiWrapper --- app/corpora/CQiWrapper/CQiClient.py | 2 +- app/corpora/CQiWrapper/CQiWrapper.py | 285 ++++++++++++++------------- 2 files changed, 153 insertions(+), 134 deletions(-) diff --git a/app/corpora/CQiWrapper/CQiClient.py b/app/corpora/CQiWrapper/CQiClient.py index 39a24c4c..ab01522c 100644 --- a/app/corpora/CQiWrapper/CQiClient.py +++ b/app/corpora/CQiWrapper/CQiClient.py @@ -1,4 +1,4 @@ -from . import CQi +import CQi import socket import struct diff --git a/app/corpora/CQiWrapper/CQiWrapper.py b/app/corpora/CQiWrapper/CQiWrapper.py index 154dc25c..be16171c 100644 --- a/app/corpora/CQiWrapper/CQiWrapper.py +++ b/app/corpora/CQiWrapper/CQiWrapper.py @@ -1,6 +1,7 @@ -from .CQiClient import CQiClient -import multiprocessing +from CQiClient import CQiClient +from CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND import collections +from app import logger # only works if imported into opaque web app class CQiWrapper(CQiClient): @@ -11,6 +12,8 @@ class CQiWrapper(CQiClient): for ease of use. Also structures recieved data into python dictionaries. Keyword arguments: + host -- host IP adress or hostname wher the cqp server is running + port -- port of the cqp server username -- username used to connect to the cqp server password -- password of the user to connect to the cqp server """ @@ -32,12 +35,15 @@ class CQiWrapper(CQiClient): """ self.ctrl_connect(self.username, self.password) - def create_attribute_strings(self): + def __create_attribute_strings(self): + """ + Creates all needed attribute strings to query for word, lemma etc. in + the given corpus. + For example: CORPUS_NAME.word to query words + """ p_attrs = self.corpus_positional_attributes(self.corpus_name) struct_attrs = self.corpus_structural_attributes(self.corpus_name) self.meta_struct_element = struct_attrs[0] - print(p_attrs) - print(struct_attrs) self.attr_strings = {} self.attr_strings['positional_attrs'] = {} self.attr_strings['struct_attrs'] = {} @@ -49,8 +55,17 @@ class CQiWrapper(CQiClient): self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name + '.' + struct_attr) - def set_corpus_name(self, corpus_name): - self.corpus_name = corpus_name + logger.warning(('All positional and ' + 'structural attributes: {}').format(self.attr_strings)) + + def select_corpus(self, corpus_name): + if corpus_name in self.corpus_list_coprora(): + self.corpus_name = corpus_name + self.__create_attribute_strings() + logger.warning('{} does exist.'.format(corpus_name)) + else: + self.disconnect() + logger.warning('{} does not exist.'.format(corpus_name)) def disconnect(self): """ @@ -60,8 +75,9 @@ class CQiWrapper(CQiClient): """ self.ctrl_bye() self.connection.close() + logger.warning('Disconnected from cqp server.') - def query_subcorpus(self, result_subcorpus_name, query): + def query_subcorpus(self, query, result_subcorpus_name='Query-results'): """ Create subcorpus @@ -74,152 +90,155 @@ class CQiWrapper(CQiClient): query -- query written in cqp query language """ self.cqp_query(self.corpus_name, result_subcorpus_name, query) - self.result_subcorpus_ns = (self.corpus_name - + ':' - + result_subcorpus_name) - self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns) - self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns) - print('Nr of all matches is:', self.nr_matches) + self.result_subcorpus = (self.corpus_name + + ':' + + result_subcorpus_name) + self.SUBCORPUS_NAMES.append(self.result_subcorpus) + self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus) + logger.warning('Nr of all matches is: {}'.format(self.nr_matches)) def show_subcorpora(self): + """ + Show all subcorpora currently saved by the cqp server. + """ return self.cqp_list_subcorpora(self.corpus_name) - def show_results(self, - result_start_count=0, - result_max_count=50, - context_len=10,): + def show_query_results(self, + context_len=10, + result_len=1000): """ Show query results Shows the actual matched strings produce by the query. Uses the cpos match indexes to grab those strings. saves them into an orderd - dictionary. Also saves coresponding tags, lemmas and context: - OrderedDict([ - (0, - { - 'tokens': ['Big', 'Brother', 'himself'], - 'lemmas': ['big', 'brother', 'himself'], - 'pos_tags': ['JJ', 'NN1', 'PPX1'], - 'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|', - '|Z8m|'], - 'context_before': ['figures', 'of', 'the', 'Party', ',', - 'almost', 'on', 'a', 'level', 'with'], - 'context_after': [',', 'and', 'then', 'had', 'engaged', - 'in', 'counter-revolu-', 'tionary', - 'activities', ','], - 'entry_title': '1984', 'entry_author': - 'george_orwell', - 'cpos_start': 110490, - 'cpos_end': 110492 - } - ) - ]) + dictionary. Also saves coresponding tags, lemmas and context. Gets those + informations using the corresponding cpos. Keyword arguments: - result_start_count -- start position of the dumped subcorpus. - (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50 - matches 50 to 100 will be shown. - result_max_count -- defines how many matches at once will be shown. - (default 50) context_len -- defines how many words before and after a match will be shown (default 10) + result_len -- defines how many results are actually grabbed """ self.context_len = context_len - self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word']) + self.corpus_max_len = self.cl_attribute_size( + self.attr_strings['positional_attrs']['word'] + ) + self.nr_matches = min(result_len, self.nr_matches) if self.nr_matches == 0: - print('Query resulted in 0 matches.') + logger.warning('Query resulted in 0 matches.') + self.disconnect + return None else: - if self.nr_matches <= 50: - matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns, - 0x10, - 0, - self.nr_matches - 1) - matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns, - 0x11, - 0, self.nr_matches - 1) - else: - matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns, - 0x10, - result_start_count, - result_max_count - 1) - matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns, - 0x11, - result_start_count, - result_max_count - 1) - match_indexes = zip(matches_start, matches_end) + # Get match cpos boundries + # match_boundries shows the start and end cpos of one match as a + # pair of cpositions + # [(1355, 1357), (1477, 1479)] Example for two boundry pairs + match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus, + CONST_FIELD_MATCH, + 0, + self.nr_matches - 1), + self.cqp_dump_subcorpus(self.result_subcorpus, + CONST_FIELD_MATCHEND, + 0, + self.nr_matches - 1)) - matches = [] - manager = multiprocessing.Manager() - return_dict = manager.dict() - for i, index_pair in enumerate(match_indexes): - match = multiprocessing.Process(target=self.__get_matches, - args=(i, - index_pair, - self.corpus_name, - return_dict)) - matches.append(match) - match.start() - for match in matches: - match.join() - # sort matches into ordered dict - ordered_results = collections.OrderedDict() - for key in sorted(return_dict.keys()): - ordered_results[key] = return_dict[key] - return ordered_results + # Generate all cpos between boundries including start and end boundries + # Save them as list into on match entry at serial number 'i' + ordered_matches = collections.OrderedDict() + for i, match_pair in enumerate(match_boundaries): + ordered_matches[i] = ({'match_cpos_list': + list(range(match_pair[0], + match_pair[1] + 1))}) + # Saves cpos form all match entries into one list + all_cpos_list = [] + for key in ordered_matches.keys(): + all_cpos_list += ordered_matches[key]['match_cpos_list'] - def get_cpos_info(self, cpos, session): - match_dict = {} + # Saves all cpos from before and after context into the list: + # all_context_cpos_list + all_context_cpos_list = [] + for key in ordered_matches.keys(): + cpos_list = ordered_matches[key]['match_cpos_list'] + before_index = max([0, cpos_list[0] - self.context_len]) + after_index = min([self.corpus_max_len, + cpos_list[-1] + self.context_len]) + ordered_matches[key]['context_before_cpos_list'] = list(range(before_index, + cpos_list[0])) + ordered_matches[key]['context_after_cpos_list'] = list(range(cpos_list[-1] + 1, + after_index + 1)) + all_context_cpos_list += ordered_matches[key]['context_before_cpos_list'] + all_context_cpos_list += ordered_matches[key]['context_after_cpos_list'] + # Combines all_cpos_list with all_context_cpos_list as a sorted set + all_cpos_list += all_context_cpos_list + all_cpos_list = sorted(list(set(all_cpos_list))) + + # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for + # all cpos entries in all_cpos_list + # Also saves these informations into the ordered_matches dict + all_cpos_infos = self.get_cpos_infos(all_cpos_list) + for key in ordered_matches.keys(): + # loops over cpos in cpos_list which holds all match cpos + # Replaces one cpos with the corresponding cpos information created + # by self.get_cpos_infos(all_cpos_list) + cpos_list = ordered_matches[key]['match_cpos_list'] + infos = [] + for cpos in cpos_list: + info = {cpos: all_cpos_infos.get(cpos)} + infos.append(info) + ordered_matches[key]['match_cpos_list'] = infos + try: + # loops over cpos in ordered_matches[key]['context_before_cpos_list'] + # which holds all cpos of the before context + # Replaces one cpos with the corresponding cpos information created + # by self.get_cpos_infos(all_cpos_list) + before_context_infos = [] + for context_before_cpos in ordered_matches[key]['context_before_cpos_list']: + before_context_info = {context_before_cpos: + all_cpos_infos.get(context_before_cpos)} + before_context_infos.append(before_context_info) + ordered_matches[key]['context_before_cpos_list'] = before_context_infos + except UnboundLocalError: + logger.warning('Context before cpos list is empty.') + try: + # loops over cpos in ordered_matches[key]['context_after_cpos_list'] + # which holds all cpos of the before context + # Replaces one cpos with the corresponding cpos information created + # by self.get_cpos_infos(all_cpos_list) + after_context_infos = [] + for context_after_cpos in ordered_matches[key]['context_after_cpos_list']: + after_context_info = {context_after_cpos: + all_cpos_infos.get(context_after_cpos)} + after_context_infos.append(after_context_info) + ordered_matches[key]['context_after_cpos_list'] = after_context_infos + except UnboundLocalError: + logger.warning('Context after cpos list is empty.') + return ordered_matches + + def get_cpos_infos(self, all_cpos): + ''' + Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for + all cpos entries specified in the parameter all_cpos. + ''' + cpos_infos = {} for attr_dict in self.attr_strings: - # print(self.attr_strings[attr_dict]) if attr_dict == 'positional_attrs': for p_attr_key in self.attr_strings[attr_dict].keys(): - # print(p_attr_key) - match_str = session.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], range(cpos[0], cpos[1])) - match_dict[p_attr_key] = match_str + match_str = self.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], + all_cpos) + cpos_infos[p_attr_key] = match_str elif attr_dict == 'struct_attrs': for struct_attr_key in self.attr_strings[attr_dict].keys(): - # print(struct_attr_key) - struct_entry = session.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element], - range(cpos[0], cpos[1])) - match_str = session.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry) - match_dict[struct_attr_key] = set(match_str) - return match_dict - - def __get_matches(self, i, index_pair, corpus_name, return_dict): - """ - Get matches as readable output - - Gets the actual match strings of cpos match indexes. Private helper - method used in show_results. - - Keyword arguments: - i -- serial number for match at given cpos - index_pair -- match start and match end cpos - corpus_name -- name of the parent corpus - return_dict -- dictionary created with manager.dict() that holds the - extracted strings tags etc. - """ - # print('START:', index_pair[0]) - # print('END:', index_pair[1]) - # print('=============================') - index_pair = [index_pair[0], index_pair[1] + 1] - tmp_session = CQiWrapper(username=self.username, password=self.password, - host=self.host, port=self.port) - tmp_session.connect() - match = self.get_cpos_info(index_pair, tmp_session) - before_index = max([0, index_pair[0] - self.context_len]) - after_index = min([self.corpus_max_len, - index_pair[1] + self.context_len]) - context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'], - range(before_index, - index_pair[0])) - context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'], - range(index_pair[1] + 1, - after_index + 1)) - tmp_dict = {'context_before': context_before, - 'context_after': context_after, - 'cpos_start': index_pair[0], - 'cpos_end': index_pair[1]} - match.update(tmp_dict) - return_dict[i] = match - tmp_session.disconnect() + struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element], + all_cpos) + match_str = self.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry) + cpos_infos[struct_attr_key] = match_str + tmp_list = [] + attr_key_list = [] + for key in cpos_infos.keys(): + tmp_list.append(cpos_infos[key]) + attr_key_list.append(key) + joined_cpos_infos = zip(all_cpos, *tmp_list) + dict_cpos_infos = {} + for info in joined_cpos_infos: + dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:])) + return dict_cpos_infos