From b3d5c15df347bb2a8a2de266ea81f6d2418b4f74 Mon Sep 17 00:00:00 2001 From: Stephan Porada Date: Wed, 27 Nov 2019 09:41:21 +0100 Subject: [PATCH] CQiWrapper new data structure --- app/corpora/CQiWrapper/CQiWrapper.py | 196 ++++++++++++--------------- 1 file changed, 89 insertions(+), 107 deletions(-) diff --git a/app/corpora/CQiWrapper/CQiWrapper.py b/app/corpora/CQiWrapper/CQiWrapper.py index e2c9d996..e30b13ff 100644 --- a/app/corpora/CQiWrapper/CQiWrapper.py +++ b/app/corpora/CQiWrapper/CQiWrapper.py @@ -1,8 +1,7 @@ -from .CQiClient import CQiClient -from .CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND -import collections +from CQiClient import CQiClient +from CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND import re -from app import logger # only works if imported into opaque web app +# from app import logger # only works if imported into opaque web app class CQiWrapper(CQiClient): @@ -55,16 +54,16 @@ class CQiWrapper(CQiClient): self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name + '.' + struct_attr) - logger.warning(('All positional and ' - 'structural attributes: {}').format(self.attr_strings)) + # logger.warning(('All positional and ' + # 'structural attributes: {}').format(self.attr_strings)) def select_corpus(self, corpus_name): if corpus_name in self.corpus_list_coprora(): self.corpus_name = corpus_name self.__create_attribute_strings() - logger.warning('{} does exist.'.format(corpus_name)) + # logger.warning('{} does exist.'.format(corpus_name)) else: - logger.warning('{} does not exist.'.format(corpus_name)) + # logger.warning('{} does not exist.'.format(corpus_name)) pass def disconnect(self): @@ -75,7 +74,7 @@ class CQiWrapper(CQiClient): """ self.ctrl_bye() self.connection.close() - logger.warning('Disconnected from cqp server.') + # logger.warning('Disconnected from cqp server.') def query_subcorpus(self, query, result_subcorpus_name='Query-results'): """ @@ -95,7 +94,7 @@ class CQiWrapper(CQiClient): + result_subcorpus_name) self.SUBCORPUS_NAMES.append(self.result_subcorpus) self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus) - logger.warning('Nr of all matches is: {}'.format(self.nr_matches)) + # logger.warning('Nr of all matches is: {}'.format(self.nr_matches)) def show_subcorpora(self): """ @@ -125,7 +124,7 @@ class CQiWrapper(CQiClient): ) self.nr_matches = min(result_len, self.nr_matches) if self.nr_matches == 0: - logger.warning('Query resulted in 0 matches.') + # logger.warning('Query resulted in 0 matches.') return None else: # Get match cpos boundries @@ -141,86 +140,49 @@ class CQiWrapper(CQiClient): 0, self.nr_matches - 1)) - # Generate all cpos between boundries including start and end boundries - # Save them as list into on match entry at serial number 'i' - ordered_matches = collections.OrderedDict() - for i, match_pair in enumerate(match_boundaries): - ordered_matches[i] = ({'match_cpos': - list(range(match_pair[0], - match_pair[1] + 1))}) - # Saves cpos form all match entries into one list - all_cpos_list = [] - for key in ordered_matches.keys(): - all_cpos_list += ordered_matches[key]['match_cpos'] + # Generate all cpos between match boundries including start and end boundries. + # Also generate cpos for left and right context. + # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc' + # Also collect all cpos together in one list for the final request of + # all cpos informations + all_matches = [] + all_cpos = [] + for start, end in match_boundaries: + lc_cpos = list(range(max([0, start - self.context_len]), start)) + lc = {'lc': lc_cpos} + match_cpos = list(range(start, end + 1)) + match = {'hit': match_cpos} + rc_cpos = list(range(end + 1, min([self.corpus_max_len, end + self.context_len + 1]))) + rc = {'rc': rc_cpos} + lc.update(match) + lc.update(rc) + all_cpos.extend(lc_cpos + match_cpos + rc_cpos) + all_matches.append(lc) + # print(all_matches) + # print(all_cpos) - # Saves all cpos from before and after context into the list: - # all_context_cpos_list - all_context_cpos_list = [] - for key in ordered_matches.keys(): - cpos_list = ordered_matches[key]['match_cpos'] - before_index = max([0, cpos_list[0] - self.context_len]) - after_index = min([self.corpus_max_len, - cpos_list[-1] + self.context_len]) - ordered_matches[key]['left_context_cpos'] = list(range(before_index, - cpos_list[0])) - ordered_matches[key]['right_context_cpos'] = list(range(cpos_list[-1] + 1, - after_index + 1)) - all_context_cpos_list += ordered_matches[key]['left_context_cpos'] - all_context_cpos_list += ordered_matches[key]['right_context_cpos'] - # Combines all_cpos_list with all_context_cpos_list as a sorted set - all_cpos_list += all_context_cpos_list - all_cpos_list = sorted(list(set(all_cpos_list))) + # Get all sentences IDs for all above collected cpos in all_cpos + s_ids = self.cl_cpos2struc('UTOPIEN.s', all_cpos) # CHANGE to CORPUS.s will always be like this in nopaque + # Get all cpos for all sneteces boundries + s_lookup = {} + for s_id in set(s_ids): + s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id) # CHANGE to CORPUS.s will always be like this in nopaque + # print(s_start, s_end) + s_cpos = range(s_start, s_end) + s_lookup.update({s_id: list(s_cpos)}) + # print(list(s_cpos)) + all_cpos.extend(s_cpos) + all_cpos = list(set(all_cpos)) # get rid of cpos duplicates # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for # all cpos entries in all_cpos_list - # Also saves these informations into the ordered_matches dict - all_cpos_infos, s_list = self.get_cpos_infos(all_cpos_list) - for key in ordered_matches.keys(): - # loops over cpos in cpos_list which holds all match cpos - # Replaces one cpos with the corresponding cpos information created - # by self.get_cpos_infos(all_cpos_list) - cpos_list = ordered_matches[key]['match_cpos'] - infos = [] - for cpos in cpos_list: - info = {cpos: all_cpos_infos.get(cpos)} - infos.append(info) - ordered_matches[key]['match_cpos'] = infos - try: - # loops over cpos in ordered_matches[key]['left_context_cpos'] - # which holds all cpos of the before context - # Replaces one cpos with the corresponding cpos information created - # by self.get_cpos_infos(all_cpos_list) - before_context_infos = [] - for context_before_cpos in ordered_matches[key]['left_context_cpos']: - before_context_info = {context_before_cpos: - all_cpos_infos.get(context_before_cpos)} - before_context_infos.append(before_context_info) - ordered_matches[key]['left_context_cpos'] = before_context_infos - except UnboundLocalError: - logger.warning('Context before cpos list is empty.') - pass - try: - # loops over cpos in ordered_matches[key]['right_context_cpos'] - # which holds all cpos of the before context - # Replaces one cpos with the corresponding cpos information created - # by self.get_cpos_infos(all_cpos_list) - after_context_infos = [] - for context_after_cpos in ordered_matches[key]['right_context_cpos']: - after_context_info = {context_after_cpos: - all_cpos_infos.get(context_after_cpos)} - after_context_infos.append(after_context_info) - ordered_matches[key]['right_context_cpos'] = after_context_infos - except UnboundLocalError: - logger.warning('Context after cpos list is empty.') - pass - sentences = {} - s_list = set(s_list) - for s_id in s_list: - s_start, s_end = self.cl_struc2cpos('CORPUS.s', s_id) - sentence = self.cl_cpos2str('CORPUS.word', range(s_start, s_end + 1)) - sentences.update({s_id: re.sub(r' (?=\W)', '', ' '.join(sentence))}) - ordered_matches['sentences'] = sentences - return ordered_matches + # Also saves these informations into self.results dict + all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos) + + self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos, + 's_lookup': s_lookup, 'text_lookup': text_lookup} + return self.results + # print(self.results) def get_cpos_infos(self, all_cpos): ''' @@ -228,25 +190,42 @@ class CQiWrapper(CQiClient): all cpos entries specified in the parameter all_cpos. ''' cpos_infos = {} - s_list = [] - for key in self.attr_strings.keys(): - if key == 'positional_attrs': - for p_attr_key in self.attr_strings[key].keys(): - match_strs = self.cl_cpos2str(self.attr_strings[key][p_attr_key], - all_cpos) - cpos_infos[p_attr_key] = match_strs - elif key == 'struct_attrs': - for struct_attr_key in self.attr_strings[key].keys(): - struct_entry = self.cl_cpos2struc(self.attr_strings[key][struct_attr_key], - all_cpos) - has_value = self.corpus_structural_attribute_has_values(self.attr_strings[key][struct_attr_key]) - if has_value: - match_strs = self.cl_struc2str(self.attr_strings[key][struct_attr_key], struct_entry) - elif self.attr_strings[key][struct_attr_key] == 'CORPUS.s': - s_list.extend(struct_entry) - else: - match_strs = [None for i in struct_entry] - cpos_infos[struct_attr_key] = zip(struct_entry, match_strs) + for p_attr_key in self.attr_strings['positional_attrs'].keys(): + match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos) + cpos_infos[p_attr_key] = match_strs + + tmp_s_info = [] + tmp_text_info = [] + text_lookup = {} + tmp_dict = {} + for struct_attr_key in self.attr_strings['struct_attrs'].keys(): + check = self.attr_strings['struct_attrs'][struct_attr_key] + if check == 'UTOPIEN.s': + struct_ids = self.cl_cpos2struc(check, all_cpos) + for id in struct_ids: + tmp_s_info.append({struct_attr_key: id}) + elif check == 'UTOPIEN.entry': + struct_ids = self.cl_cpos2struc(check, all_cpos) + for id in struct_ids: + tmp_text_info.append({struct_attr_key: id}) + else: + struct_ids = struct_ids = self.cl_cpos2struc(check, all_cpos) + struct_values = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_ids) + for value in struct_values: + for id in struct_ids: + tmp_dict.update({id: {struct_attr_key: value}}) + print(tmp_dict) + print(text_lookup) + + # struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][struct_attr_key], all_cpos) + # has_value = self.corpus_structural_attribute_has_values(self.attr_strings['struct_attrs'][struct_attr_key]) + # if has_value: + # match_strs = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_entry) + # elif self.attr_strings['struct_attrs'][struct_attr_key] == 'CORPUS.s': + # pass + # else: + # match_strs = [None for i in struct_entry] + # cpos_infos[struct_attr_key] = zip(struct_entry, match_strs) tmp_list = [] attr_key_list = [] for key in cpos_infos.keys(): @@ -256,4 +235,7 @@ class CQiWrapper(CQiClient): dict_cpos_infos = {} for info in joined_cpos_infos: dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:])) - return dict_cpos_infos, s_list + for key, s_id, text_id in zip(dict_cpos_infos.keys(), tmp_s_info, tmp_text_info): + dict_cpos_infos[key].update(s_id) + dict_cpos_infos[key].update(text_id) + return dict_cpos_infos, text_lookup