mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-10-27 00:41:15 +00:00 
			
		
		
		
	Add new CQiWrapper
This commit is contained in:
		| @@ -1,4 +1,4 @@ | ||||
| from . import CQi | ||||
| import CQi | ||||
| import socket | ||||
| import struct | ||||
|  | ||||
|   | ||||
| @@ -1,6 +1,7 @@ | ||||
| from .CQiClient import CQiClient | ||||
| import multiprocessing | ||||
| from CQiClient import CQiClient | ||||
| from CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND | ||||
| import collections | ||||
| from app import logger  # only works if imported into opaque web app | ||||
|  | ||||
|  | ||||
| class CQiWrapper(CQiClient): | ||||
| @@ -11,6 +12,8 @@ class CQiWrapper(CQiClient): | ||||
|     for ease of use. Also structures received data into Python dictionaries. | ||||
|  | ||||
|     Keyword arguments: | ||||
|     host -- host IP address or hostname where the cqp server is running | ||||
|     port -- port of the cqp server | ||||
|     username -- username used to connect to the cqp server | ||||
|     password -- password of the user to connect to the cqp server | ||||
|     """ | ||||
| @@ -32,12 +35,15 @@ class CQiWrapper(CQiClient): | ||||
|         """ | ||||
|         self.ctrl_connect(self.username, self.password) | ||||
|  | ||||
|     def create_attribute_strings(self): | ||||
|     def __create_attribute_strings(self): | ||||
|         """ | ||||
|         Creates all needed attribute strings to query for word, lemma etc. in | ||||
|         the given corpus. | ||||
|         For example: CORPUS_NAME.word to query words | ||||
|         """ | ||||
|         p_attrs = self.corpus_positional_attributes(self.corpus_name) | ||||
|         struct_attrs = self.corpus_structural_attributes(self.corpus_name) | ||||
|         self.meta_struct_element = struct_attrs[0] | ||||
|         print(p_attrs) | ||||
|         print(struct_attrs) | ||||
|         self.attr_strings = {} | ||||
|         self.attr_strings['positional_attrs'] = {} | ||||
|         self.attr_strings['struct_attrs'] = {} | ||||
| @@ -49,8 +55,17 @@ class CQiWrapper(CQiClient): | ||||
|             self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name | ||||
|                                                               + '.' | ||||
|                                                               + struct_attr) | ||||
|     def set_corpus_name(self, corpus_name): | ||||
|         logger.warning(('All positional and ' | ||||
|                         'structural attributes: {}').format(self.attr_strings)) | ||||
|  | ||||
|     def select_corpus(self, corpus_name): | ||||
|         if corpus_name in self.corpus_list_coprora(): | ||||
|             self.corpus_name = corpus_name | ||||
|             self.__create_attribute_strings() | ||||
|             logger.warning('{} does exist.'.format(corpus_name)) | ||||
|         else: | ||||
|             self.disconnect() | ||||
|             logger.warning('{} does not exist.'.format(corpus_name)) | ||||
|  | ||||
|     def disconnect(self): | ||||
|         """ | ||||
| @@ -60,8 +75,9 @@ class CQiWrapper(CQiClient): | ||||
|         """ | ||||
|         self.ctrl_bye() | ||||
|         self.connection.close() | ||||
|         logger.warning('Disconnected from cqp server.') | ||||
|  | ||||
|     def query_subcorpus(self, result_subcorpus_name, query): | ||||
|     def query_subcorpus(self, query, result_subcorpus_name='Query-results'): | ||||
|         """ | ||||
|         Create subcorpus | ||||
|  | ||||
| @@ -74,152 +90,155 @@ class CQiWrapper(CQiClient): | ||||
|         query -- query written in cqp query language | ||||
|         """ | ||||
|         self.cqp_query(self.corpus_name, result_subcorpus_name, query) | ||||
|         self.result_subcorpus_ns = (self.corpus_name | ||||
|         self.result_subcorpus = (self.corpus_name | ||||
|                                  + ':' | ||||
|                                  + result_subcorpus_name) | ||||
|         self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns) | ||||
|         self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns) | ||||
|         print('Nr of all matches is:', self.nr_matches) | ||||
|         self.SUBCORPUS_NAMES.append(self.result_subcorpus) | ||||
|         self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus) | ||||
|         logger.warning('Nr of all matches is: {}'.format(self.nr_matches)) | ||||
|  | ||||
|     def show_subcorpora(self): | ||||
|         """ | ||||
|         Show all subcorpora currently saved by the cqp server. | ||||
|         """ | ||||
|         return self.cqp_list_subcorpora(self.corpus_name) | ||||
|  | ||||
|     def show_results(self, | ||||
|                      result_start_count=0, | ||||
|                      result_max_count=50, | ||||
|                      context_len=10,): | ||||
|     def show_query_results(self, | ||||
|                            context_len=10, | ||||
|                            result_len=1000): | ||||
|         """ | ||||
|         Show query results | ||||
|  | ||||
|         Shows the actual matched strings produced by the query. Uses the cpos | ||||
|         match indexes to grab those strings. Saves them into an ordered | ||||
|         dictionary. Also saves corresponding tags, lemmas and context: | ||||
|         OrderedDict([ | ||||
|             (0, | ||||
|                 { | ||||
|                     'tokens': ['Big', 'Brother', 'himself'], | ||||
|                     'lemmas': ['big', 'brother', 'himself'], | ||||
|                     'pos_tags': ['JJ', 'NN1', 'PPX1'], | ||||
|                     'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|', | ||||
|                                  '|Z8m|'], | ||||
|                     'context_before': ['figures', 'of', 'the', 'Party', ',', | ||||
|                                        'almost', 'on', 'a', 'level', 'with'], | ||||
|                     'context_after': [',', 'and', 'then', 'had', 'engaged', | ||||
|                                       'in', 'counter-revolu-', 'tionary', | ||||
|                                       'activities', ','], | ||||
|                     'entry_title': '1984', 'entry_author': | ||||
|                     'george_orwell', | ||||
|                     'cpos_start': 110490, | ||||
|                     'cpos_end': 110492 | ||||
|                 } | ||||
|             ) | ||||
|         ]) | ||||
|         dictionary. Also saves corresponding tags, lemmas and context. Gets this | ||||
|         information using the corresponding cpos. | ||||
|  | ||||
|         Keyword arguments: | ||||
|         result_start_count -- start position of the dumped subcorpus. | ||||
|         (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50 | ||||
|         matches 50 to 100 will be shown. | ||||
|         result_max_count -- defines how many matches at once will be shown. | ||||
|         (default 50) | ||||
|         context_len -- defines how many words before and after a match will be | ||||
|         shown (default 10) | ||||
|         result_len -- defines how many results are actually grabbed | ||||
|         """ | ||||
|         self.context_len = context_len | ||||
|         self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word']) | ||||
|         self.corpus_max_len = self.cl_attribute_size( | ||||
|                                    self.attr_strings['positional_attrs']['word'] | ||||
|                               ) | ||||
|         self.nr_matches = min(result_len, self.nr_matches) | ||||
|         if self.nr_matches == 0: | ||||
|             print('Query resulted in 0 matches.') | ||||
|             logger.warning('Query resulted in 0 matches.') | ||||
|             self.disconnect | ||||
|             return None | ||||
|         else: | ||||
|             if self.nr_matches <= 50: | ||||
|                 matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns, | ||||
|                                                         0x10, | ||||
|             # Get match cpos boundaries | ||||
|             # match_boundaries shows the start and end cpos of one match as a | ||||
|             # pair of cpositions | ||||
|             # [(1355, 1357), (1477, 1479)] Example for two boundary pairs | ||||
|             match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus, | ||||
|                                                            CONST_FIELD_MATCH, | ||||
|                                                            0, | ||||
|                                                         self.nr_matches - 1) | ||||
|                 matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns, | ||||
|                                                       0x11, | ||||
|                                                       0, self.nr_matches - 1) | ||||
|             else: | ||||
|                 matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns, | ||||
|                                                         0x10, | ||||
|                                                         result_start_count, | ||||
|                                                         result_max_count - 1) | ||||
|                 matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns, | ||||
|                                                       0x11, | ||||
|                                                       result_start_count, | ||||
|                                                       result_max_count - 1) | ||||
|             match_indexes = zip(matches_start, matches_end) | ||||
|                                                            self.nr_matches - 1), | ||||
|                                    self.cqp_dump_subcorpus(self.result_subcorpus, | ||||
|                                                            CONST_FIELD_MATCHEND, | ||||
|                                                            0, | ||||
|                                                            self.nr_matches - 1)) | ||||
|  | ||||
|             matches = [] | ||||
|             manager = multiprocessing.Manager() | ||||
|             return_dict = manager.dict() | ||||
|             for i, index_pair in enumerate(match_indexes): | ||||
|                 match = multiprocessing.Process(target=self.__get_matches, | ||||
|                                                 args=(i, | ||||
|                                                       index_pair, | ||||
|                                                       self.corpus_name, | ||||
|                                                       return_dict)) | ||||
|                 matches.append(match) | ||||
|                 match.start() | ||||
|             for match in matches: | ||||
|                 match.join() | ||||
|             #  sort matches into ordered dict | ||||
|             ordered_results = collections.OrderedDict() | ||||
|             for key in sorted(return_dict.keys()): | ||||
|                 ordered_results[key] = return_dict[key] | ||||
|             return ordered_results | ||||
|         # Generate all cpos between boundaries including start and end boundaries | ||||
|         # Save them as list into one match entry at serial number 'i' | ||||
|         ordered_matches = collections.OrderedDict() | ||||
|         for i, match_pair in enumerate(match_boundaries): | ||||
|             ordered_matches[i] = ({'match_cpos_list': | ||||
|                                    list(range(match_pair[0], | ||||
|                                               match_pair[1] + 1))}) | ||||
|         # Saves cpos from all match entries into one list | ||||
|         all_cpos_list = [] | ||||
|         for key in ordered_matches.keys(): | ||||
|             all_cpos_list += ordered_matches[key]['match_cpos_list'] | ||||
|  | ||||
|     def get_cpos_info(self, cpos, session): | ||||
|         match_dict = {} | ||||
|         # Saves all cpos from before and after context into the list: | ||||
|         # all_context_cpos_list | ||||
|         all_context_cpos_list = [] | ||||
|         for key in ordered_matches.keys(): | ||||
|             cpos_list = ordered_matches[key]['match_cpos_list'] | ||||
|             before_index = max([0, cpos_list[0] - self.context_len]) | ||||
|             after_index = min([self.corpus_max_len, | ||||
|                                cpos_list[-1] + self.context_len]) | ||||
|             ordered_matches[key]['context_before_cpos_list'] = list(range(before_index, | ||||
|                                                                           cpos_list[0])) | ||||
|             ordered_matches[key]['context_after_cpos_list'] = list(range(cpos_list[-1] + 1, | ||||
|                                                                          after_index + 1)) | ||||
|             all_context_cpos_list += ordered_matches[key]['context_before_cpos_list'] | ||||
|             all_context_cpos_list += ordered_matches[key]['context_after_cpos_list'] | ||||
|         # Combines all_cpos_list with all_context_cpos_list as a sorted set | ||||
|         all_cpos_list += all_context_cpos_list | ||||
|         all_cpos_list = sorted(list(set(all_cpos_list))) | ||||
|  | ||||
|         # Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for | ||||
|         # all cpos entries in all_cpos_list | ||||
|         # Also saves this information into the ordered_matches dict | ||||
|         all_cpos_infos = self.get_cpos_infos(all_cpos_list) | ||||
|         for key in ordered_matches.keys(): | ||||
|             # loops over cpos in cpos_list which holds all match cpos | ||||
|             # Replaces one cpos with the corresponding cpos information created | ||||
|             # by self.get_cpos_infos(all_cpos_list) | ||||
|             cpos_list = ordered_matches[key]['match_cpos_list'] | ||||
|             infos = [] | ||||
|             for cpos in cpos_list: | ||||
|                 info = {cpos: all_cpos_infos.get(cpos)} | ||||
|                 infos.append(info) | ||||
|             ordered_matches[key]['match_cpos_list'] = infos | ||||
|             try: | ||||
|                 # loops over cpos in ordered_matches[key]['context_before_cpos_list'] | ||||
|                 # which holds all cpos of the before context | ||||
|                 # Replaces one cpos with the corresponding cpos information created | ||||
|                 # by self.get_cpos_infos(all_cpos_list) | ||||
|                 before_context_infos = [] | ||||
|                 for context_before_cpos in ordered_matches[key]['context_before_cpos_list']: | ||||
|                     before_context_info = {context_before_cpos: | ||||
|                                            all_cpos_infos.get(context_before_cpos)} | ||||
|                     before_context_infos.append(before_context_info) | ||||
|                 ordered_matches[key]['context_before_cpos_list'] = before_context_infos | ||||
|             except UnboundLocalError: | ||||
|                 logger.warning('Context before cpos list is empty.') | ||||
|             try: | ||||
|                 # loops over cpos in ordered_matches[key]['context_after_cpos_list'] | ||||
|                 # which holds all cpos of the after context | ||||
|                 # Replaces one cpos with the corresponding cpos information created | ||||
|                 # by self.get_cpos_infos(all_cpos_list) | ||||
|                 after_context_infos = [] | ||||
|                 for context_after_cpos in ordered_matches[key]['context_after_cpos_list']: | ||||
|                     after_context_info = {context_after_cpos: | ||||
|                                           all_cpos_infos.get(context_after_cpos)} | ||||
|                     after_context_infos.append(after_context_info) | ||||
|                 ordered_matches[key]['context_after_cpos_list'] = after_context_infos | ||||
|             except UnboundLocalError: | ||||
|                 logger.warning('Context after cpos list is empty.') | ||||
|         return ordered_matches | ||||
|  | ||||
|     def get_cpos_infos(self, all_cpos): | ||||
|         ''' | ||||
|         Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for | ||||
|         all cpos entries specified in the parameter all_cpos. | ||||
|         ''' | ||||
|         cpos_infos = {} | ||||
|         for attr_dict in self.attr_strings: | ||||
|             # print(self.attr_strings[attr_dict]) | ||||
|             if attr_dict == 'positional_attrs': | ||||
|                 for p_attr_key in self.attr_strings[attr_dict].keys(): | ||||
|                     # print(p_attr_key) | ||||
|                     match_str = session.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], range(cpos[0], cpos[1])) | ||||
|                     match_dict[p_attr_key] = match_str | ||||
|                     match_str = self.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], | ||||
|                                                  all_cpos) | ||||
|                     cpos_infos[p_attr_key] = match_str | ||||
|             elif attr_dict == 'struct_attrs': | ||||
|                 for struct_attr_key in self.attr_strings[attr_dict].keys(): | ||||
|                     # print(struct_attr_key) | ||||
|                     struct_entry = session.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element], | ||||
|                                                          range(cpos[0], cpos[1])) | ||||
|                     match_str = session.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry) | ||||
|                     match_dict[struct_attr_key] = set(match_str) | ||||
|         return match_dict | ||||
|  | ||||
|     def __get_matches(self, i, index_pair, corpus_name, return_dict): | ||||
|         """ | ||||
|         Get matches as readable output | ||||
|  | ||||
|         Gets the actual match strings of cpos match indexes. Private helper | ||||
|         method used in show_results. | ||||
|  | ||||
|         Keyword arguments: | ||||
|         i -- serial number for match at given cpos | ||||
|         index_pair -- match start and match end cpos | ||||
|         corpus_name -- name of the parent corpus | ||||
|         return_dict -- dictionary created with manager.dict() that holds the | ||||
|         extracted strings tags etc. | ||||
|         """ | ||||
|         # print('START:', index_pair[0]) | ||||
|         # print('END:', index_pair[1]) | ||||
|         # print('=============================') | ||||
|         index_pair = [index_pair[0], index_pair[1] + 1] | ||||
|         tmp_session = CQiWrapper(username=self.username, password=self.password, | ||||
|                                  host=self.host, port=self.port) | ||||
|         tmp_session.connect() | ||||
|         match = self.get_cpos_info(index_pair, tmp_session) | ||||
|         before_index = max([0, index_pair[0] - self.context_len]) | ||||
|         after_index = min([self.corpus_max_len, | ||||
|                            index_pair[1] + self.context_len]) | ||||
|         context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'], | ||||
|                                                  range(before_index, | ||||
|                                                        index_pair[0])) | ||||
|         context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'], | ||||
|                                                 range(index_pair[1] + 1, | ||||
|                                                       after_index + 1)) | ||||
|         tmp_dict = {'context_before': context_before, | ||||
|                     'context_after': context_after, | ||||
|                     'cpos_start': index_pair[0], | ||||
|                     'cpos_end': index_pair[1]} | ||||
|         match.update(tmp_dict) | ||||
|         return_dict[i] = match | ||||
|         tmp_session.disconnect() | ||||
|                     struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element], | ||||
|                                                       all_cpos) | ||||
|                     match_str = self.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry) | ||||
|                     cpos_infos[struct_attr_key] = match_str | ||||
|         tmp_list = [] | ||||
|         attr_key_list = [] | ||||
|         for key in cpos_infos.keys(): | ||||
|             tmp_list.append(cpos_infos[key]) | ||||
|             attr_key_list.append(key) | ||||
|         joined_cpos_infos = zip(all_cpos, *tmp_list) | ||||
|         dict_cpos_infos = {} | ||||
|         for info in joined_cpos_infos: | ||||
|             dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:])) | ||||
|         return dict_cpos_infos | ||||
|   | ||||
		Reference in New Issue
	
	Block a user