mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-10-27 00:41:15 +00:00 
			
		
		
		
	Add new CQiWrapper
This commit is contained in:
		| @@ -1,4 +1,4 @@ | |||||||
| from . import CQi | import CQi | ||||||
| import socket | import socket | ||||||
| import struct | import struct | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,6 +1,7 @@ | |||||||
| from .CQiClient import CQiClient | from CQiClient import CQiClient | ||||||
| import multiprocessing | from CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND | ||||||
| import collections | import collections | ||||||
|  | from app import logger  # only works if imported into opaque web app | ||||||
|  |  | ||||||
|  |  | ||||||
| class CQiWrapper(CQiClient): | class CQiWrapper(CQiClient): | ||||||
| @@ -11,6 +12,8 @@ class CQiWrapper(CQiClient): | |||||||
|     for ease of use. Also structures recieved data into python dictionaries. |     for ease of use. Also structures recieved data into python dictionaries. | ||||||
|  |  | ||||||
|     Keyword arguments: |     Keyword arguments: | ||||||
|  |     host -- host IP adress or hostname wher the cqp server is running | ||||||
|  |     port -- port of the cqp server | ||||||
|     username -- username used to connect to the cqp server |     username -- username used to connect to the cqp server | ||||||
|     password -- password of the user to connect to the cqp server |     password -- password of the user to connect to the cqp server | ||||||
|     """ |     """ | ||||||
| @@ -32,12 +35,15 @@ class CQiWrapper(CQiClient): | |||||||
|         """ |         """ | ||||||
|         self.ctrl_connect(self.username, self.password) |         self.ctrl_connect(self.username, self.password) | ||||||
|  |  | ||||||
|     def create_attribute_strings(self): |     def __create_attribute_strings(self): | ||||||
|  |         """ | ||||||
|  |         Creates all needed attribute strings to query for word, lemma etc. in | ||||||
|  |         the given corpus. | ||||||
|  |         For example: CORPUS_NAME.word to query words | ||||||
|  |         """ | ||||||
|         p_attrs = self.corpus_positional_attributes(self.corpus_name) |         p_attrs = self.corpus_positional_attributes(self.corpus_name) | ||||||
|         struct_attrs = self.corpus_structural_attributes(self.corpus_name) |         struct_attrs = self.corpus_structural_attributes(self.corpus_name) | ||||||
|         self.meta_struct_element = struct_attrs[0] |         self.meta_struct_element = struct_attrs[0] | ||||||
|         print(p_attrs) |  | ||||||
|         print(struct_attrs) |  | ||||||
|         self.attr_strings = {} |         self.attr_strings = {} | ||||||
|         self.attr_strings['positional_attrs'] = {} |         self.attr_strings['positional_attrs'] = {} | ||||||
|         self.attr_strings['struct_attrs'] = {} |         self.attr_strings['struct_attrs'] = {} | ||||||
| @@ -49,8 +55,17 @@ class CQiWrapper(CQiClient): | |||||||
|             self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name |             self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name | ||||||
|                                                               + '.' |                                                               + '.' | ||||||
|                                                               + struct_attr) |                                                               + struct_attr) | ||||||
|     def set_corpus_name(self, corpus_name): |         logger.warning(('All positional and ' | ||||||
|         self.corpus_name = corpus_name |                         'structural attributes: {}').format(self.attr_strings)) | ||||||
|  |  | ||||||
|  |     def select_corpus(self, corpus_name): | ||||||
|  |         if corpus_name in self.corpus_list_coprora(): | ||||||
|  |             self.corpus_name = corpus_name | ||||||
|  |             self.__create_attribute_strings() | ||||||
|  |             logger.warning('{} does exist.'.format(corpus_name)) | ||||||
|  |         else: | ||||||
|  |             self.disconnect() | ||||||
|  |             logger.warning('{} does not exist.'.format(corpus_name)) | ||||||
|  |  | ||||||
|     def disconnect(self): |     def disconnect(self): | ||||||
|         """ |         """ | ||||||
| @@ -60,8 +75,9 @@ class CQiWrapper(CQiClient): | |||||||
|         """ |         """ | ||||||
|         self.ctrl_bye() |         self.ctrl_bye() | ||||||
|         self.connection.close() |         self.connection.close() | ||||||
|  |         logger.warning('Disconnected from cqp server.') | ||||||
|  |  | ||||||
|     def query_subcorpus(self, result_subcorpus_name, query): |     def query_subcorpus(self, query, result_subcorpus_name='Query-results'): | ||||||
|         """ |         """ | ||||||
|         Create subcorpus |         Create subcorpus | ||||||
|  |  | ||||||
| @@ -74,152 +90,155 @@ class CQiWrapper(CQiClient): | |||||||
|         query -- query written in cqp query language |         query -- query written in cqp query language | ||||||
|         """ |         """ | ||||||
|         self.cqp_query(self.corpus_name, result_subcorpus_name, query) |         self.cqp_query(self.corpus_name, result_subcorpus_name, query) | ||||||
|         self.result_subcorpus_ns = (self.corpus_name |         self.result_subcorpus = (self.corpus_name | ||||||
|                                     + ':' |                                  + ':' | ||||||
|                                     + result_subcorpus_name) |                                  + result_subcorpus_name) | ||||||
|         self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns) |         self.SUBCORPUS_NAMES.append(self.result_subcorpus) | ||||||
|         self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns) |         self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus) | ||||||
|         print('Nr of all matches is:', self.nr_matches) |         logger.warning('Nr of all matches is: {}'.format(self.nr_matches)) | ||||||
|  |  | ||||||
|     def show_subcorpora(self): |     def show_subcorpora(self): | ||||||
|  |         """ | ||||||
|  |         Show all subcorpora currently saved by the cqp server. | ||||||
|  |         """ | ||||||
|         return self.cqp_list_subcorpora(self.corpus_name) |         return self.cqp_list_subcorpora(self.corpus_name) | ||||||
|  |  | ||||||
|     def show_results(self, |     def show_query_results(self, | ||||||
|                      result_start_count=0, |                            context_len=10, | ||||||
|                      result_max_count=50, |                            result_len=1000): | ||||||
|                      context_len=10,): |  | ||||||
|         """ |         """ | ||||||
|         Show query results |         Show query results | ||||||
|  |  | ||||||
|         Shows the actual matched strings produce by the query. Uses the cpos |         Shows the actual matched strings produce by the query. Uses the cpos | ||||||
|         match indexes to grab those strings. saves them into an orderd |         match indexes to grab those strings. saves them into an orderd | ||||||
|         dictionary. Also saves coresponding tags, lemmas and context: |         dictionary. Also saves coresponding tags, lemmas and context. Gets those | ||||||
|         OrderedDict([ |         informations using the corresponding cpos. | ||||||
|             (0, |  | ||||||
|                 { |  | ||||||
|                     'tokens': ['Big', 'Brother', 'himself'], |  | ||||||
|                     'lemmas': ['big', 'brother', 'himself'], |  | ||||||
|                     'pos_tags': ['JJ', 'NN1', 'PPX1'], |  | ||||||
|                     'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|', |  | ||||||
|                                  '|Z8m|'], |  | ||||||
|                     'context_before': ['figures', 'of', 'the', 'Party', ',', |  | ||||||
|                                        'almost', 'on', 'a', 'level', 'with'], |  | ||||||
|                     'context_after': [',', 'and', 'then', 'had', 'engaged', |  | ||||||
|                                       'in', 'counter-revolu-', 'tionary', |  | ||||||
|                                       'activities', ','], |  | ||||||
|                     'entry_title': '1984', 'entry_author': |  | ||||||
|                     'george_orwell', |  | ||||||
|                     'cpos_start': 110490, |  | ||||||
|                     'cpos_end': 110492 |  | ||||||
|                 } |  | ||||||
|             ) |  | ||||||
|         ]) |  | ||||||
|  |  | ||||||
|         Keyword arguments: |         Keyword arguments: | ||||||
|         result_start_count -- start position of the dumped subcorpus. |  | ||||||
|         (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50 |  | ||||||
|         matches 50 to 100 will be shown. |  | ||||||
|         result_max_count -- defines how many matches at once will be shown. |  | ||||||
|         (default 50) |  | ||||||
|         context_len -- defines how many words before and after a match will be |         context_len -- defines how many words before and after a match will be | ||||||
|         shown (default 10) |         shown (default 10) | ||||||
|  |         result_len -- defines how many results are actually grabbed | ||||||
|         """ |         """ | ||||||
|         self.context_len = context_len |         self.context_len = context_len | ||||||
|         self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word']) |         self.corpus_max_len = self.cl_attribute_size( | ||||||
|  |                                    self.attr_strings['positional_attrs']['word'] | ||||||
|  |                               ) | ||||||
|  |         self.nr_matches = min(result_len, self.nr_matches) | ||||||
|         if self.nr_matches == 0: |         if self.nr_matches == 0: | ||||||
|             print('Query resulted in 0 matches.') |             logger.warning('Query resulted in 0 matches.') | ||||||
|  |             self.disconnect | ||||||
|  |             return None | ||||||
|         else: |         else: | ||||||
|             if self.nr_matches <= 50: |             # Get match cpos boundries | ||||||
|                 matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns, |             # match_boundries shows the start and end cpos of one match as a | ||||||
|                                                         0x10, |             # pair of cpositions | ||||||
|                                                         0, |             # [(1355, 1357), (1477, 1479)] Example for two boundry pairs | ||||||
|                                                         self.nr_matches - 1) |             match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus, | ||||||
|                 matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns, |                                                            CONST_FIELD_MATCH, | ||||||
|                                                       0x11, |                                                            0, | ||||||
|                                                       0, self.nr_matches - 1) |                                                            self.nr_matches - 1), | ||||||
|             else: |                                    self.cqp_dump_subcorpus(self.result_subcorpus, | ||||||
|                 matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns, |                                                            CONST_FIELD_MATCHEND, | ||||||
|                                                         0x10, |                                                            0, | ||||||
|                                                         result_start_count, |                                                            self.nr_matches - 1)) | ||||||
|                                                         result_max_count - 1) |  | ||||||
|                 matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns, |  | ||||||
|                                                       0x11, |  | ||||||
|                                                       result_start_count, |  | ||||||
|                                                       result_max_count - 1) |  | ||||||
|             match_indexes = zip(matches_start, matches_end) |  | ||||||
|  |  | ||||||
|             matches = [] |         # Generate all cpos between boundries including start and end boundries | ||||||
|             manager = multiprocessing.Manager() |         # Save them as list into on match entry at serial number 'i' | ||||||
|             return_dict = manager.dict() |         ordered_matches = collections.OrderedDict() | ||||||
|             for i, index_pair in enumerate(match_indexes): |         for i, match_pair in enumerate(match_boundaries): | ||||||
|                 match = multiprocessing.Process(target=self.__get_matches, |             ordered_matches[i] = ({'match_cpos_list': | ||||||
|                                                 args=(i, |                                    list(range(match_pair[0], | ||||||
|                                                       index_pair, |                                               match_pair[1] + 1))}) | ||||||
|                                                       self.corpus_name, |         # Saves cpos form all match entries into one list | ||||||
|                                                       return_dict)) |         all_cpos_list = [] | ||||||
|                 matches.append(match) |         for key in ordered_matches.keys(): | ||||||
|                 match.start() |             all_cpos_list += ordered_matches[key]['match_cpos_list'] | ||||||
|             for match in matches: |  | ||||||
|                 match.join() |  | ||||||
|             #  sort matches into ordered dict |  | ||||||
|             ordered_results = collections.OrderedDict() |  | ||||||
|             for key in sorted(return_dict.keys()): |  | ||||||
|                 ordered_results[key] = return_dict[key] |  | ||||||
|             return ordered_results |  | ||||||
|  |  | ||||||
|     def get_cpos_info(self, cpos, session): |         # Saves all cpos from before and after context into the list: | ||||||
|         match_dict = {} |         # all_context_cpos_list | ||||||
|  |         all_context_cpos_list = [] | ||||||
|  |         for key in ordered_matches.keys(): | ||||||
|  |             cpos_list = ordered_matches[key]['match_cpos_list'] | ||||||
|  |             before_index = max([0, cpos_list[0] - self.context_len]) | ||||||
|  |             after_index = min([self.corpus_max_len, | ||||||
|  |                                cpos_list[-1] + self.context_len]) | ||||||
|  |             ordered_matches[key]['context_before_cpos_list'] = list(range(before_index, | ||||||
|  |                                                                           cpos_list[0])) | ||||||
|  |             ordered_matches[key]['context_after_cpos_list'] = list(range(cpos_list[-1] + 1, | ||||||
|  |                                                                          after_index + 1)) | ||||||
|  |             all_context_cpos_list += ordered_matches[key]['context_before_cpos_list'] | ||||||
|  |             all_context_cpos_list += ordered_matches[key]['context_after_cpos_list'] | ||||||
|  |         # Combines all_cpos_list with all_context_cpos_list as a sorted set | ||||||
|  |         all_cpos_list += all_context_cpos_list | ||||||
|  |         all_cpos_list = sorted(list(set(all_cpos_list))) | ||||||
|  |  | ||||||
|  |         # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for | ||||||
|  |         # all cpos entries in all_cpos_list | ||||||
|  |         # Also saves these informations into the ordered_matches dict | ||||||
|  |         all_cpos_infos = self.get_cpos_infos(all_cpos_list) | ||||||
|  |         for key in ordered_matches.keys(): | ||||||
|  |             # loops over cpos in cpos_list which holds all match cpos | ||||||
|  |             # Replaces one cpos with the corresponding cpos information created | ||||||
|  |             # by self.get_cpos_infos(all_cpos_list) | ||||||
|  |             cpos_list = ordered_matches[key]['match_cpos_list'] | ||||||
|  |             infos = [] | ||||||
|  |             for cpos in cpos_list: | ||||||
|  |                 info = {cpos: all_cpos_infos.get(cpos)} | ||||||
|  |                 infos.append(info) | ||||||
|  |             ordered_matches[key]['match_cpos_list'] = infos | ||||||
|  |             try: | ||||||
|  |                 # loops over cpos in ordered_matches[key]['context_before_cpos_list'] | ||||||
|  |                 # which holds all cpos of the before context | ||||||
|  |                 # Replaces one cpos with the corresponding cpos information created | ||||||
|  |                 # by self.get_cpos_infos(all_cpos_list) | ||||||
|  |                 before_context_infos = [] | ||||||
|  |                 for context_before_cpos in ordered_matches[key]['context_before_cpos_list']: | ||||||
|  |                     before_context_info = {context_before_cpos: | ||||||
|  |                                            all_cpos_infos.get(context_before_cpos)} | ||||||
|  |                     before_context_infos.append(before_context_info) | ||||||
|  |                 ordered_matches[key]['context_before_cpos_list'] = before_context_infos | ||||||
|  |             except UnboundLocalError: | ||||||
|  |                 logger.warning('Context before cpos list is empty.') | ||||||
|  |             try: | ||||||
|  |                 # loops over cpos in ordered_matches[key]['context_after_cpos_list'] | ||||||
|  |                 # which holds all cpos of the before context | ||||||
|  |                 # Replaces one cpos with the corresponding cpos information created | ||||||
|  |                 # by self.get_cpos_infos(all_cpos_list) | ||||||
|  |                 after_context_infos = [] | ||||||
|  |                 for context_after_cpos in ordered_matches[key]['context_after_cpos_list']: | ||||||
|  |                     after_context_info = {context_after_cpos: | ||||||
|  |                                           all_cpos_infos.get(context_after_cpos)} | ||||||
|  |                     after_context_infos.append(after_context_info) | ||||||
|  |                 ordered_matches[key]['context_after_cpos_list'] = after_context_infos | ||||||
|  |             except UnboundLocalError: | ||||||
|  |                 logger.warning('Context after cpos list is empty.') | ||||||
|  |         return ordered_matches | ||||||
|  |  | ||||||
|  |     def get_cpos_infos(self, all_cpos): | ||||||
|  |         ''' | ||||||
|  |         Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for | ||||||
|  |         all cpos entries specified in the parameter all_cpos. | ||||||
|  |         ''' | ||||||
|  |         cpos_infos = {} | ||||||
|         for attr_dict in self.attr_strings: |         for attr_dict in self.attr_strings: | ||||||
|             # print(self.attr_strings[attr_dict]) |  | ||||||
|             if attr_dict == 'positional_attrs': |             if attr_dict == 'positional_attrs': | ||||||
|                 for p_attr_key in self.attr_strings[attr_dict].keys(): |                 for p_attr_key in self.attr_strings[attr_dict].keys(): | ||||||
|                     # print(p_attr_key) |                     match_str = self.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], | ||||||
|                     match_str = session.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], range(cpos[0], cpos[1])) |                                                  all_cpos) | ||||||
|                     match_dict[p_attr_key] = match_str |                     cpos_infos[p_attr_key] = match_str | ||||||
|             elif attr_dict == 'struct_attrs': |             elif attr_dict == 'struct_attrs': | ||||||
|                 for struct_attr_key in self.attr_strings[attr_dict].keys(): |                 for struct_attr_key in self.attr_strings[attr_dict].keys(): | ||||||
|                     # print(struct_attr_key) |                     struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element], | ||||||
|                     struct_entry = session.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element], |                                                       all_cpos) | ||||||
|                                                          range(cpos[0], cpos[1])) |                     match_str = self.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry) | ||||||
|                     match_str = session.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry) |                     cpos_infos[struct_attr_key] = match_str | ||||||
|                     match_dict[struct_attr_key] = set(match_str) |         tmp_list = [] | ||||||
|         return match_dict |         attr_key_list = [] | ||||||
|  |         for key in cpos_infos.keys(): | ||||||
|     def __get_matches(self, i, index_pair, corpus_name, return_dict): |             tmp_list.append(cpos_infos[key]) | ||||||
|         """ |             attr_key_list.append(key) | ||||||
|         Get matches as readable output |         joined_cpos_infos = zip(all_cpos, *tmp_list) | ||||||
|  |         dict_cpos_infos = {} | ||||||
|         Gets the actual match strings of cpos match indexes. Private helper |         for info in joined_cpos_infos: | ||||||
|         method used in show_results. |             dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:])) | ||||||
|  |         return dict_cpos_infos | ||||||
|         Keyword arguments: |  | ||||||
|         i -- serial number for match at given cpos |  | ||||||
|         index_pair -- match start and match end cpos |  | ||||||
|         corpus_name -- name of the parent corpus |  | ||||||
|         return_dict -- dictionary created with manager.dict() that holds the |  | ||||||
|         extracted strings tags etc. |  | ||||||
|         """ |  | ||||||
|         # print('START:', index_pair[0]) |  | ||||||
|         # print('END:', index_pair[1]) |  | ||||||
|         # print('=============================') |  | ||||||
|         index_pair = [index_pair[0], index_pair[1] + 1] |  | ||||||
|         tmp_session = CQiWrapper(username=self.username, password=self.password, |  | ||||||
|                                  host=self.host, port=self.port) |  | ||||||
|         tmp_session.connect() |  | ||||||
|         match = self.get_cpos_info(index_pair, tmp_session) |  | ||||||
|         before_index = max([0, index_pair[0] - self.context_len]) |  | ||||||
|         after_index = min([self.corpus_max_len, |  | ||||||
|                            index_pair[1] + self.context_len]) |  | ||||||
|         context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'], |  | ||||||
|                                                  range(before_index, |  | ||||||
|                                                        index_pair[0])) |  | ||||||
|         context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'], |  | ||||||
|                                                 range(index_pair[1] + 1, |  | ||||||
|                                                       after_index + 1)) |  | ||||||
|         tmp_dict = {'context_before': context_before, |  | ||||||
|                     'context_after': context_after, |  | ||||||
|                     'cpos_start': index_pair[0], |  | ||||||
|                     'cpos_end': index_pair[1]} |  | ||||||
|         match.update(tmp_dict) |  | ||||||
|         return_dict[i] = match |  | ||||||
|         tmp_session.disconnect() |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user