Add new CQiWrapper

2026-06-20 20:05:43 +00:00 · 2019-11-18 14:24:13 +01:00
parent baf06d3106
commit 5fdd67ebf2
2 changed files with 153 additions and 134 deletions
@@ -1,4 +1,4 @@
-from . import CQi
+import CQi
 import socket
 import struct
@@ -1,6 +1,7 @@
-from .CQiClient import CQiClient
+from CQiClient import CQiClient
-import multiprocessing
+from CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
 import collections
 from app import logger  # only works if imported into opaque web app
 class CQiWrapper(CQiClient):
@@ -11,6 +12,8 @@ class CQiWrapper(CQiClient):
    for ease of use. Also structures received data into python dictionaries.
    Keyword arguments:
    host -- host IP address or hostname where the cqp server is running
    port -- port of the cqp server
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    """
@@ -32,12 +35,15 @@ class CQiWrapper(CQiClient):
        """
        self.ctrl_connect(self.username, self.password)
-    def create_attribute_strings(self):
+    def __create_attribute_strings(self):
        """
        Creates all needed attribute strings to query for word, lemma etc. in
        the given corpus.
        For example: CORPUS_NAME.word to query words
        """
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
        self.meta_struct_element = struct_attrs[0]
        print(p_attrs)
        print(struct_attrs)
        self.attr_strings = {}
        self.attr_strings['positional_attrs'] = {}
        self.attr_strings['struct_attrs'] = {}
@@ -49,8 +55,17 @@ class CQiWrapper(CQiClient):
            self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
                                                              + '.'
                                                              + struct_attr)
-    def set_corpus_name(self, corpus_name):
+        logger.warning(('All positional and '
                        'structural attributes: {}').format(self.attr_strings))
    def select_corpus(self, corpus_name):
        """
        Select the corpus that later calls operate on.

        Checks corpus_name against the list returned by
        self.corpus_list_coprora(). On a hit the name is stored in
        self.corpus_name and the attribute strings for that corpus are
        created. On a miss self.disconnect() is called. Both outcomes are
        logged.

        Keyword arguments:
        corpus_name -- name of the corpus to select
        """
        available_corpora = self.corpus_list_coprora()
        # Guard clause: unknown corpus -> disconnect, log and stop here.
        if corpus_name not in available_corpora:
            self.disconnect()
            logger.warning('{} does not exist.'.format(corpus_name))
            return
        self.corpus_name = corpus_name
        self.__create_attribute_strings()
        logger.warning('{} does exist.'.format(corpus_name))
    def disconnect(self):
        """
@@ -60,8 +75,9 @@ class CQiWrapper(CQiClient):
        """
        self.ctrl_bye()
        self.connection.close()
        logger.warning('Disconnected from cqp server.')
-    def query_subcorpus(self, result_subcorpus_name, query):
+    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
        """
        Create subcorpus
@@ -74,152 +90,155 @@ class CQiWrapper(CQiClient):
        query -- query written in cqp query language
        """
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
-        self.result_subcorpus_ns = (self.corpus_name
+        self.result_subcorpus = (self.corpus_name
                                 + ':'
                                 + result_subcorpus_name)
-        self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
+        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
-        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns)
+        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
-        print('Nr of all matches is:', self.nr_matches)
+        logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
    def show_subcorpora(self):
        """
        List the subcorpora of the currently selected corpus.

        Returns the result of self.cqp_list_subcorpora() called with
        self.corpus_name, unchanged.
        """
        fetch_subcorpora = self.cqp_list_subcorpora
        return fetch_subcorpora(self.corpus_name)
-    def show_results(self,
+    def show_query_results(self,
-                     result_start_count=0,
+                           context_len=10,
-                     result_max_count=50,
+                           result_len=1000):
                     context_len=10,):
        """
        Show query results
        Shows the actual matched strings produced by the query. Uses the cpos
        match indexes to grab those strings. Saves them into an ordered
-        dictionary. Also saves coresponding tags, lemmas and context:
+        dictionary. Also saves corresponding tags, lemmas and context. Gets those
-        OrderedDict([
+        details using the corresponding cpos.
            (0,
                {
                    'tokens': ['Big', 'Brother', 'himself'],
                    'lemmas': ['big', 'brother', 'himself'],
                    'pos_tags': ['JJ', 'NN1', 'PPX1'],
                    'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|',
                                 '|Z8m|'],
                    'context_before': ['figures', 'of', 'the', 'Party', ',',
                                       'almost', 'on', 'a', 'level', 'with'],
                    'context_after': [',', 'and', 'then', 'had', 'engaged',
                                      'in', 'counter-revolu-', 'tionary',
                                      'activities', ','],
                    'entry_title': '1984', 'entry_author':
                    'george_orwell',
                    'cpos_start': 110490,
                    'cpos_end': 110492
                }
            )
        ])
        Keyword arguments:
        result_start_count -- start position of the dumped subcorpus.
        (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
        matches 50 to 100 will be shown.
        result_max_count -- defines how many matches at once will be shown.
        (default 50)
        context_len -- defines how many words before and after a match will be
        shown (default 10)
        result_len -- defines how many results are actually grabbed
        (default 1000)
        """
        self.context_len = context_len
-        self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word'])
+        self.corpus_max_len = self.cl_attribute_size(
                                   self.attr_strings['positional_attrs']['word']
                              )
        self.nr_matches = min(result_len, self.nr_matches)
        if self.nr_matches == 0:
-            print('Query resulted in 0 matches.')
+            logger.warning('Query resulted in 0 matches.')
            self.disconnect
            return None
        else:
-            if self.nr_matches <= 50:
+            # Get match cpos boundaries
-                matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
+            # match_boundaries shows the start and end cpos of one match as a
-                                                        0x10,
+            # pair of cpositions
            # [(1355, 1357), (1477, 1479)] Example for two boundary pairs
            match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus,
                                                           CONST_FIELD_MATCH,
                                                           0,
-                                                        self.nr_matches - 1)
+                                                           self.nr_matches - 1),
-                matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
+                                   self.cqp_dump_subcorpus(self.result_subcorpus,
-                                                      0x11,
+                                                           CONST_FIELD_MATCHEND,
-                                                      0, self.nr_matches - 1)
+                                                           0,
-            else:
+                                                           self.nr_matches - 1))
                matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                                        0x10,
                                                        result_start_count,
                                                        result_max_count - 1)
                matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                                      0x11,
                                                      result_start_count,
                                                      result_max_count - 1)
            match_indexes = zip(matches_start, matches_end)
-            matches = []
+        # Generate all cpos between boundaries including start and end boundaries
-            manager = multiprocessing.Manager()
+        # Save them as a list into one match entry at serial number 'i'
-            return_dict = manager.dict()
+        ordered_matches = collections.OrderedDict()
-            for i, index_pair in enumerate(match_indexes):
+        for i, match_pair in enumerate(match_boundaries):
-                match = multiprocessing.Process(target=self.__get_matches,
+            ordered_matches[i] = ({'match_cpos_list':
-                                                args=(i,
+                                   list(range(match_pair[0],
-                                                      index_pair,
+                                              match_pair[1] + 1))})
-                                                      self.corpus_name,
+        # Saves cpos from all match entries into one list
-                                                      return_dict))
+        all_cpos_list = []
-                matches.append(match)
+        for key in ordered_matches.keys():
-                match.start()
+            all_cpos_list += ordered_matches[key]['match_cpos_list']
            for match in matches:
                match.join()
            #  sort matches into ordered dict
            ordered_results = collections.OrderedDict()
            for key in sorted(return_dict.keys()):
                ordered_results[key] = return_dict[key]
            return ordered_results
-    def get_cpos_info(self, cpos, session):
+        # Saves all cpos from before and after context into the list:
-        match_dict = {}
+        # all_context_cpos_list
        all_context_cpos_list = []
        for key in ordered_matches.keys():
            cpos_list = ordered_matches[key]['match_cpos_list']
            before_index = max([0, cpos_list[0] - self.context_len])
            after_index = min([self.corpus_max_len,
                               cpos_list[-1] + self.context_len])
            ordered_matches[key]['context_before_cpos_list'] = list(range(before_index,
                                                                          cpos_list[0]))
            ordered_matches[key]['context_after_cpos_list'] = list(range(cpos_list[-1] + 1,
                                                                         after_index + 1))
            all_context_cpos_list += ordered_matches[key]['context_before_cpos_list']
            all_context_cpos_list += ordered_matches[key]['context_after_cpos_list']
        # Combines all_cpos_list with all_context_cpos_list as a sorted set
        all_cpos_list += all_context_cpos_list
        all_cpos_list = sorted(list(set(all_cpos_list)))
        # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
        # all cpos entries in all_cpos_list
        # Also saves these informations into the ordered_matches dict
        all_cpos_infos = self.get_cpos_infos(all_cpos_list)
        for key in ordered_matches.keys():
            # loops over cpos in cpos_list which holds all match cpos
            # Replaces one cpos with the corresponding cpos information created
            # by self.get_cpos_infos(all_cpos_list)
            cpos_list = ordered_matches[key]['match_cpos_list']
            infos = []
            for cpos in cpos_list:
                info = {cpos: all_cpos_infos.get(cpos)}
                infos.append(info)
            ordered_matches[key]['match_cpos_list'] = infos
            try:
                # loops over cpos in ordered_matches[key]['context_before_cpos_list']
                # which holds all cpos of the before context
                # Replaces one cpos with the corresponding cpos information created
                # by self.get_cpos_infos(all_cpos_list)
                before_context_infos = []
                for context_before_cpos in ordered_matches[key]['context_before_cpos_list']:
                    before_context_info = {context_before_cpos:
                                           all_cpos_infos.get(context_before_cpos)}
                    before_context_infos.append(before_context_info)
                ordered_matches[key]['context_before_cpos_list'] = before_context_infos
            except UnboundLocalError:
                logger.warning('Context before cpos list is empty.')
            try:
                # loops over cpos in ordered_matches[key]['context_after_cpos_list']
                # which holds all cpos of the after context
                # Replaces one cpos with the corresponding cpos information created
                # by self.get_cpos_infos(all_cpos_list)
                after_context_infos = []
                for context_after_cpos in ordered_matches[key]['context_after_cpos_list']:
                    after_context_info = {context_after_cpos:
                                          all_cpos_infos.get(context_after_cpos)}
                    after_context_infos.append(after_context_info)
                ordered_matches[key]['context_after_cpos_list'] = after_context_infos
            except UnboundLocalError:
                logger.warning('Context after cpos list is empty.')
        return ordered_matches
    def get_cpos_infos(self, all_cpos):
        '''
        Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
        all cpos entries specified in the parameter all_cpos.
        '''
        cpos_infos = {}
        for attr_dict in self.attr_strings:
            # print(self.attr_strings[attr_dict])
            if attr_dict == 'positional_attrs':
                for p_attr_key in self.attr_strings[attr_dict].keys():
-                    # print(p_attr_key)
+                    match_str = self.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key],
-                    match_str = session.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], range(cpos[0], cpos[1]))
+                                                 all_cpos)
-                    match_dict[p_attr_key] = match_str
+                    cpos_infos[p_attr_key] = match_str
            elif attr_dict == 'struct_attrs':
                for struct_attr_key in self.attr_strings[attr_dict].keys():
-                    # print(struct_attr_key)
+                    struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element],
-                    struct_entry = session.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element],
+                                                      all_cpos)
-                                                         range(cpos[0], cpos[1]))
+                    match_str = self.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry)
-                    match_str = session.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry)
+                    cpos_infos[struct_attr_key] = match_str
-                    match_dict[struct_attr_key] = set(match_str)
+        tmp_list = []
-        return match_dict
+        attr_key_list = []
-
+        for key in cpos_infos.keys():
-    def __get_matches(self, i, index_pair, corpus_name, return_dict):
+            tmp_list.append(cpos_infos[key])
-        """
+            attr_key_list.append(key)
-        Get matches as readable output
+        joined_cpos_infos = zip(all_cpos, *tmp_list)
-
+        dict_cpos_infos = {}
-        Gets the actual match strings of cpos match indexes. Private helper
+        for info in joined_cpos_infos:
-        method used in show_results.
+            dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:]))
-
+        return dict_cpos_infos
        Keyword arguments:
        i -- serial number for match at given cpos
        index_pair -- match start and match end cpos
        corpus_name -- name of the parent corpus
        return_dict -- dictionary created with manager.dict() that holds the
        extracted strings tags etc.
        """
        # print('START:', index_pair[0])
        # print('END:', index_pair[1])
        # print('=============================')
        index_pair = [index_pair[0], index_pair[1] + 1]
        tmp_session = CQiWrapper(username=self.username, password=self.password,
                                 host=self.host, port=self.port)
        tmp_session.connect()
        match = self.get_cpos_info(index_pair, tmp_session)
        before_index = max([0, index_pair[0] - self.context_len])
        after_index = min([self.corpus_max_len,
                           index_pair[1] + self.context_len])
        context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
                                                 range(before_index,
                                                       index_pair[0]))
        context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
                                                range(index_pair[1] + 1,
                                                      after_index + 1))
        tmp_dict = {'context_before': context_before,
                    'context_after': context_after,
                    'cpos_start': index_pair[0],
                    'cpos_end': index_pair[1]}
        match.update(tmp_dict)
        return_dict[i] = match
        tmp_session.disconnect()