Add new CQiWrapper

2026-06-20 20:05:43 +00:00 · 2019-11-18 14:24:13 +01:00
parent baf06d3106
commit 5fdd67ebf2
2 changed files with 153 additions and 134 deletions
@@ -1,4 +1,4 @@
-from . import CQi
+import CQi
 import socket
 import struct

@@ -1,6 +1,7 @@
-from .CQiClient import CQiClient
-import multiprocessing
+from CQiClient import CQiClient
+from CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
 import collections
+from app import logger  # only works if imported into opaque web app


 class CQiWrapper(CQiClient):
@@ -11,6 +12,8 @@ class CQiWrapper(CQiClient):
    for ease of use. Also structures recieved data into python dictionaries.

    Keyword arguments:
+    host -- host IP address or hostname where the cqp server is running
+    port -- port of the cqp server
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    """
@@ -32,12 +35,15 @@ class CQiWrapper(CQiClient):
        """
        self.ctrl_connect(self.username, self.password)

-    def create_attribute_strings(self):
+    def __create_attribute_strings(self):
+        """
+        Creates all needed attribute strings to query for word, lemma etc. in
+        the given corpus.
+        For example: CORPUS_NAME.word to query words
+        """
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
        self.meta_struct_element = struct_attrs[0]
-        print(p_attrs)
-        print(struct_attrs)
        self.attr_strings = {}
        self.attr_strings['positional_attrs'] = {}
        self.attr_strings['struct_attrs'] = {}
@@ -49,8 +55,17 @@ class CQiWrapper(CQiClient):
            self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
                                                              + '.'
                                                              + struct_attr)
-    def set_corpus_name(self, corpus_name):
+        logger.warning(('All positional and '
+                        'structural attributes: {}').format(self.attr_strings))
+
+    def select_corpus(self, corpus_name):
+        if corpus_name in self.corpus_list_coprora():
            self.corpus_name = corpus_name
+            self.__create_attribute_strings()
+            logger.warning('{} does exist.'.format(corpus_name))
+        else:
+            self.disconnect()
+            logger.warning('{} does not exist.'.format(corpus_name))

    def disconnect(self):
        """
@@ -60,8 +75,9 @@ class CQiWrapper(CQiClient):
        """
        self.ctrl_bye()
        self.connection.close()
+        logger.warning('Disconnected from cqp server.')

-    def query_subcorpus(self, result_subcorpus_name, query):
+    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
        """
        Create subcorpus

@@ -74,152 +90,155 @@ class CQiWrapper(CQiClient):
        query -- query written in cqp query language
        """
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
-        self.result_subcorpus_ns = (self.corpus_name
+        self.result_subcorpus = (self.corpus_name
                                 + ':'
                                 + result_subcorpus_name)
-        self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
-        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns)
-        print('Nr of all matches is:', self.nr_matches)
+        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
+        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
+        logger.warning('Nr of all matches is: {}'.format(self.nr_matches))

    def show_subcorpora(self):
+        """
+        Show all subcorpora currently saved by the cqp server.
+        """
        return self.cqp_list_subcorpora(self.corpus_name)

-    def show_results(self,
-                     result_start_count=0,
-                     result_max_count=50,
-                     context_len=10,):
+    def show_query_results(self,
+                           context_len=10,
+                           result_len=1000):
        """
        Show query results

        Shows the actual matched strings produce by the query. Uses the cpos
        match indexes to grab those strings. saves them into an orderd
-        dictionary. Also saves coresponding tags, lemmas and context:
-        OrderedDict([
-            (0,
-                {
-                    'tokens': ['Big', 'Brother', 'himself'],
-                    'lemmas': ['big', 'brother', 'himself'],
-                    'pos_tags': ['JJ', 'NN1', 'PPX1'],
-                    'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|',
-                                 '|Z8m|'],
-                    'context_before': ['figures', 'of', 'the', 'Party', ',',
-                                       'almost', 'on', 'a', 'level', 'with'],
-                    'context_after': [',', 'and', 'then', 'had', 'engaged',
-                                      'in', 'counter-revolu-', 'tionary',
-                                      'activities', ','],
-                    'entry_title': '1984', 'entry_author':
-                    'george_orwell',
-                    'cpos_start': 110490,
-                    'cpos_end': 110492
-                }
-            )
-        ])
+        dictionary. Also saves corresponding tags, lemmas and context. Gets this
+        information using the corresponding cpos.

        Keyword arguments:
-        result_start_count -- start position of the dumped subcorpus.
-        (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
-        matches 50 to 100 will be shown.
-        result_max_count -- defines how many matches at once will be shown.
-        (default 50)
        context_len -- defines how many words before and after a match will be
        shown (default 10)
+        result_len -- defines how many results are actually grabbed
        """
        self.context_len = context_len
-        self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word'])
+        self.corpus_max_len = self.cl_attribute_size(
+                                   self.attr_strings['positional_attrs']['word']
+                              )
+        self.nr_matches = min(result_len, self.nr_matches)
        if self.nr_matches == 0:
-            print('Query resulted in 0 matches.')
+            logger.warning('Query resulted in 0 matches.')
+            self.disconnect
+            return None
        else:
-            if self.nr_matches <= 50:
-                matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
-                                                        0x10,
+            # Get match cpos boundaries
+            # match_boundaries shows the start and end cpos of one match as a
+            # pair of corpus positions (cpos)
+            # [(1355, 1357), (1477, 1479)] Example for two boundary pairs
+            match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus,
+                                                           CONST_FIELD_MATCH,
                                                           0,
-                                                        self.nr_matches - 1)
-                matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
-                                                      0x11,
-                                                      0, self.nr_matches - 1)
-            else:
-                matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
-                                                        0x10,
-                                                        result_start_count,
-                                                        result_max_count - 1)
-                matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
-                                                      0x11,
-                                                      result_start_count,
-                                                      result_max_count - 1)
-            match_indexes = zip(matches_start, matches_end)
+                                                           self.nr_matches - 1),
+                                   self.cqp_dump_subcorpus(self.result_subcorpus,
+                                                           CONST_FIELD_MATCHEND,
+                                                           0,
+                                                           self.nr_matches - 1))

-            matches = []
-            manager = multiprocessing.Manager()
-            return_dict = manager.dict()
-            for i, index_pair in enumerate(match_indexes):
-                match = multiprocessing.Process(target=self.__get_matches,
-                                                args=(i,
-                                                      index_pair,
-                                                      self.corpus_name,
-                                                      return_dict))
-                matches.append(match)
-                match.start()
-            for match in matches:
-                match.join()
-            #  sort matches into ordered dict
-            ordered_results = collections.OrderedDict()
-            for key in sorted(return_dict.keys()):
-                ordered_results[key] = return_dict[key]
-            return ordered_results
+        # Generate all cpos between the boundaries, including both boundaries
+        # Save them as a list in one match entry at serial number 'i'
+        ordered_matches = collections.OrderedDict()
+        for i, match_pair in enumerate(match_boundaries):
+            ordered_matches[i] = ({'match_cpos_list':
+                                   list(range(match_pair[0],
+                                              match_pair[1] + 1))})
+        # Saves cpos from all match entries into one list
+        all_cpos_list = []
+        for key in ordered_matches.keys():
+            all_cpos_list += ordered_matches[key]['match_cpos_list']

-    def get_cpos_info(self, cpos, session):
-        match_dict = {}
+        # Saves all cpos from before and after context into the list:
+        # all_context_cpos_list
+        all_context_cpos_list = []
+        for key in ordered_matches.keys():
+            cpos_list = ordered_matches[key]['match_cpos_list']
+            before_index = max([0, cpos_list[0] - self.context_len])
+            after_index = min([self.corpus_max_len,
+                               cpos_list[-1] + self.context_len])
+            ordered_matches[key]['context_before_cpos_list'] = list(range(before_index,
+                                                                          cpos_list[0]))
+            ordered_matches[key]['context_after_cpos_list'] = list(range(cpos_list[-1] + 1,
+                                                                         after_index + 1))
+            all_context_cpos_list += ordered_matches[key]['context_before_cpos_list']
+            all_context_cpos_list += ordered_matches[key]['context_after_cpos_list']
+        # Combines all_cpos_list with all_context_cpos_list as a sorted set
+        all_cpos_list += all_context_cpos_list
+        all_cpos_list = sorted(list(set(all_cpos_list)))
+
+        # Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
+        # all cpos entries in all_cpos_list
+        # Also saves this information into the ordered_matches dict
+        all_cpos_infos = self.get_cpos_infos(all_cpos_list)
+        for key in ordered_matches.keys():
+            # loops over cpos in cpos_list which holds all match cpos
+            # Replaces one cpos with the corresponding cpos information created
+            # by self.get_cpos_infos(all_cpos_list)
+            cpos_list = ordered_matches[key]['match_cpos_list']
+            infos = []
+            for cpos in cpos_list:
+                info = {cpos: all_cpos_infos.get(cpos)}
+                infos.append(info)
+            ordered_matches[key]['match_cpos_list'] = infos
+            try:
+                # loops over cpos in ordered_matches[key]['context_before_cpos_list']
+                # which holds all cpos of the before context
+                # Replaces one cpos with the corresponding cpos information created
+                # by self.get_cpos_infos(all_cpos_list)
+                before_context_infos = []
+                for context_before_cpos in ordered_matches[key]['context_before_cpos_list']:
+                    before_context_info = {context_before_cpos:
+                                           all_cpos_infos.get(context_before_cpos)}
+                    before_context_infos.append(before_context_info)
+                ordered_matches[key]['context_before_cpos_list'] = before_context_infos
+            except UnboundLocalError:
+                logger.warning('Context before cpos list is empty.')
+            try:
+                # loops over cpos in ordered_matches[key]['context_after_cpos_list']
+                # which holds all cpos of the after context
+                # Replaces one cpos with the corresponding cpos information created
+                # by self.get_cpos_infos(all_cpos_list)
+                after_context_infos = []
+                for context_after_cpos in ordered_matches[key]['context_after_cpos_list']:
+                    after_context_info = {context_after_cpos:
+                                          all_cpos_infos.get(context_after_cpos)}
+                    after_context_infos.append(after_context_info)
+                ordered_matches[key]['context_after_cpos_list'] = after_context_infos
+            except UnboundLocalError:
+                logger.warning('Context after cpos list is empty.')
+        return ordered_matches
+
+    def get_cpos_infos(self, all_cpos):
+        '''
+        Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
+        all cpos entries specified in the parameter all_cpos.
+        '''
+        cpos_infos = {}
        for attr_dict in self.attr_strings:
-            # print(self.attr_strings[attr_dict])
            if attr_dict == 'positional_attrs':
                for p_attr_key in self.attr_strings[attr_dict].keys():
-                    # print(p_attr_key)
-                    match_str = session.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], range(cpos[0], cpos[1]))
-                    match_dict[p_attr_key] = match_str
+                    match_str = self.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key],
+                                                 all_cpos)
+                    cpos_infos[p_attr_key] = match_str
            elif attr_dict == 'struct_attrs':
                for struct_attr_key in self.attr_strings[attr_dict].keys():
-                    # print(struct_attr_key)
-                    struct_entry = session.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element],
-                                                         range(cpos[0], cpos[1]))
-                    match_str = session.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry)
-                    match_dict[struct_attr_key] = set(match_str)
-        return match_dict
-
-    def __get_matches(self, i, index_pair, corpus_name, return_dict):
-        """
-        Get matches as readable output
-
-        Gets the actual match strings of cpos match indexes. Private helper
-        method used in show_results.
-
-        Keyword arguments:
-        i -- serial number for match at given cpos
-        index_pair -- match start and match end cpos
-        corpus_name -- name of the parent corpus
-        return_dict -- dictionary created with manager.dict() that holds the
-        extracted strings tags etc.
-        """
-        # print('START:', index_pair[0])
-        # print('END:', index_pair[1])
-        # print('=============================')
-        index_pair = [index_pair[0], index_pair[1] + 1]
-        tmp_session = CQiWrapper(username=self.username, password=self.password,
-                                 host=self.host, port=self.port)
-        tmp_session.connect()
-        match = self.get_cpos_info(index_pair, tmp_session)
-        before_index = max([0, index_pair[0] - self.context_len])
-        after_index = min([self.corpus_max_len,
-                           index_pair[1] + self.context_len])
-        context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
-                                                 range(before_index,
-                                                       index_pair[0]))
-        context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
-                                                range(index_pair[1] + 1,
-                                                      after_index + 1))
-        tmp_dict = {'context_before': context_before,
-                    'context_after': context_after,
-                    'cpos_start': index_pair[0],
-                    'cpos_end': index_pair[1]}
-        match.update(tmp_dict)
-        return_dict[i] = match
-        tmp_session.disconnect()
+                    struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element],
+                                                      all_cpos)
+                    match_str = self.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry)
+                    cpos_infos[struct_attr_key] = match_str
+        tmp_list = []
+        attr_key_list = []
+        for key in cpos_infos.keys():
+            tmp_list.append(cpos_infos[key])
+            attr_key_list.append(key)
+        joined_cpos_infos = zip(all_cpos, *tmp_list)
+        dict_cpos_infos = {}
+        for info in joined_cpos_infos:
+            dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:]))
+        return dict_cpos_infos