From 3af400a732bd5defcbad79e3c9fb574bbb9f472a Mon Sep 17 00:00:00 2001
From: Stephan Porada <sporada@uni-bielefeld.de>
Date: Mon, 2 Dec 2019 14:19:40 +0100
Subject: [PATCH] Add get_sentences to wrapper

---
 app/corpora/CQiWrapper/CQiWrapper.py | 141 ++++++++++++++++++---------
 1 file changed, 93 insertions(+), 48 deletions(-)

diff --git a/app/corpora/CQiWrapper/CQiWrapper.py b/app/corpora/CQiWrapper/CQiWrapper.py
index c313bf05..c19cf84f 100644
--- a/app/corpora/CQiWrapper/CQiWrapper.py
+++ b/app/corpora/CQiWrapper/CQiWrapper.py
@@ -5,7 +5,7 @@ from app import logger  # only works if imported into opaque web app
 
 
 class CQiWrapper(CQiClient):
-    """
+    '''
     CQIiWrapper object
 
     High level wrapper that groups and renames some functions of CQiClient
@@ -16,7 +16,7 @@ class CQiWrapper(CQiClient):
     port -- port of the cqp server
     username -- username used to connect to the cqp server
     password -- password of the user to connect to the cqp server
-    """
+    '''
 
     SUBCORPUS_NAMES = []
 
@@ -27,20 +27,21 @@ class CQiWrapper(CQiClient):
         self.password = password
 
     def connect(self):
-        """
+        '''
         Connect with CQP server
 
         Connects via socket to the CQP server using the given username and
         password from class initiation.
-        """
+        '''
         self.ctrl_connect(self.username, self.password)
 
     def __create_attribute_strings(self):
-        """
+        '''
         Creates all needed attribute strings to query for word, lemma etc. in
         the given corpus.
         For example: CORPUS_NAME.word to query words
-        """
+        Automatically creates strings for all predefined tags.
+        '''
         p_attrs = self.corpus_positional_attributes(self.corpus_name)
         struct_attrs = self.corpus_structural_attributes(self.corpus_name)
         self.attr_strings = {}
@@ -54,40 +55,45 @@ class CQiWrapper(CQiClient):
             self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
                                                               + '.'
                                                               + struct_attr)
-        # logger.warning(('All positional and '
-                        # 'structural attributes: {}').format(self.attr_strings))
+        logger.warning(('All positional and '
+                        'structural attributes: {}').format(self.attr_strings))
 
     def select_corpus(self, corpus_name):
+        '''
+        Checks if the given corpus name exists. If it exists it is set as the
+        main corpus name used to create the needed query attribute strings
+        like CORPUS_NAME.word. Raises a ValueError if it does not exist.
+        '''
         if corpus_name in self.corpus_list_coprora():
             self.corpus_name = corpus_name
             self.__create_attribute_strings()
-            # logger.warning('{} does exist.'.format(corpus_name))
+            logger.warning('{} does exist.'.format(corpus_name))
         else:
-            # logger.warning('{} does not exist.'.format(corpus_name))
-            pass
+            logger.warning('{} does not exist.'.format(corpus_name))
+            raise ValueError('Given Corpus Name is not in corpora list.')
 
     def disconnect(self):
-        """
+        '''
         Disconnect from CQP server
 
         Disconnects from the CQP server. Closes used socket after disconnect.
-        """
+        '''
         self.ctrl_bye()
         self.connection.close()
-        # logger.warning('Disconnected from cqp server.')
+        logger.warning('Disconnected from cqp server.')
 
     def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
-        """
+        '''
         Create subcorpus
 
         Input query will be used to create a subcorpus holding all cpos match
         positions for that query.
 
         Keyword arguments:
-        result_subcorpus_name -- user set name of the subcorpus which holds all
+        result_subcorpus_name -- set name of the subcorpus which holds all
         cpos match positions, produced by the query
         query -- query written in cqp query language
-        """
+        '''
         self.cqp_query(self.corpus_name, result_subcorpus_name, query)
         self.result_subcorpus = (self.corpus_name
                                  + ':'
@@ -95,19 +101,19 @@ class CQiWrapper(CQiClient):
         self.SUBCORPUS_NAMES.append(self.result_subcorpus)
         self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
         print('Nr of all matches is:', self.nr_matches)
-        # logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
+        logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
 
     def show_subcorpora(self):
-        """
+        '''
         Show all subcorpora currently saved by the cqp server.
-        """
+        '''
         return self.cqp_list_subcorpora(self.corpus_name)
 
     def show_query_results(self,
                            context_len=10,
                            result_len=1000,
                            result_offset=0):
-        """
+        '''
         Show query results
 
         Shows the actual matched strings produce by the query. Uses the cpos
@@ -118,15 +124,20 @@ class CQiWrapper(CQiClient):
         Keyword arguments:
         context_len -- defines how many words before and after a match will be
         shown (default 10)
-        result_len -- defines how many results are actually grabbed
-        """
+        result_len -- defines for how many matches all information like lemma
+        and POS is being grabbed
+        result_offset -- defines the offset of the matches being requested. If
+        the offset is 100, information for matches 100 to result_len is being
+        grabbed
+        '''
+        t0 = time.time()
         self.context_len = context_len
         self.corpus_max_len = self.cl_attribute_size(
                                    self.attr_strings['positional_attrs']['word']
                               )
         self.nr_matches = min(result_len, self.nr_matches)
         if self.nr_matches == 0:
-            # logger.warning('Query resulted in 0 matches.')
+            logger.warning('Query resulted in 0 matches.')
             return None
         else:
             # Get match cpos boundries
@@ -144,7 +155,8 @@ class CQiWrapper(CQiClient):
                                                            offset_start,
                                                            offset_end))
 
-        # Generate all cpos between match boundries including start and end boundries.
+        # Generate all cpos between match boundries including start and end
+        # boundries.
         # Also generate cpos for left and right context.
         # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
         # Also collect all cpos together in one list for the final request of
@@ -157,42 +169,34 @@ class CQiWrapper(CQiClient):
             lc = {'lc': lc_cpos}
             match_cpos = list(range(start, end))
             match = {'hit': match_cpos}
-            rc_cpos = list(range(end, min([self.corpus_max_len, end + self.context_len])))
+            rc_cpos = list(range(end, min([self.corpus_max_len,
+                                           end + self.context_len])))
             rc = {'rc': rc_cpos}
             lc.update(match)
             lc.update(rc)
             all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
             all_matches.append(lc)
-        # print(all_matches)
-        # print(all_cpos)
 
-        # Get all cpos for all sneteces boundries
-        # s_lookup = {}
-        # for s_id in set(s_ids):
-        #     s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id)
-        #     # CHANGE to UTOPIEN.s will always be like this in nopaque
-        #     s_cpos = range(s_start, s_end)
-        #     s_lookup.update({s_id: list(s_cpos)})
-        #     # print(list(s_cpos))
-        #     all_cpos.extend(s_cpos)
-        t0 = time.time()
-        all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
+        all_cpos = list(set(all_cpos))  # get rid of cpos duplicates
+        len_all_cpos = len(all_cpos)
         t1 = time.time()
         t_total = t1 - t0
-        print('TIME FOR ALL CPOS:', t_total)
-        print('CPOS SUM:', len(all_cpos))
+        logger.warning('Time to create all CPOS for query: {}'.format(t_total))
+        print('Requesting {} CPOS with one query.'.format(len_all_cpos))
 
         # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
         # all cpos entries in all_cpos_list
         # Also saves these informations into self.results dict
-        t6 = time.time()
+        t2 = time.time()
         all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
-        t7 = time.time()
-        t_final = t7 - t6
-        print('GOT ALL RESULTS IN:', t_final)
-
-        self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos,
-                        'text_lookup': text_lookup}
+        t3 = time.time()
+        t_final = t3 - t2
+        print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
+                                                            t_final))
+        self.results = {'matches': all_matches,
+                        'cpos_lookup': all_cpos_infos,
+                        'text_lookup': text_lookup,
+                        'nr_matches': self.nr_matches}
         return self.results
 
     def get_cpos_infos(self, all_cpos):
@@ -250,3 +254,44 @@ class CQiWrapper(CQiClient):
         for info in joined_cpos_infos:
             dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
         return dict_cpos_infos, text_lookup
+
+    def get_sentences(self,
+                      match_cpos_list,
+                      get_surrounding_s=False,
+                      l_r_s_context_additional_len=1):
+        '''
+        Get sentence information for one match.
+
+        Looks up the sentences (CORPUS_NAME.s) holding the first and the last
+        CPOS of the match and, if requested, their surrounding sentences.
+
+        Keyword arguments:
+        match_cpos_list -- non empty list of the CPOS of one match; only the
+        first and the last entry are used
+        get_surrounding_s -- also grab surrounding sentences (default False)
+        l_r_s_context_additional_len -- number of sentences grabbed to the
+        left and to the right of the match sentences (default 1)
+
+        Returns the tuple (context_sentences, all_cpos_infos, text_lookup).
+        context_sentences maps every sentence id to the list of its CPOS.
+        '''
+        t0 = time.time()
+        s_attr = self.corpus_name + '.s'
+        first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
+        s_ids = list(self.cl_cpos2struc(s_attr, [first_cpos, last_cpos]))
+        if get_surrounding_s:
+            # Valid sentence ids are 0 to (number of sentences - 1).
+            max_s_id = self.cl_attribute_size(s_attr) - 1
+            lower = max(s_ids[0] - l_r_s_context_additional_len, 0)
+            upper = min(s_ids[-1] + l_r_s_context_additional_len, max_s_id)
+            s_ids.extend(range(lower, upper + 1))
+        context_sentences = {}
+        for s_id in s_ids:
+            if s_id not in context_sentences:  # request each sentence once
+                s_start, s_end = self.cl_struc2cpos(s_attr, s_id)
+                context_sentences[s_id] = list(range(s_start, s_end + 1))
+        all_cpos = list(set().union(*context_sentences.values()))
+        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
+        logger.warning('Got all sentences informations in {} seconds'
+                       .format(time.time() - t0))
+        return context_sentences, all_cpos_infos, text_lookup