Merge branch 'development' of gitlab.ub.uni-bielefeld.de:sfb1288inf/opaque into development

2026-07-28 11:43:55 +00:00 · 2019-12-02 14:24:37 +01:00
parent 8bf71dea47 d2334e6a1e
commit f9f6857e4e
1 changed files with 93 additions and 48 deletions
@@ -5,7 +5,7 @@ from app import logger  # only works if imported into opaque web app
 class CQiWrapper(CQiClient):
-    """
+    '''
    CQiWrapper object
    High level wrapper that groups and renames some functions of CQiClient
@@ -16,7 +16,7 @@ class CQiWrapper(CQiClient):
    port -- port of the cqp server
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
-    """
+    '''
    SUBCORPUS_NAMES = []
@@ -27,20 +27,21 @@ class CQiWrapper(CQiClient):
        self.password = password
    def connect(self):
-        """
+        '''
        Connect with CQP server
        Connects via socket to the CQP server using the given username and
        password from class initiation.
-        """
+        '''
        self.ctrl_connect(self.username, self.password)
    def __create_attribute_strings(self):
-        """
+        '''
        Creates all needed attribute strings to query for word, lemma etc. in
        the given corpus.
        For example: CORPUS_NAME.word to query words
-        """
+        Automatically creates strings for all predefined tags.
        '''
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
        self.attr_strings = {}
@@ -54,40 +55,45 @@ class CQiWrapper(CQiClient):
            self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
                                                              + '.'
                                                              + struct_attr)
-        # logger.warning(('All positional and '
+        logger.warning(('All positional and '
-                        # 'structural attributes: {}').format(self.attr_strings))
+                        'structural attributes: {}').format(self.attr_strings))
    def select_corpus(self, corpus_name):
        '''
        Checks if given corpus name exists. If it exists set it as the main
        corpus name used to create the needed query attribute strings like
        CORPUS_NAME.word.
        '''
        if corpus_name in self.corpus_list_coprora():
            self.corpus_name = corpus_name
            self.__create_attribute_strings()
-            # logger.warning('{} does exist.'.format(corpus_name))
+            logger.warning('{} does exist.'.format(corpus_name))
        else:
-            # logger.warning('{} does not exist.'.format(corpus_name))
+            logger.warning('{} does not exist.'.format(corpus_name))
-            pass
+            raise Exception('Given Corpus Name is not in corpora list.')
    def disconnect(self):
-        """
+        '''
        Disconnect from CQP server
        Disconnects from the CQP server. Closes used socket after disconnect.
-        """
+        '''
        self.ctrl_bye()
        self.connection.close()
-        # logger.warning('Disconnected from cqp server.')
+        logger.warning('Disconnected from cqp server.')
    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
-        """
+        '''
        Create subcorpus
        Input query will be used to create a subcorpus holding all cpos match
        positions for that query.
        Keyword arguments:
-        result_subcorpus_name -- user set name of the subcorpus which holds all
+        result_subcorpus_name -- set name of the subcorpus which holds all
        cpos match positions, produced by the query
        query -- query written in cqp query language
-        """
+        '''
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
        self.result_subcorpus = (self.corpus_name
                                 + ':'
@@ -95,19 +101,19 @@ class CQiWrapper(CQiClient):
        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
        print('Nr of all matches is:', self.nr_matches)
-        # logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
+        logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
    def show_subcorpora(self):
-        """
+        '''
        Show all subcorpora currently saved by the cqp server.
-        """
+        '''
        return self.cqp_list_subcorpora(self.corpus_name)
    def show_query_results(self,
                           context_len=10,
                           result_len=1000,
                           result_offset=0):
-        """
+        '''
        Show query results
        Shows the actual matched strings produced by the query. Uses the cpos
@@ -118,15 +124,20 @@ class CQiWrapper(CQiClient):
        Keyword arguments:
        context_len -- defines how many words before and after a match will be
        shown (default 10)
-        result_len -- defines how many results are actually grabbed
+        result_len -- defines for how many matches all informations like lemma
-        """
+        and POS are being grabbed
        result_offset -- defines the offset of the matches being requested. If
        the offset is 100 informations for matches 100 to result_len are being
        grabbed
        '''
        t0 = time.time()
        self.context_len = context_len
        self.corpus_max_len = self.cl_attribute_size(
                                   self.attr_strings['positional_attrs']['word']
                              )
        self.nr_matches = min(result_len, self.nr_matches)
        if self.nr_matches == 0:
-            # logger.warning('Query resulted in 0 matches.')
+            logger.warning('Query resulted in 0 matches.')
            return None
        else:
            # Get match cpos boundaries
@@ -144,7 +155,8 @@ class CQiWrapper(CQiClient):
                                                           offset_start,
                                                           offset_end))
-        # Generate all cpos between match boundries including start and end boundries.
+        # Generate all cpos between match boundaries including start and end
        # boundaries.
        # Also generate cpos for left and right context.
        # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
        # Also collect all cpos together in one list for the final request of
@@ -157,42 +169,34 @@ class CQiWrapper(CQiClient):
            lc = {'lc': lc_cpos}
            match_cpos = list(range(start, end))
            match = {'hit': match_cpos}
-            rc_cpos = list(range(end, min([self.corpus_max_len, end + self.context_len])))
+            rc_cpos = list(range(end, min([self.corpus_max_len,
                                           end + self.context_len])))
            rc = {'rc': rc_cpos}
            lc.update(match)
            lc.update(rc)
            all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
            all_matches.append(lc)
        # print(all_matches)
        # print(all_cpos)
        # Get all cpos for all sentence boundaries
        # s_lookup = {}
        # for s_id in set(s_ids):
        #     s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id)
        #     # CHANGE to UTOPIEN.s will always be like this in nopaque
        #     s_cpos = range(s_start, s_end)
        #     s_lookup.update({s_id: list(s_cpos)})
        #     # print(list(s_cpos))
        #     all_cpos.extend(s_cpos)
        t0 = time.time()
        all_cpos = list(set(all_cpos))  # get rid of cpos duplicates
        len_all_cpos = len(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
-        print('TIME FOR ALL CPOS:', t_total)
+        logger.warning('Time to create all CPOS for query: {}'.format(t_total))
-        print('CPOS SUM:', len(all_cpos))
+        print('Requesting {} CPOS with one query.'.format(len_all_cpos))
        # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
        # all cpos entries in all_cpos_list
        # Also saves these informations into self.results dict
-        t6 = time.time()
+        t2 = time.time()
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
-        t7 = time.time()
+        t3 = time.time()
-        t_final = t7 - t6
+        t_final = t3 - t2
-        print('GOT ALL RESULTS IN:', t_final)
+        print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
-
+                                                            t_final))
-        self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos,
+        self.results = {'matches': all_matches,
-                        'text_lookup': text_lookup}
+                        'cpos_lookup': all_cpos_infos,
                        'text_lookup': text_lookup,
                        'nr_matches': self.nr_matches}
        return self.results
    def get_cpos_infos(self, all_cpos):
@@ -250,3 +254,44 @@ class CQiWrapper(CQiClient):
        for info in joined_cpos_infos:
            dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
        return dict_cpos_infos, text_lookup
    def get_sentences(self,
                      match_cpos_list,
                      get_surrounding_s=False,
                      l_r_s_context_additional_len=1):
        '''
        Get sentence informations for one match

        Looks up the sentences (structural attribute CORPUS_NAME.s) that hold
        the first and the last CPOS of the given match and collects every CPOS
        of those sentences. Optionally the sentences to the left and right of
        the match are collected as well.

        Keyword arguments:
        match_cpos_list -- list of the CPOS of one match. Only the first and
        the last entry are used. Must not be empty, otherwise an IndexError is
        raised.
        get_surrounding_s -- if True also grab the sentences surrounding the
        match sentences (default False)
        l_r_s_context_additional_len -- number of additional sentences grabbed
        on each side of the match sentences if get_surrounding_s is True
        (default 1)

        Returns a tuple (context_sentences, all_cpos_infos, text_lookup):
        context_sentences -- dict mapping a sentence id to the list of all
        CPOS of that sentence
        all_cpos_infos, text_lookup -- results of self.get_cpos_infos for all
        collected CPOS (duplicates removed)
        '''
        t0 = time.time()
        s_attr = self.corpus_name + '.s'
        first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
        # NOTE(review): presumably the server reports a negative id for a CPOS
        # that is not inside any sentence -- confirm and handle if needed.
        s_ids = self.cl_cpos2struc(s_attr, [first_cpos, last_cpos])
        requested_s_ids = list(s_ids)
        if get_surrounding_s:
            # cl_attribute_size gives the number of sentence regions. Ids are
            # zero based, so the highest valid sentence id is one less.
            last_s_id = self.cl_attribute_size(s_attr) - 1
            window_start = max(s_ids[0] - l_r_s_context_additional_len, 0)
            window_end = min(s_ids[-1] + l_r_s_context_additional_len,
                             last_s_id)
            requested_s_ids.extend(range(window_start, window_end + 1))
        context_sentences = {}
        for s_id in requested_s_ids:
            if s_id in context_sentences:
                # Sentence boundaries are already known, do not ask the server
                # a second time.
                continue
            s_start, s_end = self.cl_struc2cpos(s_attr, s_id)
            # s_end is treated as inclusive, hence the + 1
            context_sentences[s_id] = list(range(s_start, s_end + 1))
        # Collect every CPOS once so each one is only requested a single time
        all_cpos = list({cpos
                         for s_cpos in context_sentences.values()
                         for cpos in s_cpos})
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
        logger.warning(
            'Got all sentences informations in {} seconds'.format(t_total)
        )
        return context_sentences, all_cpos_infos, text_lookup