mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-04 12:22:47 +00:00 
			
		
		
		
	Add get_sentences to wrapper
This commit is contained in:
		@@ -5,7 +5,7 @@ from app import logger  # only works if imported into opaque web app
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class CQiWrapper(CQiClient):
 | 
			
		||||
    """
 | 
			
		||||
    '''
 | 
			
		||||
    CQiWrapper object
 | 
			
		||||
 | 
			
		||||
    High level wrapper that groups and renames some functions of CQiClient
 | 
			
		||||
@@ -16,7 +16,7 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
    port -- port of the cqp server
 | 
			
		||||
    username -- username used to connect to the cqp server
 | 
			
		||||
    password -- password of the user to connect to the cqp server
 | 
			
		||||
    """
 | 
			
		||||
    '''
 | 
			
		||||
 | 
			
		||||
    SUBCORPUS_NAMES = []
 | 
			
		||||
 | 
			
		||||
@@ -27,20 +27,21 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
        self.password = password
 | 
			
		||||
 | 
			
		||||
    def connect(self):
        '''
        Open a session with the CQP server

        Passes the username and password stored on this instance to
        ctrl_connect.
        '''
        user, secret = self.username, self.password
        self.ctrl_connect(user, secret)
 | 
			
		||||
 | 
			
		||||
    def __create_attribute_strings(self):
 | 
			
		||||
        """
 | 
			
		||||
        '''
 | 
			
		||||
        Creates all needed attribute strings to query for word, lemma etc. in
 | 
			
		||||
        the given corpus.
 | 
			
		||||
        For example: CORPUS_NAME.word to query words
 | 
			
		||||
        """
 | 
			
		||||
        Automatically creates strings for all predefined tags.
 | 
			
		||||
        '''
 | 
			
		||||
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
 | 
			
		||||
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
 | 
			
		||||
        self.attr_strings = {}
 | 
			
		||||
@@ -54,40 +55,45 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
            self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
 | 
			
		||||
                                                              + '.'
 | 
			
		||||
                                                              + struct_attr)
 | 
			
		||||
        # logger.warning(('All positional and '
 | 
			
		||||
                        # 'structural attributes: {}').format(self.attr_strings))
 | 
			
		||||
        logger.warning(('All positional and '
 | 
			
		||||
                        'structural attributes: {}').format(self.attr_strings))
 | 
			
		||||
 | 
			
		||||
    def select_corpus(self, corpus_name):
        '''
        Make corpus_name the active corpus of this wrapper

        If the name is listed by corpus_list_coprora() it is stored as
        self.corpus_name and the query attribute strings (for example
        CORPUS_NAME.word) are created for it.

        Keyword arguments:
        corpus_name -- name of the corpus to select

        Raises Exception if the name is not in the corpora list.
        '''
        known_corpora = self.corpus_list_coprora()
        # Guard clause: reject unknown names before touching any state.
        if corpus_name not in known_corpora:
            logger.warning('{} does not exist.'.format(corpus_name))
            raise Exception('Given Corpus Name is not in corpora list.')
        self.corpus_name = corpus_name
        self.__create_attribute_strings()
        logger.warning('{} does exist.'.format(corpus_name))
 | 
			
		||||
 | 
			
		||||
    def disconnect(self):
        '''
        Disconnect from CQP server

        Sends ctrl_bye to the server and closes self.connection afterwards.
        The connection is closed in a finally block so that it is released
        even when ctrl_bye raises; the exception is still propagated to the
        caller in that case and no disconnect message is logged.
        '''
        try:
            self.ctrl_bye()
        finally:
            # Always release the connection, also on a failed goodbye.
            self.connection.close()
        logger.warning('Disconnected from cqp server.')
 | 
			
		||||
 | 
			
		||||
    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
 | 
			
		||||
        """
 | 
			
		||||
        '''
 | 
			
		||||
        Create subcorpus
 | 
			
		||||
 | 
			
		||||
        Input query will be used to create a subcorpus holding all cpos match
 | 
			
		||||
        positions for that query.
 | 
			
		||||
 | 
			
		||||
        Keyword arguments:
 | 
			
		||||
        result_subcorpus_name -- user set name of the subcorpus which holds all
 | 
			
		||||
        result_subcorpus_name -- set name of the subcorpus which holds all
 | 
			
		||||
        cpos match positions, produced by the query
 | 
			
		||||
        query -- query written in cqp query language
 | 
			
		||||
        """
 | 
			
		||||
        '''
 | 
			
		||||
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
 | 
			
		||||
        self.result_subcorpus = (self.corpus_name
 | 
			
		||||
                                 + ':'
 | 
			
		||||
@@ -95,19 +101,19 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
 | 
			
		||||
        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
 | 
			
		||||
        print('Nr of all matches is:', self.nr_matches)
 | 
			
		||||
        # logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
 | 
			
		||||
        logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
 | 
			
		||||
 | 
			
		||||
    def show_subcorpora(self):
        '''
        Return what cqp_list_subcorpora reports for the selected corpus
        (self.corpus_name).
        '''
        subcorpora = self.cqp_list_subcorpora(self.corpus_name)
        return subcorpora
 | 
			
		||||
 | 
			
		||||
    def show_query_results(self,
 | 
			
		||||
                           context_len=10,
 | 
			
		||||
                           result_len=1000,
 | 
			
		||||
                           result_offset=0):
 | 
			
		||||
        """
 | 
			
		||||
        '''
 | 
			
		||||
        Show query results
 | 
			
		||||
 | 
			
		||||
        Shows the actual matched strings produced by the query. Uses the cpos
 | 
			
		||||
@@ -118,15 +124,20 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
        Keyword arguments:
 | 
			
		||||
        context_len -- defines how many words before and after a match will be
 | 
			
		||||
        shown (default 10)
 | 
			
		||||
        result_len -- defines how many results are actually grabbed
 | 
			
		||||
        """
 | 
			
		||||
        result_len -- defines for how many matches all informations like lemma
 | 
			
		||||
        and POS are being grabbed
 | 
			
		||||
        result_offset -- defines the offset of the matches being requested. If
 | 
			
		||||
        the offset is 100 informations for matches 100 to result_len are being
 | 
			
		||||
        grabbed
 | 
			
		||||
        '''
 | 
			
		||||
        t0 = time.time()
 | 
			
		||||
        self.context_len = context_len
 | 
			
		||||
        self.corpus_max_len = self.cl_attribute_size(
 | 
			
		||||
                                   self.attr_strings['positional_attrs']['word']
 | 
			
		||||
                              )
 | 
			
		||||
        self.nr_matches = min(result_len, self.nr_matches)
 | 
			
		||||
        if self.nr_matches == 0:
 | 
			
		||||
            # logger.warning('Query resulted in 0 matches.')
 | 
			
		||||
            logger.warning('Query resulted in 0 matches.')
 | 
			
		||||
            return None
 | 
			
		||||
        else:
 | 
			
		||||
            # Get match cpos boundaries
 | 
			
		||||
@@ -144,7 +155,8 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
                                                           offset_start,
 | 
			
		||||
                                                           offset_end))
 | 
			
		||||
 | 
			
		||||
        # Generate all cpos between match boundries including start and end boundries.
 | 
			
		||||
        # Generate all cpos between match boundaries including start and end
 | 
			
		||||
        # boundaries.
 | 
			
		||||
        # Also generate cpos for left and right context.
 | 
			
		||||
        # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
 | 
			
		||||
        # Also collect all cpos together in one list for the final request of
 | 
			
		||||
@@ -157,42 +169,34 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
            lc = {'lc': lc_cpos}
 | 
			
		||||
            match_cpos = list(range(start, end))
 | 
			
		||||
            match = {'hit': match_cpos}
 | 
			
		||||
            rc_cpos = list(range(end, min([self.corpus_max_len, end + self.context_len])))
 | 
			
		||||
            rc_cpos = list(range(end, min([self.corpus_max_len,
 | 
			
		||||
                                           end + self.context_len])))
 | 
			
		||||
            rc = {'rc': rc_cpos}
 | 
			
		||||
            lc.update(match)
 | 
			
		||||
            lc.update(rc)
 | 
			
		||||
            all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
 | 
			
		||||
            all_matches.append(lc)
 | 
			
		||||
        # print(all_matches)
 | 
			
		||||
        # print(all_cpos)
 | 
			
		||||
 | 
			
		||||
        # Get all cpos for all sneteces boundries
 | 
			
		||||
        # s_lookup = {}
 | 
			
		||||
        # for s_id in set(s_ids):
 | 
			
		||||
        #     s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id)
 | 
			
		||||
        #     # CHANGE to UTOPIEN.s will always be like this in nopaque
 | 
			
		||||
        #     s_cpos = range(s_start, s_end)
 | 
			
		||||
        #     s_lookup.update({s_id: list(s_cpos)})
 | 
			
		||||
        #     # print(list(s_cpos))
 | 
			
		||||
        #     all_cpos.extend(s_cpos)
 | 
			
		||||
        t0 = time.time()
 | 
			
		||||
        all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
 | 
			
		||||
        all_cpos = list(set(all_cpos))  # get rid of cpos duplicates
 | 
			
		||||
        len_all_cpos = len(all_cpos)
 | 
			
		||||
        t1 = time.time()
 | 
			
		||||
        t_total = t1 - t0
 | 
			
		||||
        print('TIME FOR ALL CPOS:', t_total)
 | 
			
		||||
        print('CPOS SUM:', len(all_cpos))
 | 
			
		||||
        logger.warning('Time to create all CPOS for query: {}'.format(t_total))
 | 
			
		||||
        print('Requesting {} CPOS with one query.'.format(len_all_cpos))
 | 
			
		||||
 | 
			
		||||
        # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
 | 
			
		||||
        # all cpos entries in all_cpos_list
 | 
			
		||||
        # Also saves these informations into self.results dict
 | 
			
		||||
        t6 = time.time()
 | 
			
		||||
        t2 = time.time()
 | 
			
		||||
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
 | 
			
		||||
        t7 = time.time()
 | 
			
		||||
        t_final = t7 - t6
 | 
			
		||||
        print('GOT ALL RESULTS IN:', t_final)
 | 
			
		||||
 | 
			
		||||
        self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos,
 | 
			
		||||
                        'text_lookup': text_lookup}
 | 
			
		||||
        t3 = time.time()
 | 
			
		||||
        t_final = t3 - t2
 | 
			
		||||
        print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
 | 
			
		||||
                                                            t_final))
 | 
			
		||||
        self.results = {'matches': all_matches,
 | 
			
		||||
                        'cpos_lookup': all_cpos_infos,
 | 
			
		||||
                        'text_lookup': text_lookup,
 | 
			
		||||
                        'nr_matches': self.nr_matches}
 | 
			
		||||
        return self.results
 | 
			
		||||
 | 
			
		||||
    def get_cpos_infos(self, all_cpos):
 | 
			
		||||
@@ -250,3 +254,44 @@ class CQiWrapper(CQiClient):
 | 
			
		||||
        for info in joined_cpos_infos:
 | 
			
		||||
            dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
 | 
			
		||||
        return dict_cpos_infos, text_lookup
 | 
			
		||||
 | 
			
		||||
    def get_sentences(self,
                      match_cpos_list,
                      get_surrounding_s=False,
                      l_r_s_context_additional_len=1):
        '''
        Get sentence informations for one match

        Looks up the sentences (structural attribute CORPUS_NAME.s) that hold
        the first and the last CPOS of the match. Optionally the sentences
        left and right of them are grabbed as additional context.

        Keyword arguments:
        match_cpos_list -- non-empty list of the CPOS of one match; only its
        first and last entry are used
        get_surrounding_s -- if True, context sentences around the match
        sentences are grabbed too (default False)
        l_r_s_context_additional_len -- how many sentences before the first
        and after the last match sentence are grabbed (default 1)

        Returns the tuple (context_sentences, all_cpos_infos, text_lookup).
        context_sentences maps every sentence id to the list of all CPOS of
        that sentence. all_cpos_infos and text_lookup are the two values
        returned by get_cpos_infos for all collected CPOS.
        '''
        t0 = time.time()
        s_attr = self.corpus_name + '.s'
        first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
        s_ids = self.cl_cpos2struc(s_attr, [first_cpos, last_cpos])
        # NOTE(review): the CQi specification returns -1 for a CPOS that is
        # not inside any region; such ids are passed on unchanged here, as
        # before -- confirm whether callers can hit that case.
        wanted_s_ids = list(s_ids)
        if get_surrounding_s:
            # cl_attribute_size of a structural attribute is the number of
            # regions (CQi specification), so the last valid sentence id is
            # one less than the size. Using the size itself as the upper
            # limit would request a sentence that does not exist.
            last_s_id = self.cl_attribute_size(s_attr) - 1
            lower_s_id = max(s_ids[0] - l_r_s_context_additional_len, 0)
            upper_s_id = min(s_ids[-1] + l_r_s_context_additional_len,
                             last_s_id)
            wanted_s_ids.extend(range(lower_s_id, upper_s_id + 1))
        context_sentences = {}
        for s_id in wanted_s_ids:
            if s_id in context_sentences:
                # First and last CPOS often share one sentence and the
                # context range includes the match sentences again; every
                # sentence is requested from the server only once.
                continue
            s_start, s_end = self.cl_struc2cpos(s_attr, s_id)
            # The end boundary is part of the sentence, hence the + 1.
            context_sentences[s_id] = list(range(s_start, s_end + 1))
        # One flat, duplicate free list of CPOS for a single info request.
        all_cpos = list({cpos
                         for s_cpos in context_sentences.values()
                         for cpos in s_cpos})
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t_total = time.time() - t0
        logger.warning(('Got all sentences informations in {} '
                        'seconds').format(t_total))
        return context_sentences, all_cpos_infos, text_lookup
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user