mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-04 12:22:47 +00:00 
			
		
		
		
	Merge branch 'development' of gitlab.ub.uni-bielefeld.de:sfb1288inf/opaque into development
This commit is contained in:
		@@ -5,7 +5,7 @@ from app import logger  # only works if imported into opaque web app
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class CQiWrapper(CQiClient):
 | 
					class CQiWrapper(CQiClient):
 | 
				
			||||||
    """
 | 
					    '''
 | 
				
			||||||
    CQIiWrapper object
 | 
					    CQIiWrapper object
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    High level wrapper that groups and renames some functions of CQiClient
 | 
					    High level wrapper that groups and renames some functions of CQiClient
 | 
				
			||||||
@@ -16,7 +16,7 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
    port -- port of the cqp server
 | 
					    port -- port of the cqp server
 | 
				
			||||||
    username -- username used to connect to the cqp server
 | 
					    username -- username used to connect to the cqp server
 | 
				
			||||||
    password -- password of the user to connect to the cqp server
 | 
					    password -- password of the user to connect to the cqp server
 | 
				
			||||||
    """
 | 
					    '''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    SUBCORPUS_NAMES = []
 | 
					    SUBCORPUS_NAMES = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -27,20 +27,21 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
        self.password = password
 | 
					        self.password = password
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def connect(self):
 | 
					    def connect(self):
 | 
				
			||||||
        """
 | 
					        '''
 | 
				
			||||||
        Connect with CQP server
 | 
					        Connect with CQP server
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Connects via socket to the CQP server using the given username and
 | 
					        Connects via socket to the CQP server using the given username and
 | 
				
			||||||
        password from class initiation.
 | 
					        password from class initiation.
 | 
				
			||||||
        """
 | 
					        '''
 | 
				
			||||||
        self.ctrl_connect(self.username, self.password)
 | 
					        self.ctrl_connect(self.username, self.password)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __create_attribute_strings(self):
 | 
					    def __create_attribute_strings(self):
 | 
				
			||||||
        """
 | 
					        '''
 | 
				
			||||||
        Creates all needed attribute strings to query for word, lemma etc. in
 | 
					        Creates all needed attribute strings to query for word, lemma etc. in
 | 
				
			||||||
        the given corpus.
 | 
					        the given corpus.
 | 
				
			||||||
        For example: CORPUS_NAME.word to query words
 | 
					        For example: CORPUS_NAME.word to query words
 | 
				
			||||||
        """
 | 
					        Automaticalle creates strings for all pre defined tags.
 | 
				
			||||||
 | 
					        '''
 | 
				
			||||||
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
 | 
					        p_attrs = self.corpus_positional_attributes(self.corpus_name)
 | 
				
			||||||
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
 | 
					        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
 | 
				
			||||||
        self.attr_strings = {}
 | 
					        self.attr_strings = {}
 | 
				
			||||||
@@ -54,40 +55,45 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
            self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
 | 
					            self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
 | 
				
			||||||
                                                              + '.'
 | 
					                                                              + '.'
 | 
				
			||||||
                                                              + struct_attr)
 | 
					                                                              + struct_attr)
 | 
				
			||||||
        # logger.warning(('All positional and '
 | 
					        logger.warning(('All positional and '
 | 
				
			||||||
                        # 'structural attributes: {}').format(self.attr_strings))
 | 
					                        'structural attributes: {}').format(self.attr_strings))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def select_corpus(self, corpus_name):
 | 
					    def select_corpus(self, corpus_name):
 | 
				
			||||||
 | 
					        '''
 | 
				
			||||||
 | 
					        Checks if given copus name exists. If it exists set it as the main
 | 
				
			||||||
 | 
					        corpus name used to create the needed query attribute strings like
 | 
				
			||||||
 | 
					        CORPUS_NAME.word.
 | 
				
			||||||
 | 
					        '''
 | 
				
			||||||
        if corpus_name in self.corpus_list_coprora():
 | 
					        if corpus_name in self.corpus_list_coprora():
 | 
				
			||||||
            self.corpus_name = corpus_name
 | 
					            self.corpus_name = corpus_name
 | 
				
			||||||
            self.__create_attribute_strings()
 | 
					            self.__create_attribute_strings()
 | 
				
			||||||
            # logger.warning('{} does exist.'.format(corpus_name))
 | 
					            logger.warning('{} does exist.'.format(corpus_name))
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            # logger.warning('{} does not exist.'.format(corpus_name))
 | 
					            logger.warning('{} does not exist.'.format(corpus_name))
 | 
				
			||||||
            pass
 | 
					            raise Exception('Given Corpus Name is not in corpora list.')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def disconnect(self):
 | 
					    def disconnect(self):
 | 
				
			||||||
        """
 | 
					        '''
 | 
				
			||||||
        Disconnect from CQP server
 | 
					        Disconnect from CQP server
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Disconnects from the CQP server. Closes used socket after disconnect.
 | 
					        Disconnects from the CQP server. Closes used socket after disconnect.
 | 
				
			||||||
        """
 | 
					        '''
 | 
				
			||||||
        self.ctrl_bye()
 | 
					        self.ctrl_bye()
 | 
				
			||||||
        self.connection.close()
 | 
					        self.connection.close()
 | 
				
			||||||
        # logger.warning('Disconnected from cqp server.')
 | 
					        logger.warning('Disconnected from cqp server.')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
 | 
					    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
 | 
				
			||||||
        """
 | 
					        '''
 | 
				
			||||||
        Create subcorpus
 | 
					        Create subcorpus
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Input query will be used to create a subcorpus holding all cpos match
 | 
					        Input query will be used to create a subcorpus holding all cpos match
 | 
				
			||||||
        positions for that query.
 | 
					        positions for that query.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Keyword arguments:
 | 
					        Keyword arguments:
 | 
				
			||||||
        result_subcorpus_name -- user set name of the subcorpus which holds all
 | 
					        result_subcorpus_name -- set name of the subcorpus which holds all
 | 
				
			||||||
        cpos match positions, produced by the query
 | 
					        cpos match positions, produced by the query
 | 
				
			||||||
        query -- query written in cqp query language
 | 
					        query -- query written in cqp query language
 | 
				
			||||||
        """
 | 
					        '''
 | 
				
			||||||
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
 | 
					        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
 | 
				
			||||||
        self.result_subcorpus = (self.corpus_name
 | 
					        self.result_subcorpus = (self.corpus_name
 | 
				
			||||||
                                 + ':'
 | 
					                                 + ':'
 | 
				
			||||||
@@ -95,19 +101,19 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
 | 
					        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
 | 
				
			||||||
        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
 | 
					        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
 | 
				
			||||||
        print('Nr of all matches is:', self.nr_matches)
 | 
					        print('Nr of all matches is:', self.nr_matches)
 | 
				
			||||||
        # logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
 | 
					        logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def show_subcorpora(self):
 | 
					    def show_subcorpora(self):
 | 
				
			||||||
        """
 | 
					        '''
 | 
				
			||||||
        Show all subcorpora currently saved by the cqp server.
 | 
					        Show all subcorpora currently saved by the cqp server.
 | 
				
			||||||
        """
 | 
					        '''
 | 
				
			||||||
        return self.cqp_list_subcorpora(self.corpus_name)
 | 
					        return self.cqp_list_subcorpora(self.corpus_name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def show_query_results(self,
 | 
					    def show_query_results(self,
 | 
				
			||||||
                           context_len=10,
 | 
					                           context_len=10,
 | 
				
			||||||
                           result_len=1000,
 | 
					                           result_len=1000,
 | 
				
			||||||
                           result_offset=0):
 | 
					                           result_offset=0):
 | 
				
			||||||
        """
 | 
					        '''
 | 
				
			||||||
        Show query results
 | 
					        Show query results
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Shows the actual matched strings produce by the query. Uses the cpos
 | 
					        Shows the actual matched strings produce by the query. Uses the cpos
 | 
				
			||||||
@@ -118,15 +124,20 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
        Keyword arguments:
 | 
					        Keyword arguments:
 | 
				
			||||||
        context_len -- defines how many words before and after a match will be
 | 
					        context_len -- defines how many words before and after a match will be
 | 
				
			||||||
        shown (default 10)
 | 
					        shown (default 10)
 | 
				
			||||||
        result_len -- defines how many results are actually grabbed
 | 
					        result_len -- defines for how many matches all informations like lemma
 | 
				
			||||||
        """
 | 
					        and POS are being grabbed
 | 
				
			||||||
 | 
					        result_offset -- defines the offset of the matches being requested. If
 | 
				
			||||||
 | 
					        the offset is 100 informations for matches 100 to result_len are being
 | 
				
			||||||
 | 
					        grabbed
 | 
				
			||||||
 | 
					        '''
 | 
				
			||||||
 | 
					        t0 = time.time()
 | 
				
			||||||
        self.context_len = context_len
 | 
					        self.context_len = context_len
 | 
				
			||||||
        self.corpus_max_len = self.cl_attribute_size(
 | 
					        self.corpus_max_len = self.cl_attribute_size(
 | 
				
			||||||
                                   self.attr_strings['positional_attrs']['word']
 | 
					                                   self.attr_strings['positional_attrs']['word']
 | 
				
			||||||
                              )
 | 
					                              )
 | 
				
			||||||
        self.nr_matches = min(result_len, self.nr_matches)
 | 
					        self.nr_matches = min(result_len, self.nr_matches)
 | 
				
			||||||
        if self.nr_matches == 0:
 | 
					        if self.nr_matches == 0:
 | 
				
			||||||
            # logger.warning('Query resulted in 0 matches.')
 | 
					            logger.warning('Query resulted in 0 matches.')
 | 
				
			||||||
            return None
 | 
					            return None
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            # Get match cpos boundries
 | 
					            # Get match cpos boundries
 | 
				
			||||||
@@ -144,7 +155,8 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
                                                           offset_start,
 | 
					                                                           offset_start,
 | 
				
			||||||
                                                           offset_end))
 | 
					                                                           offset_end))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Generate all cpos between match boundries including start and end boundries.
 | 
					        # Generate all cpos between match boundries including start and end
 | 
				
			||||||
 | 
					        # boundries.
 | 
				
			||||||
        # Also generate cpos for left and right context.
 | 
					        # Also generate cpos for left and right context.
 | 
				
			||||||
        # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
 | 
					        # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
 | 
				
			||||||
        # Also collect all cpos together in one list for the final request of
 | 
					        # Also collect all cpos together in one list for the final request of
 | 
				
			||||||
@@ -157,42 +169,34 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
            lc = {'lc': lc_cpos}
 | 
					            lc = {'lc': lc_cpos}
 | 
				
			||||||
            match_cpos = list(range(start, end))
 | 
					            match_cpos = list(range(start, end))
 | 
				
			||||||
            match = {'hit': match_cpos}
 | 
					            match = {'hit': match_cpos}
 | 
				
			||||||
            rc_cpos = list(range(end, min([self.corpus_max_len, end + self.context_len])))
 | 
					            rc_cpos = list(range(end, min([self.corpus_max_len,
 | 
				
			||||||
 | 
					                                           end + self.context_len])))
 | 
				
			||||||
            rc = {'rc': rc_cpos}
 | 
					            rc = {'rc': rc_cpos}
 | 
				
			||||||
            lc.update(match)
 | 
					            lc.update(match)
 | 
				
			||||||
            lc.update(rc)
 | 
					            lc.update(rc)
 | 
				
			||||||
            all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
 | 
					            all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
 | 
				
			||||||
            all_matches.append(lc)
 | 
					            all_matches.append(lc)
 | 
				
			||||||
        # print(all_matches)
 | 
					 | 
				
			||||||
        # print(all_cpos)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Get all cpos for all sneteces boundries
 | 
					        all_cpos = list(set(all_cpos))  # get rid of cpos duplicates
 | 
				
			||||||
        # s_lookup = {}
 | 
					        len_all_cpos = len(all_cpos)
 | 
				
			||||||
        # for s_id in set(s_ids):
 | 
					 | 
				
			||||||
        #     s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id)
 | 
					 | 
				
			||||||
        #     # CHANGE to UTOPIEN.s will always be like this in nopaque
 | 
					 | 
				
			||||||
        #     s_cpos = range(s_start, s_end)
 | 
					 | 
				
			||||||
        #     s_lookup.update({s_id: list(s_cpos)})
 | 
					 | 
				
			||||||
        #     # print(list(s_cpos))
 | 
					 | 
				
			||||||
        #     all_cpos.extend(s_cpos)
 | 
					 | 
				
			||||||
        t0 = time.time()
 | 
					 | 
				
			||||||
        all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
 | 
					 | 
				
			||||||
        t1 = time.time()
 | 
					        t1 = time.time()
 | 
				
			||||||
        t_total = t1 - t0
 | 
					        t_total = t1 - t0
 | 
				
			||||||
        print('TIME FOR ALL CPOS:', t_total)
 | 
					        logger.warning('Time to create all CPOS for query: {}'.format(t_total))
 | 
				
			||||||
        print('CPOS SUM:', len(all_cpos))
 | 
					        print('Requesting {} CPOS with one query.'.format(len_all_cpos))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
 | 
					        # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
 | 
				
			||||||
        # all cpos entries in all_cpos_list
 | 
					        # all cpos entries in all_cpos_list
 | 
				
			||||||
        # Also saves these informations into self.results dict
 | 
					        # Also saves these informations into self.results dict
 | 
				
			||||||
        t6 = time.time()
 | 
					        t2 = time.time()
 | 
				
			||||||
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
 | 
					        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
 | 
				
			||||||
        t7 = time.time()
 | 
					        t3 = time.time()
 | 
				
			||||||
        t_final = t7 - t6
 | 
					        t_final = t3 - t2
 | 
				
			||||||
        print('GOT ALL RESULTS IN:', t_final)
 | 
					        print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
 | 
				
			||||||
 | 
					                                                            t_final))
 | 
				
			||||||
        self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos,
 | 
					        self.results = {'matches': all_matches,
 | 
				
			||||||
                        'text_lookup': text_lookup}
 | 
					                        'cpos_lookup': all_cpos_infos,
 | 
				
			||||||
 | 
					                        'text_lookup': text_lookup,
 | 
				
			||||||
 | 
					                        'nr_matches': self.nr_matches}
 | 
				
			||||||
        return self.results
 | 
					        return self.results
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_cpos_infos(self, all_cpos):
 | 
					    def get_cpos_infos(self, all_cpos):
 | 
				
			||||||
@@ -250,3 +254,44 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
        for info in joined_cpos_infos:
 | 
					        for info in joined_cpos_infos:
 | 
				
			||||||
            dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
 | 
					            dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
 | 
				
			||||||
        return dict_cpos_infos, text_lookup
 | 
					        return dict_cpos_infos, text_lookup
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def get_sentences(self,
 | 
				
			||||||
 | 
					                      match_cpos_list,
 | 
				
			||||||
 | 
					                      get_surrounding_s=False,
 | 
				
			||||||
 | 
					                      l_r_s_context_additional_len=1):
 | 
				
			||||||
 | 
					        '''
 | 
				
			||||||
 | 
					        Get sentence informations for one match also set if and how much left
 | 
				
			||||||
 | 
					        right context sentences should be grabbed surrounding the given CPOS.
 | 
				
			||||||
 | 
					        '''
 | 
				
			||||||
 | 
					        t0 = time.time()
 | 
				
			||||||
 | 
					        key = self.corpus_name + '.s'
 | 
				
			||||||
 | 
					        first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
 | 
				
			||||||
 | 
					        context_sentences = {}
 | 
				
			||||||
 | 
					        s_ids = self.cl_cpos2struc(key, [first_cpos, last_cpos])
 | 
				
			||||||
 | 
					        for s_id in s_ids:
 | 
				
			||||||
 | 
					            s_start, s_end = self.cl_struc2cpos(key, s_id)
 | 
				
			||||||
 | 
					            s_cpos = list(range(s_start, s_end + 1))
 | 
				
			||||||
 | 
					            context_sentences[s_id] = s_cpos
 | 
				
			||||||
 | 
					        if get_surrounding_s:
 | 
				
			||||||
 | 
					            max_s_id = self.cl_attribute_size(key)
 | 
				
			||||||
 | 
					            additional_s_ids = []
 | 
				
			||||||
 | 
					            additional_s = list(range(max(s_ids[0]
 | 
				
			||||||
 | 
					                                          - l_r_s_context_additional_len,
 | 
				
			||||||
 | 
					                                          0),
 | 
				
			||||||
 | 
					                                      min(s_ids[-1]
 | 
				
			||||||
 | 
					                                          + l_r_s_context_additional_len,
 | 
				
			||||||
 | 
					                                          max_s_id) + 1))
 | 
				
			||||||
 | 
					            additional_s_ids.extend(additional_s)
 | 
				
			||||||
 | 
					            for s_id in additional_s_ids:
 | 
				
			||||||
 | 
					                s_start, s_end = self.cl_struc2cpos(key, s_id)
 | 
				
			||||||
 | 
					                s_cpos = list(range(s_start, s_end + 1))
 | 
				
			||||||
 | 
					                context_sentences[s_id] = s_cpos
 | 
				
			||||||
 | 
					        all_cpos = []
 | 
				
			||||||
 | 
					        for key in context_sentences.keys():
 | 
				
			||||||
 | 
					            all_cpos.extend(context_sentences[key])
 | 
				
			||||||
 | 
					        all_cpos = list(set(all_cpos))
 | 
				
			||||||
 | 
					        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
 | 
				
			||||||
 | 
					        t1 = time.time()
 | 
				
			||||||
 | 
					        t_total = t1 - t0
 | 
				
			||||||
 | 
					        logger.warning('Got all sentences informations in {} seconds'. format(t_total))
 | 
				
			||||||
 | 
					        return context_sentences, all_cpos_infos, text_lookup
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user