mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-03 20:02:47 +00:00 
			
		
		
		
	Add new CQiWrapper
This commit is contained in:
		@@ -1,7 +1,6 @@
 | 
				
			|||||||
from .CQiClient import CQiClient
 | 
					from .CQiClient import CQiClient
 | 
				
			||||||
import multiprocessing
 | 
					import multiprocessing
 | 
				
			||||||
import collections
 | 
					import collections
 | 
				
			||||||
import socket
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class CQiWrapper(CQiClient):
 | 
					class CQiWrapper(CQiClient):
 | 
				
			||||||
@@ -33,21 +32,41 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
        """
 | 
					        """
 | 
				
			||||||
        self.ctrl_connect(self.username, self.password)
 | 
					        self.ctrl_connect(self.username, self.password)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def create_attribute_strings(self, corpus_name):
 | 
					    def create_attribute_strings(self):
 | 
				
			||||||
        self.word_str = corpus_name + '.word'
 | 
					        p_attrs = self.corpus_positional_attributes(self.corpus_name)
 | 
				
			||||||
        self.lemma_str = corpus_name + '.lemma'
 | 
					        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
 | 
				
			||||||
        self.pos_str = corpus_name + '.pos'
 | 
					        self.meta_struct_element = struct_attrs[0]
 | 
				
			||||||
        self.sem_str = corpus_name + '.sem'
 | 
					        print(p_attrs)
 | 
				
			||||||
        self.entry_str = corpus_name + '.entry'
 | 
					        print(struct_attrs)
 | 
				
			||||||
        self.entry_author_str = self.entry_str + '_author'
 | 
					        self.attr_strings = {}
 | 
				
			||||||
        self.entry_title_str = self.entry_str + '_title'
 | 
					        self.attr_strings['positional_attrs'] = {}
 | 
				
			||||||
        self.attributes = [self.word_str,
 | 
					        self.attr_strings['struct_attrs'] = {}
 | 
				
			||||||
                           self.lemma_str,
 | 
					        for p_attr in p_attrs:
 | 
				
			||||||
                           self.pos_str,
 | 
					            self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name
 | 
				
			||||||
                           self.sem_str,
 | 
					                                                             + '.'
 | 
				
			||||||
                           self.entry_str,
 | 
					                                                             + p_attr)
 | 
				
			||||||
                           self.entry_author_str,
 | 
					        for struct_attr in struct_attrs[:-1]:
 | 
				
			||||||
                           self.entry_title_str]
 | 
					            self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
 | 
				
			||||||
 | 
					                                                              + '.'
 | 
				
			||||||
 | 
					                                                              + struct_attr)
 | 
				
			||||||
 | 
					        # self.word_str = corpus_name + '.word'
 | 
				
			||||||
 | 
					        # self.lemma_str = corpus_name + '.lemma'
 | 
				
			||||||
 | 
					        # self.pos_str = corpus_name + '.pos'
 | 
				
			||||||
 | 
					        # self.sem_str = corpus_name + '.sem'
 | 
				
			||||||
 | 
					        # self.entry_str = corpus_name + '.entry'
 | 
				
			||||||
 | 
					        # self.entry_author_str = self.entry_str + '_author'
 | 
				
			||||||
 | 
					        # self.entry_title_str = self.entry_str + '_title'
 | 
				
			||||||
 | 
					        # self.attributes = [self.word_str,
 | 
				
			||||||
 | 
					        #                    self.lemma_str,
 | 
				
			||||||
 | 
					        #                    self.pos_str,
 | 
				
			||||||
 | 
					        #                    self.sem_str,
 | 
				
			||||||
 | 
					        #                    self.entry_str,
 | 
				
			||||||
 | 
					        #                    self.entry_author_str,
 | 
				
			||||||
 | 
					        #                    self.entry_title_str]
 | 
				
			||||||
 | 
					        # print(self.attributes)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def set_corpus_name(self, corpus_name):
 | 
				
			||||||
 | 
					        self.corpus_name = corpus_name
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def disconnect(self):
 | 
					    def disconnect(self):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
@@ -58,7 +77,7 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
        self.ctrl_bye()
 | 
					        self.ctrl_bye()
 | 
				
			||||||
        self.connection.close()
 | 
					        self.connection.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def query_subcorpus(self, corpus_name, result_subcorpus_name, query):
 | 
					    def query_subcorpus(self, result_subcorpus_name, query):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        Create subcorpus
 | 
					        Create subcorpus
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -66,13 +85,12 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
        positions for that query.
 | 
					        positions for that query.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Keyword arguments:
 | 
					        Keyword arguments:
 | 
				
			||||||
        corpus_name -- name of the corpus the query will be used on
 | 
					 | 
				
			||||||
        result_subcorpus_name -- user set name of the subcorpus which holds all
 | 
					        result_subcorpus_name -- user set name of the subcorpus which holds all
 | 
				
			||||||
        cpos match positions, produced by the query
 | 
					        cpos match positions, produced by the query
 | 
				
			||||||
        query -- query written in cqp query language
 | 
					        query -- query written in cqp query language
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        self.cqp_query(corpus_name, result_subcorpus_name, query)
 | 
					        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
 | 
				
			||||||
        self.result_subcorpus_ns = (corpus_name
 | 
					        self.result_subcorpus_ns = (self.corpus_name
 | 
				
			||||||
                                    + ':'
 | 
					                                    + ':'
 | 
				
			||||||
                                    + result_subcorpus_name)
 | 
					                                    + result_subcorpus_name)
 | 
				
			||||||
        self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
 | 
					        self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
 | 
				
			||||||
@@ -80,11 +98,9 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
        print('Nr of all matches is:', self.nr_matches)
 | 
					        print('Nr of all matches is:', self.nr_matches)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def show_subcorpora(self):
 | 
					    def show_subcorpora(self):
 | 
				
			||||||
        print('Known subcorpora:', self.SUBCORPUS_NAMES)
 | 
					        return self.cqp_list_subcorpora(self.corpus_name)
 | 
				
			||||||
        return self.SUBCORPUS_NAMES
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def show_results(self,
 | 
					    def show_results(self,
 | 
				
			||||||
                     corpus_name,
 | 
					 | 
				
			||||||
                     result_start_count=0,
 | 
					                     result_start_count=0,
 | 
				
			||||||
                     result_max_count=50,
 | 
					                     result_max_count=50,
 | 
				
			||||||
                     context_len=10,):
 | 
					                     context_len=10,):
 | 
				
			||||||
@@ -116,7 +132,6 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
        ])
 | 
					        ])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Keyword arguments:
 | 
					        Keyword arguments:
 | 
				
			||||||
        corpus_name -- name of the parent corpus the subcorpus is part of
 | 
					 | 
				
			||||||
        result_start_count -- start position of the dumped subcorpus.
 | 
					        result_start_count -- start position of the dumped subcorpus.
 | 
				
			||||||
        (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
 | 
					        (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
 | 
				
			||||||
        matches 50 to 100 will be shown.
 | 
					        matches 50 to 100 will be shown.
 | 
				
			||||||
@@ -126,8 +141,7 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
        shown (default 10)
 | 
					        shown (default 10)
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        self.context_len = context_len
 | 
					        self.context_len = context_len
 | 
				
			||||||
        word_str = corpus_name + '.word'
 | 
					        self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word'])
 | 
				
			||||||
        self.corpus_max_len = self.cl_attribute_size(word_str)
 | 
					 | 
				
			||||||
        if self.nr_matches == 0:
 | 
					        if self.nr_matches == 0:
 | 
				
			||||||
            print('Query resulted in 0 matches.')
 | 
					            print('Query resulted in 0 matches.')
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
@@ -157,7 +171,7 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
                match = multiprocessing.Process(target=self.__get_matches,
 | 
					                match = multiprocessing.Process(target=self.__get_matches,
 | 
				
			||||||
                                                args=(i,
 | 
					                                                args=(i,
 | 
				
			||||||
                                                      index_pair,
 | 
					                                                      index_pair,
 | 
				
			||||||
                                                      corpus_name,
 | 
					                                                      self.corpus_name,
 | 
				
			||||||
                                                      return_dict))
 | 
					                                                      return_dict))
 | 
				
			||||||
                matches.append(match)
 | 
					                matches.append(match)
 | 
				
			||||||
                match.start()
 | 
					                match.start()
 | 
				
			||||||
@@ -167,7 +181,25 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
            ordered_results = collections.OrderedDict()
 | 
					            ordered_results = collections.OrderedDict()
 | 
				
			||||||
            for key in sorted(return_dict.keys()):
 | 
					            for key in sorted(return_dict.keys()):
 | 
				
			||||||
                ordered_results[key] = return_dict[key]
 | 
					                ordered_results[key] = return_dict[key]
 | 
				
			||||||
            print('ORDERED_RESULTS', ordered_results)
 | 
					            return ordered_results
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def get_cpos_info(self, cpos, session):
 | 
				
			||||||
 | 
					        match_dict = {}
 | 
				
			||||||
 | 
					        for attr_dict in self.attr_strings:
 | 
				
			||||||
 | 
					            # print(self.attr_strings[attr_dict])
 | 
				
			||||||
 | 
					            if attr_dict == 'positional_attrs':
 | 
				
			||||||
 | 
					                for p_attr_key in self.attr_strings[attr_dict].keys():
 | 
				
			||||||
 | 
					                    # print(p_attr_key)
 | 
				
			||||||
 | 
					                    match_str = session.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], range(cpos[0], cpos[1]))
 | 
				
			||||||
 | 
					                    match_dict[p_attr_key] = match_str
 | 
				
			||||||
 | 
					            elif attr_dict == 'struct_attrs':
 | 
				
			||||||
 | 
					                for struct_attr_key in self.attr_strings[attr_dict].keys():
 | 
				
			||||||
 | 
					                    # print(struct_attr_key)
 | 
				
			||||||
 | 
					                    struct_entry = session.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element],
 | 
				
			||||||
 | 
					                                                         range(cpos[0], cpos[1]))
 | 
				
			||||||
 | 
					                    match_str = session.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry)
 | 
				
			||||||
 | 
					                    match_dict[struct_attr_key] = set(match_str)
 | 
				
			||||||
 | 
					        return match_dict
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __get_matches(self, i, index_pair, corpus_name, return_dict):
 | 
					    def __get_matches(self, i, index_pair, corpus_name, return_dict):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
@@ -183,58 +215,46 @@ class CQiWrapper(CQiClient):
 | 
				
			|||||||
        return_dict -- dictionary created with manager.dict() that holds the
 | 
					        return_dict -- dictionary created with manager.dict() that holds the
 | 
				
			||||||
        extracted strings tags etc.
 | 
					        extracted strings tags etc.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        print('START:', index_pair[0])
 | 
					        # print('START:', index_pair[0])
 | 
				
			||||||
        print('END:', index_pair[1])
 | 
					        # print('END:', index_pair[1])
 | 
				
			||||||
        print('=============================')
 | 
					        # print('=============================')
 | 
				
			||||||
 | 
					        index_pair = [index_pair[0], index_pair[1] + 1]
 | 
				
			||||||
        tmp_session = CQiWrapper(username=self.username, password=self.password,
 | 
					        tmp_session = CQiWrapper(username=self.username, password=self.password,
 | 
				
			||||||
                                 host=self.host, port=self.port)
 | 
					                                 host=self.host, port=self.port)
 | 
				
			||||||
        tmp_session.connect()
 | 
					        tmp_session.connect()
 | 
				
			||||||
        tokens = tmp_session.cl_cpos2str(self.word_str,
 | 
					        match = self.get_cpos_info(index_pair, tmp_session)
 | 
				
			||||||
                                         range(index_pair[0],
 | 
					        # tokens = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
 | 
				
			||||||
                                               index_pair[1] + 1))
 | 
					        #                                  range(index_pair[0],
 | 
				
			||||||
        lemmas = tmp_session.cl_cpos2str(self.lemma_str,
 | 
					        #                                        index_pair[1] + 1))
 | 
				
			||||||
                                         range(index_pair[0],
 | 
					        # lemmas = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['lemma'],
 | 
				
			||||||
                                               index_pair[1] + 1))
 | 
					        #                                  range(index_pair[0],
 | 
				
			||||||
        pos_tags = tmp_session.cl_cpos2str(self.pos_str,
 | 
					        #                                        index_pair[1] + 1))
 | 
				
			||||||
                                           range(index_pair[0],
 | 
					        # pos_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['pos'],
 | 
				
			||||||
                                                 index_pair[1] + 1))
 | 
					        #                                    range(index_pair[0],
 | 
				
			||||||
        sem_tags = tmp_session.cl_cpos2str(self.sem_str,
 | 
					        #                                          index_pair[1] + 1))
 | 
				
			||||||
                                           range(index_pair[0],
 | 
					        # sem_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['sem'],
 | 
				
			||||||
                                                 index_pair[1] + 1))
 | 
					        #                                    range(index_pair[0],
 | 
				
			||||||
        struc_entry = tmp_session.cl_cpos2struc(self.entry_str,
 | 
					        #                                          index_pair[1] + 1))
 | 
				
			||||||
                                                range(index_pair[0],
 | 
					        # struc_entry = tmp_session.cl_cpos2struc(self.attr_strings['struct_attrs']['entry'],
 | 
				
			||||||
                                                      index_pair[1] + 1))
 | 
					        #                                         range(index_pair[0],
 | 
				
			||||||
 | 
					        #                                               index_pair[1] + 1))
 | 
				
			||||||
        before_index = max([0, index_pair[0] - self.context_len])
 | 
					        before_index = max([0, index_pair[0] - self.context_len])
 | 
				
			||||||
        after_index = min([self.corpus_max_len,
 | 
					        after_index = min([self.corpus_max_len,
 | 
				
			||||||
                           index_pair[1] + self.context_len])
 | 
					                           index_pair[1] + self.context_len])
 | 
				
			||||||
        context_before = tmp_session.cl_cpos2str(self.word_str,
 | 
					        context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
 | 
				
			||||||
                                                 range(before_index,
 | 
					                                                 range(before_index,
 | 
				
			||||||
                                                       index_pair[0]))
 | 
					                                                       index_pair[0]))
 | 
				
			||||||
        context_after = tmp_session.cl_cpos2str(self.word_str,
 | 
					        context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
 | 
				
			||||||
                                                range(index_pair[1] + 1,
 | 
					                                                range(index_pair[1] + 1,
 | 
				
			||||||
                                                      after_index + 1))
 | 
					                                                      after_index + 1))
 | 
				
			||||||
        entry_titles = tmp_session.cl_struc2str(self.entry_title_str,
 | 
					        # entry_titles = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_title'],
 | 
				
			||||||
                                                struc_entry)
 | 
					        #                                         struc_entry)
 | 
				
			||||||
        entry_authors = tmp_session.cl_struc2str(self.entry_author_str,
 | 
					        # entry_authors = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_author'],
 | 
				
			||||||
                                                 struc_entry)
 | 
					        #                                          struc_entry)
 | 
				
			||||||
        return_dict[i] = {'tokens': tokens,
 | 
					        tmp_dict = {'context_before': context_before,
 | 
				
			||||||
                          'lemmas': lemmas,
 | 
					                    'context_after': context_after,
 | 
				
			||||||
                          'pos_tags': pos_tags,
 | 
					                    'cpos_start': index_pair[0],
 | 
				
			||||||
                          'sem_tags': sem_tags,
 | 
					                    'cpos_end': index_pair[1]}
 | 
				
			||||||
                          'context_before': context_before,
 | 
					        match.update(tmp_dict)
 | 
				
			||||||
                          'context_after': context_after,
 | 
					        return_dict[i] = match
 | 
				
			||||||
                          'entry_title': entry_titles[0],
 | 
					 | 
				
			||||||
                          'entry_author': entry_authors[0],
 | 
					 | 
				
			||||||
                          'cpos_start': index_pair[0],
 | 
					 | 
				
			||||||
                          'cpos_end': index_pair[1]}
 | 
					 | 
				
			||||||
        tmp_session.disconnect()
 | 
					        tmp_session.disconnect()
 | 
				
			||||||
 | 
					 | 
				
			||||||
    def get_cpos_info(self, cpos):
 | 
					 | 
				
			||||||
        match_dict = collections.OrderedDict()
 | 
					 | 
				
			||||||
        for attribute in self.attributes:
 | 
					 | 
				
			||||||
            if '.entry' not in attribute:
 | 
					 | 
				
			||||||
                match_str = self.cl_cpos2str(attribute, range(cpos[0], cpos[1]))
 | 
					 | 
				
			||||||
                match_dict[attribute] = match_str
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                continue
 | 
					 | 
				
			||||||
        print(match_dict)
 | 
					 | 
				
			||||||
 
 | 
				
			|||||||
@@ -38,9 +38,11 @@ def recv_query(message):
 | 
				
			|||||||
    corpus_name = 'CORPUS'
 | 
					    corpus_name = 'CORPUS'
 | 
				
			||||||
    result_subcorpus_name = 'Query-results'  # should be set by the user somehow
 | 
					    result_subcorpus_name = 'Query-results'  # should be set by the user somehow
 | 
				
			||||||
    query = message['query']
 | 
					    query = message['query']
 | 
				
			||||||
    analysis_client.create_attribute_strings(corpus_name)
 | 
					    analysis_client.set_corpus_name(corpus_name)
 | 
				
			||||||
    analysis_client.query_subcorpus(corpus_name, result_subcorpus_name, query)
 | 
					    analysis_client.create_attribute_strings()
 | 
				
			||||||
    analysis_client.show_results(corpus_name)
 | 
					    analysis_client.query_subcorpus(result_subcorpus_name, query)
 | 
				
			||||||
 | 
					    results = analysis_client.show_results()
 | 
				
			||||||
 | 
					    logger.warning('Query results: {}'.format(str(results)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def observe_corpus_analysis_connection(app, corpus_id, session_id):
 | 
					def observe_corpus_analysis_connection(app, corpus_id, session_id):
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user