import collections
import multiprocessing
import socket

from .CQiClient import CQiClient


class CQiWrapper(CQiClient):
    """
    CQiWrapper object

    High level wrapper that groups and renames some functions of CQiClient
    for ease of use. Also structures received data into python dictionaries.

    Keyword arguments:
    host -- host the client connects to (default '127.0.0.1')
    port -- port the client connects to (default 4877)
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    """

    # Class-level default kept so that ``CQiWrapper.SUBCORPUS_NAMES`` still
    # resolves; every instance gets its own list in __init__ so that two
    # wrappers do not leak subcorpus names into each other.
    SUBCORPUS_NAMES = []

    def __init__(self, host='127.0.0.1', port=4877, username='opaque',
                 password='opaque'):
        super(CQiWrapper, self).__init__(host=host, port=port)
        self.username = username
        self.password = password
        self.SUBCORPUS_NAMES = []

    @staticmethod
    def _attribute_names(corpus_name):
        """Return the fully qualified attribute names used for corpus_name."""
        entry = corpus_name + '.entry'
        return {'word': corpus_name + '.word',
                'lemma': corpus_name + '.lemma',
                'pos': corpus_name + '.pos',
                'sem': corpus_name + '.sem',
                'entry': entry,
                'entry_author': entry + '_author',
                'entry_title': entry + '_title'}

    def connect(self):
        """
        Connect with CQP server

        Connects to the CQP server using the username and password given at
        class initiation.
        """
        self.ctrl_connect(self.username, self.password)

    def create_attribute_strings(self, corpus_name):
        """
        Create attribute strings

        Stores the fully qualified attribute names of the given corpus
        (word, lemma, pos, sem, entry, entry_author, entry_title) on the
        instance and collects them in self.attributes.

        Keyword arguments:
        corpus_name -- name of the corpus the attributes belong to
        """
        names = self._attribute_names(corpus_name)
        self.word_str = names['word']
        self.lemma_str = names['lemma']
        self.pos_str = names['pos']
        self.sem_str = names['sem']
        self.entry_str = names['entry']
        self.entry_author_str = names['entry_author']
        self.entry_title_str = names['entry_title']
        self.attributes = [self.word_str,
                           self.lemma_str,
                           self.pos_str,
                           self.sem_str,
                           self.entry_str,
                           self.entry_author_str,
                           self.entry_title_str]

    def disconnect(self):
        """
        Disconnect from CQP server

        Disconnects from the CQP server. Closes the used socket after the
        disconnect, even if the goodbye message itself fails.
        """
        try:
            self.ctrl_bye()
        finally:
            self.connection.close()

    def query_subcorpus(self, corpus_name, result_subcorpus_name, query):
        """
        Create subcorpus

        Input query will be used to create a subcorpus holding all cpos match
        positions for that query. The qualified subcorpus name and the number
        of matches are stored on the instance for later use by show_results.

        Keyword arguments:
        corpus_name -- name of the corpus the query will be used on
        result_subcorpus_name -- user set name of the subcorpus which holds
        all cpos match positions, produced by the query
        query -- query written in cqp query language
        """
        self.cqp_query(corpus_name, result_subcorpus_name, query)
        self.result_subcorpus_ns = (corpus_name
                                    + ':'
                                    + result_subcorpus_name)
        self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns)
        print('Nr of all matches is:', self.nr_matches)

    def show_subcorpora(self):
        """
        Show subcorpora

        Prints and returns the names of all subcorpora created through
        query_subcorpus on this instance.
        """
        print('Known subcorpora:', self.SUBCORPUS_NAMES)
        return self.SUBCORPUS_NAMES

    def show_results(self,
                     corpus_name,
                     result_start_count=0,
                     result_max_count=50,
                     context_len=10,):
        """
        Show query results

        Shows the actual matched strings produced by the query. Uses the cpos
        match indexes to grab those strings and saves them into an ordered
        dictionary. Also saves corresponding tags, lemmas and context:

        OrderedDict([
            (0, {
                'tokens': ['Big', 'Brother', 'himself'],
                'lemmas': ['big', 'brother', 'himself'],
                'pos_tags': ['JJ', 'NN1', 'PPX1'],
                'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|',
                             '|Z8m|'],
                'context_before': ['figures', 'of', 'the', 'Party', ',',
                                   'almost', 'on', 'a', 'level', 'with'],
                'context_after': [',', 'and', 'then', 'had', 'engaged', 'in',
                                  'counter-revolu-', 'tionary', 'activities',
                                  ','],
                'entry_title': '1984',
                'entry_author': 'george_orwell',
                'cpos_start': 110490,
                'cpos_end': 110492
                }
            )
        ])

        Requires a previous call of query_subcorpus, which sets the subcorpus
        name and the number of matches used here.

        Keyword arguments:
        corpus_name -- name of the parent corpus the subcorpus is part of
        result_start_count -- index of the first match to show (default 0).
        With the default result_max_count a value of 0 shows matches 0 to 49,
        a value of 50 shows matches 50 to 99.
        result_max_count -- defines how many matches at once will be shown
        (default 50)
        context_len -- defines how many words before and after a match will be
        shown (default 10)

        Returns:
        The ordered dictionary of results (empty if nothing is to be shown).
        """
        self.context_len = context_len
        self.corpus_max_len = self.cl_attribute_size(corpus_name + '.word')
        ordered_results = collections.OrderedDict()
        if self.nr_matches == 0:
            print('Query resulted in 0 matches.')
            return ordered_results

        # The dump takes an inclusive first/last match index, so the window
        # has to be derived from start + count and clamped to the last match.
        first = max(0, result_start_count)
        last = min(first + result_max_count, self.nr_matches) - 1
        if first > last:
            # Requested window lies behind the last match (or is empty).
            matches_start, matches_end = [], []
        else:
            # 0x10 / 0x11 select the match start / match end cpos field.
            matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                                    0x10,
                                                    first,
                                                    last)
            matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                                  0x11,
                                                  first,
                                                  last)

        manager = multiprocessing.Manager()
        try:
            return_dict = manager.dict()
            workers = []
            # One worker process (with its own server session) per match.
            for i, index_pair in enumerate(zip(matches_start, matches_end)):
                worker = multiprocessing.Process(target=self.__get_matches,
                                                 args=(i,
                                                       index_pair,
                                                       corpus_name,
                                                       return_dict))
                workers.append(worker)
                worker.start()
            for worker in workers:
                worker.join()
            # sort matches into ordered dict; copy them out before the
            # manager (which owns return_dict) is shut down
            for key in sorted(return_dict.keys()):
                ordered_results[key] = return_dict[key]
        finally:
            manager.shutdown()
        print('ORDERED_RESULTS', ordered_results)
        return ordered_results

    def __get_matches(self, i, index_pair, corpus_name, return_dict):
        """
        Get matches as readable output

        Gets the actual match strings of cpos match indexes. Private helper
        method used in show_results. Opens its own session to the server and
        closes it again, even if one of the lookups fails.

        Keyword arguments:
        i -- serial number for match at given cpos
        index_pair -- match start and match end cpos
        corpus_name -- name of the parent corpus
        return_dict -- dictionary created with manager.dict() that holds the
        extracted strings tags etc.
        """
        print('START:', index_pair[0])
        print('END:', index_pair[1])
        print('=============================')
        match_start, match_end = index_pair
        # Derived from corpus_name so that the helper does not depend on
        # create_attribute_strings having been called beforehand.
        names = self._attribute_names(corpus_name)
        tmp_session = CQiWrapper(username=self.username,
                                 password=self.password,
                                 host=self.host,
                                 port=self.port)
        tmp_session.connect()
        try:
            match_cpos = range(match_start, match_end + 1)
            tokens = tmp_session.cl_cpos2str(names['word'], match_cpos)
            lemmas = tmp_session.cl_cpos2str(names['lemma'], match_cpos)
            pos_tags = tmp_session.cl_cpos2str(names['pos'], match_cpos)
            sem_tags = tmp_session.cl_cpos2str(names['sem'], match_cpos)
            struc_entry = tmp_session.cl_cpos2struc(names['entry'],
                                                    match_cpos)
            before_index = max(0, match_start - self.context_len)
            # corpus_max_len is the attribute size, so the last valid cpos
            # is one less than that.
            after_index = min(self.corpus_max_len - 1,
                              match_end + self.context_len)
            context_before = tmp_session.cl_cpos2str(
                names['word'], range(before_index, match_start))
            context_after = tmp_session.cl_cpos2str(
                names['word'], range(match_end + 1, after_index + 1))
            entry_titles = tmp_session.cl_struc2str(names['entry_title'],
                                                    struc_entry)
            entry_authors = tmp_session.cl_struc2str(names['entry_author'],
                                                     struc_entry)
            return_dict[i] = {'tokens': tokens,
                              'lemmas': lemmas,
                              'pos_tags': pos_tags,
                              'sem_tags': sem_tags,
                              'context_before': context_before,
                              'context_after': context_after,
                              'entry_title': entry_titles[0],
                              'entry_author': entry_authors[0],
                              'cpos_start': match_start,
                              'cpos_end': match_end}
        finally:
            tmp_session.disconnect()

    def get_cpos_info(self, cpos):
        """
        Get cpos info

        Looks up the string values of all non-entry attributes for the given
        cpos range, prints them and returns them as an ordered dictionary
        keyed by attribute name. Requires a previous call of
        create_attribute_strings.

        Keyword arguments:
        cpos -- pair of start and end cpos

        Returns:
        OrderedDict mapping attribute name to the looked up strings.
        """
        match_dict = collections.OrderedDict()
        # NOTE(review): the end cpos is exclusive here, while show_results
        # treats match ends as inclusive -- confirm which one is intended.
        cpos_range = range(cpos[0], cpos[1])
        for attribute in self.attributes:
            if '.entry' in attribute:
                continue
            match_dict[attribute] = self.cl_cpos2str(attribute, cpos_range)
        print(match_dict)
        return match_dict