From 3af400a732bd5defcbad79e3c9fb574bbb9f472a Mon Sep 17 00:00:00 2001 From: Stephan Porada Date: Mon, 2 Dec 2019 14:19:40 +0100 Subject: [PATCH] Add get_sentences to wrapper --- app/corpora/CQiWrapper/CQiWrapper.py | 141 ++++++++++++++++++--------- 1 file changed, 93 insertions(+), 48 deletions(-) diff --git a/app/corpora/CQiWrapper/CQiWrapper.py b/app/corpora/CQiWrapper/CQiWrapper.py index c313bf05..c19cf84f 100644 --- a/app/corpora/CQiWrapper/CQiWrapper.py +++ b/app/corpora/CQiWrapper/CQiWrapper.py @@ -5,7 +5,7 @@ from app import logger # only works if imported into opaque web app class CQiWrapper(CQiClient): - """ + ''' CQIiWrapper object High level wrapper that groups and renames some functions of CQiClient @@ -16,7 +16,7 @@ class CQiWrapper(CQiClient): port -- port of the cqp server username -- username used to connect to the cqp server password -- password of the user to connect to the cqp server - """ + ''' SUBCORPUS_NAMES = [] @@ -27,20 +27,21 @@ class CQiWrapper(CQiClient): self.password = password def connect(self): - """ + ''' Connect with CQP server Connects via socket to the CQP server using the given username and password from class initiation. - """ + ''' self.ctrl_connect(self.username, self.password) def __create_attribute_strings(self): - """ + ''' Creates all needed attribute strings to query for word, lemma etc. in the given corpus. For example: CORPUS_NAME.word to query words - """ + Automaticalle creates strings for all pre defined tags. + ''' p_attrs = self.corpus_positional_attributes(self.corpus_name) struct_attrs = self.corpus_structural_attributes(self.corpus_name) self.attr_strings = {} @@ -54,40 +55,45 @@ class CQiWrapper(CQiClient): self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name + '.' + struct_attr) - # logger.warning(('All positional and ' - # 'structural attributes: {}').format(self.attr_strings)) + logger.warning(('All positional and ' + 'structural attributes: {}').format(self.attr_strings)) def select_corpus(self, corpus_name): + ''' + Checks if given copus name exists. If it exists set it as the main + corpus name used to create the needed query attribute strings like + CORPUS_NAME.word. + ''' if corpus_name in self.corpus_list_coprora(): self.corpus_name = corpus_name self.__create_attribute_strings() - # logger.warning('{} does exist.'.format(corpus_name)) + logger.warning('{} does exist.'.format(corpus_name)) else: - # logger.warning('{} does not exist.'.format(corpus_name)) - pass + logger.warning('{} does not exist.'.format(corpus_name)) + raise Exception('Given Corpus Name is not in corpora list.') def disconnect(self): - """ + ''' Disconnect from CQP server Disconnects from the CQP server. Closes used socket after disconnect. - """ + ''' self.ctrl_bye() self.connection.close() - # logger.warning('Disconnected from cqp server.') + logger.warning('Disconnected from cqp server.') def query_subcorpus(self, query, result_subcorpus_name='Query-results'): - """ + ''' Create subcorpus Input query will be used to create a subcorpus holding all cpos match positions for that query. Keyword arguments: - result_subcorpus_name -- user set name of the subcorpus which holds all + result_subcorpus_name -- set name of the subcorpus which holds all cpos match positions, produced by the query query -- query written in cqp query language - """ + ''' self.cqp_query(self.corpus_name, result_subcorpus_name, query) self.result_subcorpus = (self.corpus_name + ':' @@ -95,19 +101,19 @@ class CQiWrapper(CQiClient): self.SUBCORPUS_NAMES.append(self.result_subcorpus) self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus) print('Nr of all matches is:', self.nr_matches) - # logger.warning('Nr of all matches is: {}'.format(self.nr_matches)) + logger.warning('Nr of all matches is: {}'.format(self.nr_matches)) def show_subcorpora(self): - """ + ''' Show all subcorpora currently saved by the cqp server. - """ + ''' return self.cqp_list_subcorpora(self.corpus_name) def show_query_results(self, context_len=10, result_len=1000, result_offset=0): - """ + ''' Show query results Shows the actual matched strings produce by the query. Uses the cpos @@ -118,15 +124,20 @@ class CQiWrapper(CQiClient): Keyword arguments: context_len -- defines how many words before and after a match will be shown (default 10) - result_len -- defines how many results are actually grabbed - """ + result_len -- defines for how many matches all informations like lemma + and POS are being grabbed + result_offset -- defines the offset of the matches being requested. If + the offset is 100 informations for matches 100 to result_len are being + grabbed + ''' + t0 = time.time() self.context_len = context_len self.corpus_max_len = self.cl_attribute_size( self.attr_strings['positional_attrs']['word'] ) self.nr_matches = min(result_len, self.nr_matches) if self.nr_matches == 0: - # logger.warning('Query resulted in 0 matches.') + logger.warning('Query resulted in 0 matches.') return None else: # Get match cpos boundries @@ -144,7 +155,8 @@ class CQiWrapper(CQiClient): offset_start, offset_end)) - # Generate all cpos between match boundries including start and end boundries. + # Generate all cpos between match boundries including start and end + # boundries. # Also generate cpos for left and right context. # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc' # Also collect all cpos together in one list for the final request of @@ -157,42 +169,34 @@ class CQiWrapper(CQiClient): lc = {'lc': lc_cpos} match_cpos = list(range(start, end)) match = {'hit': match_cpos} - rc_cpos = list(range(end, min([self.corpus_max_len, end + self.context_len]))) + rc_cpos = list(range(end, min([self.corpus_max_len, + end + self.context_len]))) rc = {'rc': rc_cpos} lc.update(match) lc.update(rc) all_cpos.extend(lc_cpos + match_cpos + rc_cpos) all_matches.append(lc) - # print(all_matches) - # print(all_cpos) - # Get all cpos for all sneteces boundries - # s_lookup = {} - # for s_id in set(s_ids): - # s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id) - # # CHANGE to UTOPIEN.s will always be like this in nopaque - # s_cpos = range(s_start, s_end) - # s_lookup.update({s_id: list(s_cpos)}) - # # print(list(s_cpos)) - # all_cpos.extend(s_cpos) - t0 = time.time() - all_cpos = list(set(all_cpos)) # get rid of cpos duplicates + all_cpos = list(set(all_cpos)) # get rid of cpos duplicates + len_all_cpos = len(all_cpos) t1 = time.time() t_total = t1 - t0 - print('TIME FOR ALL CPOS:', t_total) - print('CPOS SUM:', len(all_cpos)) + logger.warning('Time to create all CPOS for query: {}'.format(t_total)) + print('Requesting {} CPOS with one query.'.format(len_all_cpos)) # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for # all cpos entries in all_cpos_list # Also saves these informations into self.results dict - t6 = time.time() + t2 = time.time() all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos) - t7 = time.time() - t_final = t7 - t6 - print('GOT ALL RESULTS IN:', t_final) - - self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos, - 'text_lookup': text_lookup} + t3 = time.time() + t_final = t3 - t2 + print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos, + t_final)) + self.results = {'matches': all_matches, + 'cpos_lookup': all_cpos_infos, + 'text_lookup': text_lookup, + 'nr_matches': self.nr_matches} return self.results def get_cpos_infos(self, all_cpos): @@ -250,3 +254,44 @@ class CQiWrapper(CQiClient): for info in joined_cpos_infos: dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:])) return dict_cpos_infos, text_lookup + + def get_sentences(self, + match_cpos_list, + get_surrounding_s=False, + l_r_s_context_additional_len=1): + ''' + Get sentence informations for one match also set if and how much left + right context sentences should be grabbed surrounding the given CPOS. + ''' + t0 = time.time() + key = self.corpus_name + '.s' + first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1] + context_sentences = {} + s_ids = self.cl_cpos2struc(key, [first_cpos, last_cpos]) + for s_id in s_ids: + s_start, s_end = self.cl_struc2cpos(key, s_id) + s_cpos = list(range(s_start, s_end + 1)) + context_sentences[s_id] = s_cpos + if get_surrounding_s: + max_s_id = self.cl_attribute_size(key) + additional_s_ids = [] + additional_s = list(range(max(s_ids[0] + - l_r_s_context_additional_len, + 0), + min(s_ids[-1] + + l_r_s_context_additional_len, + max_s_id) + 1)) + additional_s_ids.extend(additional_s) + for s_id in additional_s_ids: + s_start, s_end = self.cl_struc2cpos(key, s_id) + s_cpos = list(range(s_start, s_end + 1)) + context_sentences[s_id] = s_cpos + all_cpos = [] + for key in context_sentences.keys(): + all_cpos.extend(context_sentences[key]) + all_cpos = list(set(all_cpos)) + all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos) + t1 = time.time() + t_total = t1 - t0 + logger.warning('Got all sentences informations in {} seconds'. format(t_total)) + return context_sentences, all_cpos_infos, text_lookup