From a1cdfd498ad9fdafe815455e1bde83bf99260a97 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 7 Apr 2020 16:27:28 +0200 Subject: [PATCH] Remove wrapper --- app/corpora/cqi/wrapper.py | 321 ------------------------------------- 1 file changed, 321 deletions(-) delete mode 100644 app/corpora/cqi/wrapper.py diff --git a/app/corpora/cqi/wrapper.py b/app/corpora/cqi/wrapper.py deleted file mode 100644 index f6c58395..00000000 --- a/app/corpora/cqi/wrapper.py +++ /dev/null @@ -1,321 +0,0 @@ -from .api import APIClient -from .api.specification import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND -import time - - -class CQiWrapper(APIClient): - ''' - CQIiWrapper object - - High level wrapper that groups and renames some functions of CQiClient - for ease of use. Also structures recieved data into python dictionaries. - - Keyword arguments: - host -- host IP adress or hostname wher the cqp server is running - port -- port of the cqp server - username -- username used to connect to the cqp server - password -- password of the user to connect to the cqp server - ''' - - SUBCORPUS_NAMES = [] - - def __init__(self, host='127.0.0.1', port=4877, username='anonymous', - password=''): - super(CQiWrapper, self).__init__(host, port=port) - self.username = username - self.password = password - - def connect(self): - ''' - Connect with CQP server - - Connects via socket to the CQP server using the given username and - password from class initiation. - ''' - self.ctrl_connect(self.username, self.password) - - def __create_attribute_strings(self): - ''' - Creates all needed attribute strings to query for word, lemma etc. in - the given corpus. - For example: CORPUS_NAME.word to query words - Automaticalle creates strings for all pre defined tags. - ''' - p_attrs = self.corpus_positional_attributes(self.corpus_name) - struct_attrs = self.corpus_structural_attributes(self.corpus_name) - self.attr_strings = {} - self.attr_strings['positional_attrs'] = {} - self.attr_strings['struct_attrs'] = {} - for p_attr in p_attrs: - self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name - + '.' - + p_attr) - for struct_attr in struct_attrs: - self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name - + '.' - + struct_attr) - print(('All positional and ' - 'structural attributes: {}').format(self.attr_strings)) - - def select_corpus(self, corpus_name): - ''' - Checks if given copus name exists. If it exists set it as the main - corpus name used to create the needed query attribute strings like - CORPUS_NAME.word. - ''' - if corpus_name in self.corpus_list_coprora(): - self.corpus_name = corpus_name - self.__create_attribute_strings() - print('{} does exist.'.format(corpus_name)) - else: - print('{} does not exist.'.format(corpus_name)) - raise Exception('Given Corpus Name is not in corpora list.') - - def disconnect(self): - ''' - Disconnect from CQP server - - Disconnects from the CQP server. Closes used socket after disconnect. - ''' - self.ctrl_bye() - print('Disconnected from cqp server.') - - def query_subcorpus(self, query, result_subcorpus_name='Query-results'): - ''' - Create subcorpus - - Input query will be used to create a subcorpus holding all cpos match - positions for that query. - - Keyword arguments: - result_subcorpus_name -- set name of the subcorpus which holds all - cpos match positions, produced by the query - query -- query written in cqp query language - ''' - self.query = query - self.cqp_query(self.corpus_name, result_subcorpus_name, query) - self.result_subcorpus = (self.corpus_name - + ':' - + result_subcorpus_name) - self.SUBCORPUS_NAMES.append(self.result_subcorpus) - self.match_count = self.cqp_subcorpus_size(self.result_subcorpus) - print('Nr of all matches is: {}'.format(self.match_count)) - - def show_subcorpora(self): - ''' - Show all subcorpora currently saved by the cqp server. - ''' - return self.cqp_list_subcorpora(self.corpus_name) - - def show_query_results(self, - context_len=10, - result_len=1000, - result_offset=0): - ''' - Show query results - - Shows the actual matched strings produce by the query. Uses the cpos - match indexes to grab those strings. saves them into an orderd - dictionary. Also saves coresponding tags, lemmas and context. Gets those - informations using the corresponding cpos. - - Keyword arguments: - context_len -- defines how many words before and after a match will be - shown (default 10) - result_len -- defines for how many matches all informations like lemma - and POS are being grabbed - result_offset -- defines the offset of the matches being requested. If - the offset is 100 informations for matches 100 to result_len are being - grabbed - ''' - t0 = time.time() - self.context_len = context_len - self.corpus_max_len = self.cl_attribute_size( - self.attr_strings['positional_attrs']['word'] - ) - self.nr_matches = min(result_len, self.match_count) - if self.match_count == 0: - print('Query resulted in 0 matches.') - self.results = {'code': 0, - 'result': {'matches': [], - 'match_count': self.match_count, - 'cpos_lookup': {}, - 'text_lookup': {}} - } - return self.results - else: - # Get match cpos boundries - # match_boundries shows the start and end cpos of one match as a - # pair of cpositions - # [(1355, 1357), (1477, 1479)] Example for two boundry pairs - offset_start = 0 if result_offset == 0 else result_offset - print('Offset start is: {}'.format(offset_start)) - offset_end = min((self.nr_matches + result_offset - 1), self.match_count - 1) - print('Offset end is: {}'.format(offset_end)) - match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus, - CONST_FIELD_MATCH, - offset_start, - offset_end), - self.cqp_dump_subcorpus(self.result_subcorpus, - CONST_FIELD_MATCHEND, - offset_start, - offset_end)) - - # Generate all cpos between match boundries including start and end - # boundries. - # Also generate cpos for left and right context. - # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc' - # Also collect all cpos together in one list for the final request of - # all cpos informations - all_matches = [] - all_cpos = [] - for start, end in match_boundaries: - end += 1 - lc_cpos = list(range(max([0, start - self.context_len]), start)) - lc = {'lc': lc_cpos} - match_cpos = list(range(start, end)) - match = {'hit': match_cpos} - rc_cpos = list(range(end, min([self.corpus_max_len, - end + self.context_len]))) - rc = {'rc': rc_cpos} - lc.update(match) - lc.update(rc) - all_cpos.extend(lc_cpos + match_cpos + rc_cpos) - all_matches.append(lc) - - all_cpos = list(set(all_cpos)) # get rid of cpos duplicates - len_all_cpos = len(all_cpos) - t1 = time.time() - t_total = t1 - t0 - print('Time to create all CPOS for query: {}'.format(t_total)) - print('Requesting {} CPOS with one query.'.format(len_all_cpos)) - - # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for - # all cpos entries in all_cpos_list - # Also saves these informations into self.results dict - t2 = time.time() - all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos) - t3 = time.time() - t_final = t3 - t2 - print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos, - t_final)) - self.results = {'code': 0, - 'result': {'matches': all_matches, - 'match_count': self.match_count, - 'cpos_lookup': all_cpos_infos, - 'text_lookup': text_lookup} - } - return self.results - - def get_cpos_infos(self, all_cpos): - ''' - Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for - all cpos entries specified in the parameter all_cpos. - ''' - # Get all positional attribute informations - cpos_infos = {} - for p_attr_key in self.attr_strings['positional_attrs'].keys(): - match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos) - cpos_infos[p_attr_key] = match_strs - - # Get all strucutural attribute informations - tmp_info = {} - structs_to_check = [] - for struct_attr_key in self.attr_strings['struct_attrs'].keys(): - key = self.attr_strings['struct_attrs'][struct_attr_key] - has_value = self.corpus_structural_attribute_has_values(key) - struct_ids = self.cl_cpos2struc(key, all_cpos) - if has_value is False: # Get IDs of strucutural elements without values (this means get IDs of XML tags. Struct elements only have values if they are XML attributes) - tmp_info[struct_attr_key] = [] - for id in struct_ids: - tmp_info[struct_attr_key].append(id) - else: - structs_to_check.append({key: struct_attr_key}) - print('Structs to check: {}'.format(structs_to_check)) - struct_attr_values = list(tmp_info.values()) - # print('Struct attr value list: {}'.format(struct_attr_values)) - struct_attr_keys = list(tmp_info.keys()) - # print('Struct attr key list: {}'.format(struct_attr_keys)) - - # Build textlookup dictionary - text_lookup_ids = list(set(struct_attr_values[0])) # every CPOS is associated with one text id. A set is build to only gather text_lookup informations for every unique text id - text_lookup = {} # final dict containing all info of one text identified by its id - for d in structs_to_check: - s_key, s_value = zip(*d.items()) - print('dict entries: {}: {}'.format(s_key, s_value)) - s_value = s_value[0].split('_', 1)[-1] - print('S_VALUE: {}'.format(s_value)) - struct_values = self.cl_struc2str(s_key[0], text_lookup_ids) - print('Extracted Value with key {}: {}'.format(s_key[0], struct_values)) - zipped = dict(zip(text_lookup_ids, struct_values)) - for zip_key, zip_value in zipped.items(): - print('Text id as key is: {}'.format(zip_key)) - print('Value of this text is: {}'.format(zip_value)) - check = text_lookup.get(zip_key) - print('check: {}'.format(check)) - if check is None: - text_lookup[zip_key] = {s_value: zip_value} - else: - text_lookup[zip_key].update({s_value: zip_value}) - - # zip keys and values together - attr_values_list = [] - attr_keys_list = [] - for key in cpos_infos.keys(): - attr_values_list.append(cpos_infos[key]) - attr_keys_list.append(key) - attr_keys_list.extend(struct_attr_keys) - attr_values_list.extend(struct_attr_values) - joined_cpos_infos = zip(all_cpos, *attr_values_list) - dict_cpos_infos = {} - for info in joined_cpos_infos: - dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:])) - return dict_cpos_infos, text_lookup - - def get_sentences(self, - match_cpos_list, - get_surrounding_s=False, - l_r_s_context_additional_len=1): - ''' - Get sentence informations for one match also set if and how much left - right context sentences should be grabbed surrounding the given CPOS. - ''' - t0 = time.time() - key = self.corpus_name + '.s' - first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1] - context_sentences = {} - s_ids = self.cl_cpos2struc(key, [first_cpos, last_cpos]) - print('s id match: {}'.format(s_ids)) - for s_id in s_ids: - s_start, s_end = self.cl_struc2cpos(key, s_id) - s_cpos = list(range(s_start, s_end + 1)) - context_sentences[s_id] = s_cpos - if get_surrounding_s: - max_s_id = self.cl_attribute_size(key) - 1 - print('max sid: {}'.format(max_s_id)) - additional_s_ids = [] - additional_s = list(range(max(s_ids[0] - - l_r_s_context_additional_len, - 0), - min(s_ids[-1] - + l_r_s_context_additional_len, - max_s_id) + 1)) - additional_s_ids.extend(additional_s) - for s_id in additional_s_ids: - print('s id additional: {}'.format(s_id)) - s_start, s_end = self.cl_struc2cpos(key, s_id) - s_cpos = list(range(s_start, s_end + 1)) - context_sentences[s_id] = s_cpos - all_cpos = [] - for key in context_sentences.keys(): - all_cpos.extend(context_sentences[key]) - all_cpos = list(set(all_cpos)) - all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos) - t1 = time.time() - t_total = t1 - t0 - print('Got all sentences informations in {} seconds'. format(t_total)) - match_context = {'context_s_cpos': context_sentences, - 'cpos_lookup': all_cpos_infos, - 'text_lookup': text_lookup, - 'match_cpos_list': match_cpos_list} - return match_context