Remove wrapper

2026-06-24 13:30:29 +00:00 · 2020-04-07 16:27:28 +02:00
parent 83a607728d
commit a1cdfd498a
1 changed files with 0 additions and 321 deletions
@@ -1,321 +0,0 @@
-from .api import APIClient
-from .api.specification import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
-import time
-
-
-class CQiWrapper(APIClient):
-    '''
-    CQIiWrapper object
-
-    High level wrapper that groups and renames some functions of CQiClient
-    for ease of use. Also structures recieved data into python dictionaries.
-
-    Keyword arguments:
-    host -- host IP adress or hostname wher the cqp server is running
-    port -- port of the cqp server
-    username -- username used to connect to the cqp server
-    password -- password of the user to connect to the cqp server
-    '''
-
-    SUBCORPUS_NAMES = []
-
-    def __init__(self, host='127.0.0.1', port=4877, username='anonymous',
-                 password=''):
-        super(CQiWrapper, self).__init__(host, port=port)
-        self.username = username
-        self.password = password
-
-    def connect(self):
-        '''
-        Connect with CQP server
-
-        Connects via socket to the CQP server using the given username and
-        password from class initiation.
-        '''
-        self.ctrl_connect(self.username, self.password)
-
-    def __create_attribute_strings(self):
-        '''
-        Creates all needed attribute strings to query for word, lemma etc. in
-        the given corpus.
-        For example: CORPUS_NAME.word to query words
-        Automaticalle creates strings for all pre defined tags.
-        '''
-        p_attrs = self.corpus_positional_attributes(self.corpus_name)
-        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
-        self.attr_strings = {}
-        self.attr_strings['positional_attrs'] = {}
-        self.attr_strings['struct_attrs'] = {}
-        for p_attr in p_attrs:
-            self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name
-                                                             + '.'
-                                                             + p_attr)
-        for struct_attr in struct_attrs:
-            self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
-                                                              + '.'
-                                                              + struct_attr)
-        print(('All positional and '
-                        'structural attributes: {}').format(self.attr_strings))
-
-    def select_corpus(self, corpus_name):
-        '''
-        Checks if given copus name exists. If it exists set it as the main
-        corpus name used to create the needed query attribute strings like
-        CORPUS_NAME.word.
-        '''
-        if corpus_name in self.corpus_list_coprora():
-            self.corpus_name = corpus_name
-            self.__create_attribute_strings()
-            print('{} does exist.'.format(corpus_name))
-        else:
-            print('{} does not exist.'.format(corpus_name))
-            raise Exception('Given Corpus Name is not in corpora list.')
-
-    def disconnect(self):
-        '''
-        Disconnect from CQP server
-
-        Disconnects from the CQP server. Closes used socket after disconnect.
-        '''
-        self.ctrl_bye()
-        print('Disconnected from cqp server.')
-
-    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
-        '''
-        Create subcorpus
-
-        Input query will be used to create a subcorpus holding all cpos match
-        positions for that query.
-
-        Keyword arguments:
-        result_subcorpus_name -- set name of the subcorpus which holds all
-        cpos match positions, produced by the query
-        query -- query written in cqp query language
-        '''
-        self.query = query
-        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
-        self.result_subcorpus = (self.corpus_name
-                                 + ':'
-                                 + result_subcorpus_name)
-        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
-        self.match_count = self.cqp_subcorpus_size(self.result_subcorpus)
-        print('Nr of all matches is: {}'.format(self.match_count))
-
-    def show_subcorpora(self):
-        '''
-        Show all subcorpora currently saved by the cqp server.
-        '''
-        return self.cqp_list_subcorpora(self.corpus_name)
-
-    def show_query_results(self,
-                           context_len=10,
-                           result_len=1000,
-                           result_offset=0):
-        '''
-        Show query results
-
-        Shows the actual matched strings produce by the query. Uses the cpos
-        match indexes to grab those strings. saves them into an orderd
-        dictionary. Also saves coresponding tags, lemmas and context. Gets those
-        informations using the corresponding cpos.
-
-        Keyword arguments:
-        context_len -- defines how many words before and after a match will be
-        shown (default 10)
-        result_len -- defines for how many matches all informations like lemma
-        and POS are being grabbed
-        result_offset -- defines the offset of the matches being requested. If
-        the offset is 100 informations for matches 100 to result_len are being
-        grabbed
-        '''
-        t0 = time.time()
-        self.context_len = context_len
-        self.corpus_max_len = self.cl_attribute_size(
-                                   self.attr_strings['positional_attrs']['word']
-                              )
-        self.nr_matches = min(result_len, self.match_count)
-        if self.match_count == 0:
-            print('Query resulted in 0 matches.')
-            self.results = {'code': 0,
-                            'result': {'matches': [],
-                                       'match_count': self.match_count,
-                                       'cpos_lookup': {},
-                                       'text_lookup': {}}
-                            }
-            return self.results
-        else:
-            # Get match cpos boundries
-            # match_boundries shows the start and end cpos of one match as a
-            # pair of cpositions
-            # [(1355, 1357), (1477, 1479)] Example for two boundry pairs
-            offset_start = 0 if result_offset == 0 else result_offset
-            print('Offset start is: {}'.format(offset_start))
-            offset_end = min((self.nr_matches + result_offset - 1), self.match_count - 1)
-            print('Offset end is: {}'.format(offset_end))
-            match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus,
-                                                           CONST_FIELD_MATCH,
-                                                           offset_start,
-                                                           offset_end),
-                                   self.cqp_dump_subcorpus(self.result_subcorpus,
-                                                           CONST_FIELD_MATCHEND,
-                                                           offset_start,
-                                                           offset_end))
-
-        # Generate all cpos between match boundries including start and end
-        # boundries.
-        # Also generate cpos for left and right context.
-        # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
-        # Also collect all cpos together in one list for the final request of
-        # all cpos informations
-        all_matches = []
-        all_cpos = []
-        for start, end in match_boundaries:
-            end += 1
-            lc_cpos = list(range(max([0, start - self.context_len]), start))
-            lc = {'lc': lc_cpos}
-            match_cpos = list(range(start, end))
-            match = {'hit': match_cpos}
-            rc_cpos = list(range(end, min([self.corpus_max_len,
-                                           end + self.context_len])))
-            rc = {'rc': rc_cpos}
-            lc.update(match)
-            lc.update(rc)
-            all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
-            all_matches.append(lc)
-
-        all_cpos = list(set(all_cpos))  # get rid of cpos duplicates
-        len_all_cpos = len(all_cpos)
-        t1 = time.time()
-        t_total = t1 - t0
-        print('Time to create all CPOS for query: {}'.format(t_total))
-        print('Requesting {} CPOS with one query.'.format(len_all_cpos))
-
-        # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
-        # all cpos entries in all_cpos_list
-        # Also saves these informations into self.results dict
-        t2 = time.time()
-        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
-        t3 = time.time()
-        t_final = t3 - t2
-        print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
-                                                                     t_final))
-        self.results = {'code': 0,
-                        'result': {'matches': all_matches,
-                                   'match_count': self.match_count,
-                                   'cpos_lookup': all_cpos_infos,
-                                   'text_lookup': text_lookup}
-                        }
-        return self.results
-
-    def get_cpos_infos(self, all_cpos):
-        '''
-        Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
-        all cpos entries specified in the parameter all_cpos.
-        '''
-        # Get all positional attribute informations
-        cpos_infos = {}
-        for p_attr_key in self.attr_strings['positional_attrs'].keys():
-            match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos)
-            cpos_infos[p_attr_key] = match_strs
-
-        # Get all strucutural attribute informations
-        tmp_info = {}
-        structs_to_check = []
-        for struct_attr_key in self.attr_strings['struct_attrs'].keys():
-            key = self.attr_strings['struct_attrs'][struct_attr_key]
-            has_value = self.corpus_structural_attribute_has_values(key)
-            struct_ids = self.cl_cpos2struc(key, all_cpos)
-            if has_value is False:  # Get IDs of strucutural elements without values (this means get IDs of XML tags. Struct elements only have values if they are XML attributes)
-                tmp_info[struct_attr_key] = []
-                for id in struct_ids:
-                    tmp_info[struct_attr_key].append(id)
-            else:
-                structs_to_check.append({key: struct_attr_key})
-        print('Structs to check: {}'.format(structs_to_check))
-        struct_attr_values = list(tmp_info.values())
-        # print('Struct attr value list: {}'.format(struct_attr_values))
-        struct_attr_keys = list(tmp_info.keys())
-        # print('Struct attr key list: {}'.format(struct_attr_keys))
-
-        # Build textlookup dictionary
-        text_lookup_ids = list(set(struct_attr_values[0]))  # every CPOS is associated with one text id. A set is build to only gather text_lookup informations for every unique text id
-        text_lookup = {}  # final dict containing all info of one text identified by its id
-        for d in structs_to_check:
-            s_key, s_value = zip(*d.items())
-            print('dict entries: {}: {}'.format(s_key, s_value))
-            s_value = s_value[0].split('_', 1)[-1]
-            print('S_VALUE: {}'.format(s_value))
-            struct_values = self.cl_struc2str(s_key[0], text_lookup_ids)
-            print('Extracted Value with key {}: {}'.format(s_key[0], struct_values))
-            zipped = dict(zip(text_lookup_ids, struct_values))
-            for zip_key, zip_value in zipped.items():
-                print('Text id as key is: {}'.format(zip_key))
-                print('Value of this text is: {}'.format(zip_value))
-                check = text_lookup.get(zip_key)
-                print('check: {}'.format(check))
-                if check is None:
-                    text_lookup[zip_key] = {s_value: zip_value}
-                else:
-                    text_lookup[zip_key].update({s_value: zip_value})
-
-        # zip keys and values together
-        attr_values_list = []
-        attr_keys_list = []
-        for key in cpos_infos.keys():
-            attr_values_list.append(cpos_infos[key])
-            attr_keys_list.append(key)
-        attr_keys_list.extend(struct_attr_keys)
-        attr_values_list.extend(struct_attr_values)
-        joined_cpos_infos = zip(all_cpos, *attr_values_list)
-        dict_cpos_infos = {}
-        for info in joined_cpos_infos:
-            dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
-        return dict_cpos_infos, text_lookup
-
-    def get_sentences(self,
-                      match_cpos_list,
-                      get_surrounding_s=False,
-                      l_r_s_context_additional_len=1):
-        '''
-        Get sentence informations for one match also set if and how much left
-        right context sentences should be grabbed surrounding the given CPOS.
-        '''
-        t0 = time.time()
-        key = self.corpus_name + '.s'
-        first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
-        context_sentences = {}
-        s_ids = self.cl_cpos2struc(key, [first_cpos, last_cpos])
-        print('s id match: {}'.format(s_ids))
-        for s_id in s_ids:
-            s_start, s_end = self.cl_struc2cpos(key, s_id)
-            s_cpos = list(range(s_start, s_end + 1))
-            context_sentences[s_id] = s_cpos
-        if get_surrounding_s:
-            max_s_id = self.cl_attribute_size(key) - 1
-            print('max sid: {}'.format(max_s_id))
-            additional_s_ids = []
-            additional_s = list(range(max(s_ids[0]
-                                          - l_r_s_context_additional_len,
-                                          0),
-                                      min(s_ids[-1]
-                                          + l_r_s_context_additional_len,
-                                          max_s_id) + 1))
-            additional_s_ids.extend(additional_s)
-            for s_id in additional_s_ids:
-                print('s id additional: {}'.format(s_id))
-                s_start, s_end = self.cl_struc2cpos(key, s_id)
-                s_cpos = list(range(s_start, s_end + 1))
-                context_sentences[s_id] = s_cpos
-        all_cpos = []
-        for key in context_sentences.keys():
-            all_cpos.extend(context_sentences[key])
-        all_cpos = list(set(all_cpos))
-        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
-        t1 = time.time()
-        t_total = t1 - t0
-        print('Got all sentences informations in {} seconds'. format(t_total))
-        match_context = {'context_s_cpos': context_sentences,
-                         'cpos_lookup': all_cpos_infos,
-                         'text_lookup': text_lookup,
-                         'match_cpos_list': match_cpos_list}
-        return match_context