# nopaque/app/corpora/cqi/wrapper.py

from .api import APIClient
from .constants import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
import time


class CQiWrapper(APIClient):
    '''
    CQiWrapper object

    High level wrapper that groups and renames some functions of CQiClient
    for ease of use. Also structures received data into python dictionaries.

    Keyword arguments:
    host -- host IP address or hostname where the cqp server is running
    port -- port of the cqp server
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    '''

    SUBCORPUS_NAMES = []

    def __init__(self, host='127.0.0.1', port=4877, username='anonymous',
                 password=''):
        '''
        Initialise the APIClient base with host and port and store the
        credentials that connect() later passes to ctrl_connect.

        Keyword arguments:
        host -- host IP address or hostname of the cqp server
        (default '127.0.0.1')
        port -- port of the cqp server (default 4877)
        username -- username used to connect to the cqp server
        (default 'anonymous')
        password -- password of the user to connect to the cqp server
        (default '')
        '''
        super(CQiWrapper, self).__init__(host, port=port)
        self.username = username
        self.password = password
        # The class level SUBCORPUS_NAMES list is a single object shared by
        # all instances, so names appended by one wrapper would show up in
        # every other wrapper. Shadow it with a list owned by this instance.
        self.SUBCORPUS_NAMES = []

    def connect(self):
        '''
        Connect with CQP server

        Connects via socket to the CQP server using the given username and
        password from class initiation.
        '''
        self.ctrl_connect(self.username, self.password)

    def __create_attribute_strings(self):
        '''
        Creates all needed attribute strings to query for word, lemma etc. in
        the given corpus.
        For example: CORPUS_NAME.word to query words
        Automaticalle creates strings for all pre defined tags.
        '''
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
        self.attr_strings = {}
        self.attr_strings['positional_attrs'] = {}
        self.attr_strings['struct_attrs'] = {}
        for p_attr in p_attrs:
            self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name
                                                             + '.'
                                                             + p_attr)
        for struct_attr in struct_attrs:
            self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
                                                              + '.'
                                                              + struct_attr)
        print(('All positional and '
                        'structural attributes: {}').format(self.attr_strings))

    def select_corpus(self, corpus_name):
        '''
        Checks if given copus name exists. If it exists set it as the main
        corpus name used to create the needed query attribute strings like
        CORPUS_NAME.word.
        '''
        if corpus_name in self.corpus_list_coprora():
            self.corpus_name = corpus_name
            self.__create_attribute_strings()
            print('{} does exist.'.format(corpus_name))
        else:
            print('{} does not exist.'.format(corpus_name))
            raise Exception('Given Corpus Name is not in corpora list.')

    def disconnect(self):
        '''
        Disconnect from CQP server

        Disconnects from the CQP server. Closes used socket after disconnect.
        '''
        self.ctrl_bye()
        print('Disconnected from cqp server.')

    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
        '''
        Create subcorpus

        Input query will be used to create a subcorpus holding all cpos match
        positions for that query.

        Keyword arguments:
        result_subcorpus_name -- set name of the subcorpus which holds all
        cpos match positions, produced by the query
        query -- query written in cqp query language
        '''
        self.query = query
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
        self.result_subcorpus = (self.corpus_name
                                 + ':'
                                 + result_subcorpus_name)
        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
        self.match_count = self.cqp_subcorpus_size(self.result_subcorpus)
        print('Nr of all matches is: {}'.format(self.match_count))

    def show_subcorpora(self):
        '''
        Show all subcorpora currently saved by the cqp server.
        '''
        return self.cqp_list_subcorpora(self.corpus_name)

    def show_query_results(self,
                           context_len=10,
                           result_len=1000,
                           result_offset=0):
        '''
        Show query results

        Shows the actual matched strings produce by the query. Uses the cpos
        match indexes to grab those strings. saves them into an orderd
        dictionary. Also saves coresponding tags, lemmas and context. Gets those
        informations using the corresponding cpos.

        Keyword arguments:
        context_len -- defines how many words before and after a match will be
        shown (default 10)
        result_len -- defines for how many matches all informations like lemma
        and POS are being grabbed
        result_offset -- defines the offset of the matches being requested. If
        the offset is 100 informations for matches 100 to result_len are being
        grabbed
        '''
        t0 = time.time()
        self.context_len = context_len
        self.corpus_max_len = self.cl_attribute_size(
                                   self.attr_strings['positional_attrs']['word']
                              )
        self.nr_matches = min(result_len, self.match_count)
        if self.match_count == 0:
            print('Query resulted in 0 matches.')
            self.results = {'code': 0,
                            'result': {'matches': [],
                                       'match_count': self.match_count,
                                       'cpos_lookup': {},
                                       'text_lookup': {}}
                            }
            return self.results
        else:
            # Get match cpos boundries
            # match_boundries shows the start and end cpos of one match as a
            # pair of cpositions
            # [(1355, 1357), (1477, 1479)] Example for two boundry pairs
            offset_start = 0 if result_offset == 0 else result_offset
            print('Offset start is: {}'.format(offset_start))
            offset_end = min((self.nr_matches + result_offset - 1), self.match_count - 1)
            print('Offset end is: {}'.format(offset_end))
            match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus,
                                                           CONST_FIELD_MATCH,
                                                           offset_start,
                                                           offset_end),
                                   self.cqp_dump_subcorpus(self.result_subcorpus,
                                                           CONST_FIELD_MATCHEND,
                                                           offset_start,
                                                           offset_end))

        # Generate all cpos between match boundries including start and end
        # boundries.
        # Also generate cpos for left and right context.
        # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
        # Also collect all cpos together in one list for the final request of
        # all cpos informations
        all_matches = []
        all_cpos = []
        for start, end in match_boundaries:
            end += 1
            lc_cpos = list(range(max([0, start - self.context_len]), start))
            lc = {'lc': lc_cpos}
            match_cpos = list(range(start, end))
            match = {'hit': match_cpos}
            rc_cpos = list(range(end, min([self.corpus_max_len,
                                           end + self.context_len])))
            rc = {'rc': rc_cpos}
            lc.update(match)
            lc.update(rc)
            all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
            all_matches.append(lc)

        all_cpos = list(set(all_cpos))  # get rid of cpos duplicates
        len_all_cpos = len(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
        print('Time to create all CPOS for query: {}'.format(t_total))
        print('Requesting {} CPOS with one query.'.format(len_all_cpos))

        # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
        # all cpos entries in all_cpos_list
        # Also saves these informations into self.results dict
        t2 = time.time()
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t3 = time.time()
        t_final = t3 - t2
        print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
                                                                     t_final))
        self.results = {'code': 0,
                        'result': {'matches': all_matches,
                                   'match_count': self.match_count,
                                   'cpos_lookup': all_cpos_infos,
                                   'text_lookup': text_lookup}
                        }
        return self.results

    def get_cpos_infos(self, all_cpos):
        '''
        Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
        all cpos entries specified in the parameter all_cpos.
        '''
        # Get all positional attribute informations
        cpos_infos = {}
        for p_attr_key in self.attr_strings['positional_attrs'].keys():
            match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos)
            cpos_infos[p_attr_key] = match_strs

        # Get all strucutural attribute informations
        tmp_info = {}
        structs_to_check = []
        for struct_attr_key in self.attr_strings['struct_attrs'].keys():
            key = self.attr_strings['struct_attrs'][struct_attr_key]
            has_value = self.corpus_structural_attribute_has_values(key)
            struct_ids = self.cl_cpos2struc(key, all_cpos)
            if has_value is False:  # Get IDs of strucutural elements without values (this means get IDs of XML tags. Struct elements only have values if they are XML attributes)
                tmp_info[struct_attr_key] = []
                for id in struct_ids:
                    tmp_info[struct_attr_key].append(id)
            else:
                structs_to_check.append({key: struct_attr_key})
        print('Structs to check: {}'.format(structs_to_check))
        struct_attr_values = list(tmp_info.values())
        # print('Struct attr value list: {}'.format(struct_attr_values))
        struct_attr_keys = list(tmp_info.keys())
        # print('Struct attr key list: {}'.format(struct_attr_keys))

        # Build textlookup dictionary
        text_lookup_ids = list(set(struct_attr_values[0]))  # every CPOS is associated with one text id. A set is build to only gather text_lookup informations for every unique text id
        text_lookup = {}  # final dict containing all info of one text identified by its id
        for d in structs_to_check:
            s_key, s_value = zip(*d.items())
            print('dict entries: {}: {}'.format(s_key, s_value))
            s_value = s_value[0].split('_', 1)[-1]
            print('S_VALUE: {}'.format(s_value))
            struct_values = self.cl_struc2str(s_key[0], text_lookup_ids)
            print('Extracted Value with key {}: {}'.format(s_key[0], struct_values))
            zipped = dict(zip(text_lookup_ids, struct_values))
            for zip_key, zip_value in zipped.items():
                print('Text id as key is: {}'.format(zip_key))
                print('Value of this text is: {}'.format(zip_value))
                check = text_lookup.get(zip_key)
                print('check: {}'.format(check))
                if check is None:
                    text_lookup[zip_key] = {s_value: zip_value}
                else:
                    text_lookup[zip_key].update({s_value: zip_value})

        # zip keys and values together
        attr_values_list = []
        attr_keys_list = []
        for key in cpos_infos.keys():
            attr_values_list.append(cpos_infos[key])
            attr_keys_list.append(key)
        attr_keys_list.extend(struct_attr_keys)
        attr_values_list.extend(struct_attr_values)
        joined_cpos_infos = zip(all_cpos, *attr_values_list)
        dict_cpos_infos = {}
        for info in joined_cpos_infos:
            dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
        return dict_cpos_infos, text_lookup

    def get_sentences(self,
                      match_cpos_list,
                      get_surrounding_s=False,
                      l_r_s_context_additional_len=1):
        '''
        Get sentence informations for one match also set if and how much left
        right context sentences should be grabbed surrounding the given CPOS.
        '''
        t0 = time.time()
        key = self.corpus_name + '.s'
        first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
        context_sentences = {}
        s_ids = self.cl_cpos2struc(key, [first_cpos, last_cpos])
        print('s id match: {}'.format(s_ids))
        for s_id in s_ids:
            s_start, s_end = self.cl_struc2cpos(key, s_id)
            s_cpos = list(range(s_start, s_end + 1))
            context_sentences[s_id] = s_cpos
        if get_surrounding_s:
            max_s_id = self.cl_attribute_size(key) - 1
            print('max sid: {}'.format(max_s_id))
            additional_s_ids = []
            additional_s = list(range(max(s_ids[0]
                                          - l_r_s_context_additional_len,
                                          0),
                                      min(s_ids[-1]
                                          + l_r_s_context_additional_len,
                                          max_s_id) + 1))
            additional_s_ids.extend(additional_s)
            for s_id in additional_s_ids:
                print('s id additional: {}'.format(s_id))
                s_start, s_end = self.cl_struc2cpos(key, s_id)
                s_cpos = list(range(s_start, s_end + 1))
                context_sentences[s_id] = s_cpos
        all_cpos = []
        for key in context_sentences.keys():
            all_cpos.extend(context_sentences[key])
        all_cpos = list(set(all_cpos))
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
        print('Got all sentences informations in {} seconds'. format(t_total))
        match_context = {'context_s_cpos': context_sentences,
                         'cpos_lookup': all_cpos_infos,
                         'text_lookup': text_lookup,
                         'match_cpos_list': match_cpos_list}
        return match_context
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`from .api import APIClient`
			`from .constants import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND`
Update 2020-03-11 12:34:09 +00:00			`import time`
Add CQiWrapper 2019-11-07 14:48:47 +00:00

Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`class CQiWrapper(APIClient):`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`CQIiWrapper object`

			`High level wrapper that groups and renames some functions of CQiClient`
			`for ease of use. Also structures recieved data into python dictionaries.`

			`Keyword arguments:`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`host -- host IP adress or hostname wher the cqp server is running`
			`port -- port of the cqp server`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`username -- username used to connect to the cqp server`
			`password -- password of the user to connect to the cqp server`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add CQiWrapper 2019-11-07 14:48:47 +00:00
			`SUBCORPUS_NAMES = []`

Update CQi stuff 2020-03-10 15:06:47 +00:00			`def __init__(self, host='127.0.0.1', port=4877, username='anonymous',`
			`password=''):`
Add changes from cqiclient repository 2020-03-20 14:12:19 +00:00			`super(CQiWrapper, self).__init__(host, port=port)`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`self.username = username`
			`self.password = password`

			`def connect(self):`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`Connect with CQP server`

			`Connects via socket to the CQP server using the given username and`
			`password from class initiation.`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`self.ctrl_connect(self.username, self.password)`

Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`def __create_attribute_strings(self):`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`Creates all needed attribute strings to query for word, lemma etc. in`
			`the given corpus.`
			`For example: CORPUS_NAME.word to query words`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`Automaticalle creates strings for all pre defined tags.`
			`'''`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`p_attrs = self.corpus_positional_attributes(self.corpus_name)`
			`struct_attrs = self.corpus_structural_attributes(self.corpus_name)`
			`self.attr_strings = {}`
			`self.attr_strings['positional_attrs'] = {}`
			`self.attr_strings['struct_attrs'] = {}`
			`for p_attr in p_attrs:`
			`self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name`
			`+ '.'`
			`+ p_attr)`
Add Wrapper Version 2.0 2019-11-19 10:48:00 +00:00			`for struct_attr in struct_attrs:`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name`
			`+ '.'`
			`+ struct_attr)`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print(('All positional and '`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'structural attributes: {}').format(self.attr_strings))`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00
			`def select_corpus(self, corpus_name):`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
			`Checks if given copus name exists. If it exists set it as the main`
			`corpus name used to create the needed query attribute strings like`
			`CORPUS_NAME.word.`
			`'''`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`if corpus_name in self.corpus_list_coprora():`
			`self.corpus_name = corpus_name`
			`self.__create_attribute_strings()`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('{} does exist.'.format(corpus_name))`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`else:`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('{} does not exist.'.format(corpus_name))`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`raise Exception('Given Corpus Name is not in corpora list.')`
Add CQiWrapper 2019-11-07 14:48:47 +00:00
			`def disconnect(self):`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`Disconnect from CQP server`

			`Disconnects from the CQP server. Closes used socket after disconnect.`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`self.ctrl_bye()`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('Disconnected from cqp server.')`
Add CQiWrapper 2019-11-07 14:48:47 +00:00
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`def query_subcorpus(self, query, result_subcorpus_name='Query-results'):`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`Create subcorpus`

			`Input query will be used to create a subcorpus holding all cpos match`
			`positions for that query.`

			`Keyword arguments:`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`result_subcorpus_name -- set name of the subcorpus which holds all`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`cpos match positions, produced by the query`
			`query -- query written in cqp query language`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add second iteration of json download 2020-01-21 13:50:27 +00:00			`self.query = query`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`self.cqp_query(self.corpus_name, result_subcorpus_name, query)`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`self.result_subcorpus = (self.corpus_name`
			`+ ':'`
			`+ result_subcorpus_name)`
			`self.SUBCORPUS_NAMES.append(self.result_subcorpus)`
Rename some stuff 2020-01-27 12:19:33 +00:00			`self.match_count = self.cqp_subcorpus_size(self.result_subcorpus)`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('Nr of all matches is: {}'.format(self.match_count))`
Add CQiWrapper 2019-11-07 14:48:47 +00:00
			`def show_subcorpora(self):`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`Show all subcorpora currently saved by the cqp server.`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`return self.cqp_list_subcorpora(self.corpus_name)`
Add CQiWrapper 2019-11-07 14:48:47 +00:00
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`def show_query_results(self,`
			`context_len=10,`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`result_len=1000,`
			`result_offset=0):`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`'''`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`Show query results`

			`Shows the actual matched strings produce by the query. Uses the cpos`
			`match indexes to grab those strings. saves them into an orderd`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`dictionary. Also saves coresponding tags, lemmas and context. Gets those`
			`informations using the corresponding cpos.`
Add CQiWrapper 2019-11-07 14:48:47 +00:00
			`Keyword arguments:`
			`context_len -- defines how many words before and after a match will be`
			`shown (default 10)`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`result_len -- defines for how many matches all informations like lemma`
			`and POS are being grabbed`
			`result_offset -- defines the offset of the matches being requested. If`
			`the offset is 100 informations for matches 100 to result_len are being`
			`grabbed`
			`'''`
			`t0 = time.time()`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`self.context_len = context_len`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`self.corpus_max_len = self.cl_attribute_size(`
			`self.attr_strings['positional_attrs']['word']`
			`)`
Rename some stuff 2020-01-27 12:19:33 +00:00			`self.nr_matches = min(result_len, self.match_count)`
NEw analysis stuff 2020-01-27 15:11:34 +00:00			`if self.match_count == 0:`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('Query resulted in 0 matches.')`
NEw analysis stuff 2020-01-27 15:11:34 +00:00			`self.results = {'code': 0,`
			`'result': {'matches': [],`
			`'match_count': self.match_count,`
			`'cpos_lookup': {},`
			`'text_lookup': {}}`
			`}`
			`return self.results`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`else:`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`# Get match cpos boundries`
			`# match_boundries shows the start and end cpos of one match as a`
			`# pair of cpositions`
			`# [(1355, 1357), (1477, 1479)] Example for two boundry pairs`
Fixe some things for query results 2019-11-28 14:19:52 +00:00			`offset_start = 0 if result_offset == 0 else result_offset`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('Offset start is: {}'.format(offset_start))`
Rename some stuff 2020-01-27 12:19:33 +00:00			`offset_end = min((self.nr_matches + result_offset - 1), self.match_count - 1)`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('Offset end is: {}'.format(offset_end))`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus,`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`CONST_FIELD_MATCH,`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`offset_start,`
			`offset_end),`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`self.cqp_dump_subcorpus(self.result_subcorpus,`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`CONST_FIELD_MATCHEND,`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`offset_start,`
			`offset_end))`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`# Generate all cpos between match boundries including start and end`
			`# boundries.`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`# Also generate cpos for left and right context.`
			`# Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'`
			`# Also collect all cpos together in one list for the final request of`
			`# all cpos informations`
			`all_matches = []`
			`all_cpos = []`
			`for start, end in match_boundaries:`
Fixe some things for query results 2019-11-28 14:19:52 +00:00			`end += 1`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`lc_cpos = list(range(max([0, start - self.context_len]), start))`
			`lc = {'lc': lc_cpos}`
Fixe some things for query results 2019-11-28 14:19:52 +00:00			`match_cpos = list(range(start, end))`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`match = {'hit': match_cpos}`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`rc_cpos = list(range(end, min([self.corpus_max_len,`
			`end + self.context_len])))`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`rc = {'rc': rc_cpos}`
			`lc.update(match)`
			`lc.update(rc)`
			`all_cpos.extend(lc_cpos + match_cpos + rc_cpos)`
			`all_matches.append(lc)`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00
			`all_cpos = list(set(all_cpos)) # get rid of cpos duplicates`
			`len_all_cpos = len(all_cpos)`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`t1 = time.time()`
			`t_total = t1 - t0`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('Time to create all CPOS for query: {}'.format(t_total))`
			`print('Requesting {} CPOS with one query.'.format(len_all_cpos))`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00
			`# Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for`
			`# all cpos entries in all_cpos_list`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`# Also saves these informations into self.results dict`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`t2 = time.time()`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`t3 = time.time()`
			`t_final = t3 - t2`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,`
Remove redundant info 2020-01-20 14:53:06 +00:00			`t_final))`
Rename some stuff 2020-01-27 12:19:33 +00:00			`self.results = {'code': 0,`
			`'result': {'matches': all_matches,`
			`'match_count': self.match_count,`
			`'cpos_lookup': all_cpos_infos,`
NEw analysis stuff 2020-01-27 15:11:34 +00:00			`'text_lookup': text_lookup}`
Rename some stuff 2020-01-27 12:19:33 +00:00			`}`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`return self.results`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00
			`def get_cpos_infos(self, all_cpos):`
			`'''`
			`Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for`
			`all cpos entries specified in the parameter all_cpos.`
			`'''`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`# Get all positional attribute informations`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`cpos_infos = {}`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`for p_attr_key in self.attr_strings['positional_attrs'].keys():`
			`match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos)`
			`cpos_infos[p_attr_key] = match_strs`

Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`# Get all strucutural attribute informations`
			`tmp_info = {}`
			`structs_to_check = []`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`for struct_attr_key in self.attr_strings['struct_attrs'].keys():`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`key = self.attr_strings['struct_attrs'][struct_attr_key]`
			`has_value = self.corpus_structural_attribute_has_values(key)`
			`struct_ids = self.cl_cpos2struc(key, all_cpos)`
			`if has_value is False: # Get IDs of strucutural elements without values (this means get IDs of XML tags. Struct elements only have values if they are XML attributes)`
			`tmp_info[struct_attr_key] = []`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`for id in struct_ids:`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`tmp_info[struct_attr_key].append(id)`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`else:`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`structs_to_check.append({key: struct_attr_key})`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('Structs to check: {}'.format(structs_to_check))`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`struct_attr_values = list(tmp_info.values())`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`# print('Struct attr value list: {}'.format(struct_attr_values))`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`struct_attr_keys = list(tmp_info.keys())`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`# print('Struct attr key list: {}'.format(struct_attr_keys))`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00
			`# Build textlookup dictionary`
Work on new list bilding 2020-01-29 15:12:57 +00:00			`text_lookup_ids = list(set(struct_attr_values[0])) # every CPOS is associated with one text id. A set is build to only gather text_lookup informations for every unique text id`
			`text_lookup = {} # final dict containing all info of one text identified by its id`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`for d in structs_to_check:`
			`s_key, s_value = zip(*d.items())`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('dict entries: {}: {}'.format(s_key, s_value))`
Work on new list bilding 2020-01-29 15:12:57 +00:00			`s_value = s_value[0].split('_', 1)[-1]`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('S_VALUE: {}'.format(s_value))`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`struct_values = self.cl_struc2str(s_key[0], text_lookup_ids)`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('Extracted Value with key {}: {}'.format(s_key[0], struct_values))`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`zipped = dict(zip(text_lookup_ids, struct_values))`
			`for zip_key, zip_value in zipped.items():`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('Text id as key is: {}'.format(zip_key))`
			`print('Value of this text is: {}'.format(zip_value))`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`check = text_lookup.get(zip_key)`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('check: {}'.format(check))`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`if check is None:`
			`text_lookup[zip_key] = {s_value: zip_value}`
			`else:`
			`text_lookup[zip_key].update({s_value: zip_value})`

			`# zip keys and values together`
			`attr_values_list = []`
			`attr_keys_list = []`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`for key in cpos_infos.keys():`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`attr_values_list.append(cpos_infos[key])`
			`attr_keys_list.append(key)`
			`attr_keys_list.extend(struct_attr_keys)`
			`attr_values_list.extend(struct_attr_values)`
			`joined_cpos_infos = zip(all_cpos, *attr_values_list)`
Add new CQiWrapper 2019-11-18 13:24:13 +00:00			`dict_cpos_infos = {}`
			`for info in joined_cpos_infos:`
Get results with wrapper 3.0 2019-11-28 13:14:56 +00:00			`dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))`
CQiWrapper new data structure 2019-11-27 08:41:21 +00:00			`return dict_cpos_infos, text_lookup`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00
			`def get_sentences(self,`
			`match_cpos_list,`
			`get_surrounding_s=False,`
			`l_r_s_context_additional_len=1):`
			`'''`
			`Get sentence informations for one match also set if and how much left`
			`right context sentences should be grabbed surrounding the given CPOS.`
			`'''`
			`t0 = time.time()`
			`key = self.corpus_name + '.s'`
			`first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]`
			`context_sentences = {}`
			`s_ids = self.cl_cpos2struc(key, [first_cpos, last_cpos])`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('s id match: {}'.format(s_ids))`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`for s_id in s_ids:`
			`s_start, s_end = self.cl_struc2cpos(key, s_id)`
			`s_cpos = list(range(s_start, s_end + 1))`
			`context_sentences[s_id] = s_cpos`
			`if get_surrounding_s:`
Continue new list building for results 2020-02-03 11:58:40 +00:00			`max_s_id = self.cl_attribute_size(key) - 1`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('max sid: {}'.format(max_s_id))`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`additional_s_ids = []`
			`additional_s = list(range(max(s_ids[0]`
			`- l_r_s_context_additional_len,`
			`0),`
			`min(s_ids[-1]`
			`+ l_r_s_context_additional_len,`
			`max_s_id) + 1))`
			`additional_s_ids.extend(additional_s)`
			`for s_id in additional_s_ids:`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('s id additional: {}'.format(s_id))`
Add get_sentences to wrapper 2019-12-02 13:19:40 +00:00			`s_start, s_end = self.cl_struc2cpos(key, s_id)`
			`s_cpos = list(range(s_start, s_end + 1))`
			`context_sentences[s_id] = s_cpos`
			`all_cpos = []`
			`for key in context_sentences.keys():`
			`all_cpos.extend(context_sentences[key])`
			`all_cpos = list(set(all_cpos))`
			`all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)`
			`t1 = time.time()`
			`t_total = t1 - t0`
Add package implementation of cqi 2020-03-23 08:10:35 +00:00			`print('Got all sentences informations in {} seconds'. format(t_total))`
Add first things to get more context for one match. 2019-12-02 15:13:53 +00:00			`match_context = {'context_s_cpos': context_sentences,`
			`'cpos_lookup': all_cpos_infos,`
Add Inspect view 2019-12-03 14:11:31 +00:00			`'text_lookup': text_lookup,`
			`'match_cpos_list': match_cpos_list}`
Add first things to get more context for one match. 2019-12-02 15:13:53 +00:00			`return match_context`