nopaque/app/corpora/CQiWrapper/CQiWrapper.py

from .CQiClient import CQiClient
import multiprocessing
import collections


class CQiWrapper(CQiClient):
    """
    CQiWrapper object

    High level wrapper that groups and renames some functions of CQiClient
    for ease of use. Also structures received data into python dictionaries.

    The methods depend on attributes set by each other, which gives this
    call order: connect() -> set_corpus_name() -> create_attribute_strings()
    -> query_subcorpus() -> show_results() -> disconnect().

    Keyword arguments:
    host -- host forwarded to CQiClient (default '127.0.0.1')
    port -- port forwarded to CQiClient (default 4877)
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    """

    # NOTE(review): this list lives on the class, so query_subcorpus() of
    # every instance appends to the same list. Confirm that this sharing is
    # intended before relying on it per instance.
    SUBCORPUS_NAMES = []

    def __init__(self, host='127.0.0.1', port=4877, username='opaque',
                 password='opaque'):
        super().__init__(host=host, port=port)
        self.username = username
        self.password = password

    def connect(self):
        """
        Connect with CQP server

        Connects via socket to the CQP server using the given username and
        password from class initiation.
        """
        self.ctrl_connect(self.username, self.password)

    def create_attribute_strings(self):
        """
        Build the fully qualified attribute names of the current corpus

        Requests the positional and structural attributes of
        self.corpus_name (set_corpus_name() has to be called first) and
        stores them as '<corpus_name>.<attribute>' strings in
        self.attr_strings:
        {
            'positional_attrs': {'<attribute>': '<corpus_name>.<attribute>'},
            'struct_attrs': {'<attribute>': '<corpus_name>.<attribute>'}
        }
        The first structural attribute is stored in
        self.meta_struct_element (None if the corpus has none).
        """
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
        # A corpus without structural attributes must not raise here.
        self.meta_struct_element = struct_attrs[0] if struct_attrs else None
        print(p_attrs)
        print(struct_attrs)
        prefix = self.corpus_name + '.'
        self.attr_strings = {
            'positional_attrs': {attr: prefix + attr for attr in p_attrs},
            # NOTE(review): the last structural attribute is left out, as in
            # the original implementation. The reason is not visible in this
            # file -- confirm that it is intended.
            'struct_attrs': {attr: prefix + attr
                             for attr in struct_attrs[:-1]},
        }

    def set_corpus_name(self, corpus_name):
        """
        Set the corpus all following calls work on

        Keyword arguments:
        corpus_name -- name of the corpus
        """
        self.corpus_name = corpus_name

    def disconnect(self):
        """
        Disconnect from CQP server

        Disconnects from the CQP server. Closes used socket after disconnect.
        """
        try:
            self.ctrl_bye()
        finally:
            # Close the socket even if saying goodbye to the server failed.
            self.connection.close()

    def query_subcorpus(self, result_subcorpus_name, query):
        """
        Create subcorpus

        Input query will be used to create a subcorpus holding all cpos match
        positions for that query. Stores the qualified subcorpus name
        ('<corpus_name>:<result_subcorpus_name>') in self.result_subcorpus_ns
        and the number of matches in self.nr_matches.

        Keyword arguments:
        result_subcorpus_name -- user set name of the subcorpus which holds all
        cpos match positions, produced by the query
        query -- query written in cqp query language
        """
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
        self.result_subcorpus_ns = (self.corpus_name
                                    + ':'
                                    + result_subcorpus_name)
        self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns)
        print('Nr of all matches is:', self.nr_matches)

    def show_subcorpora(self):
        """
        List subcorpora

        Returns what cqp_list_subcorpora reports for the current corpus.
        """
        return self.cqp_list_subcorpora(self.corpus_name)

    def show_results(self,
                     result_start_count=0,
                     result_max_count=50,
                     context_len=10,):
        """
        Show query results

        Shows the actual matched strings produced by the query. Uses the cpos
        match indexes to grab those strings and saves them into an ordered
        dictionary, keyed by the position of the match inside the shown page
        (starting at 0). Every entry holds one key per positional attribute
        (list of strings), one key per structural attribute (set of strings)
        and the context. For example, for a corpus with the positional
        attributes word, lemma and pos and the structural attributes
        entry_title and entry_author:
        OrderedDict([
            (0,
                {
                    'word': ['Big', 'Brother', 'himself'],
                    'lemma': ['big', 'brother', 'himself'],
                    'pos': ['JJ', 'NN1', 'PPX1'],
                    'entry_title': {'1984'},
                    'entry_author': {'george_orwell'},
                    'context_before': ['figures', 'of', 'the', 'Party', ',',
                                       'almost', 'on', 'a', 'level', 'with'],
                    'context_after': [',', 'and', 'then', 'had', 'engaged',
                                      'in', 'counter-revolu-', 'tionary',
                                      'activities', ','],
                    'cpos_start': 110490,
                    'cpos_end': 110493
                }
            )
        ])
        Note that cpos_end is exclusive (one past the last matched token).

        Requires create_attribute_strings() and query_subcorpus() to have
        been called. Starts one process with its own server connection per
        shown match.

        Returns None (after printing a message) if the query had no matches
        at all and an empty OrderedDict if the requested page lies behind the
        last match.

        Keyword arguments:
        result_start_count -- start position of the dumped subcorpus.
        (default 0) If it is 0 matches 0 to 49 will be shown. If it is 50
        matches 50 to 99 will be shown.
        result_max_count -- defines how many matches at once will be shown.
        (default 50)
        context_len -- defines how many words before and after a match will be
        shown (default 10)
        """
        self.context_len = context_len
        self.corpus_max_len = self.cl_attribute_size(
            self.attr_strings['positional_attrs']['word'])
        if self.nr_matches == 0:
            print('Query resulted in 0 matches.')
            return None

        # The dump request takes an inclusive first and last match index, so
        # the page ends at start + count - 1, clamped to the last match.
        first_match = max(result_start_count, 0)
        last_match = min(first_match + result_max_count, self.nr_matches) - 1
        ordered_results = collections.OrderedDict()
        if last_match < first_match:
            return ordered_results

        # 0x10 selects the match start field, 0x11 the match end field.
        matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                                0x10,
                                                first_match,
                                                last_match)
        matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                              0x11,
                                              first_match,
                                              last_match)

        # The context manager shuts the manager process down again; the
        # results are therefore copied out of the proxy dict inside of it.
        with multiprocessing.Manager() as manager:
            return_dict = manager.dict()
            workers = []
            for i, index_pair in enumerate(zip(matches_start, matches_end)):
                worker = multiprocessing.Process(target=self.__get_matches,
                                                 args=(i,
                                                       index_pair,
                                                       self.corpus_name,
                                                       return_dict))
                workers.append(worker)
                worker.start()
            for worker in workers:
                worker.join()
            # Workers finish in arbitrary order, so sort by serial number.
            for key in sorted(return_dict.keys()):
                ordered_results[key] = return_dict[key]
        return ordered_results

    def get_cpos_info(self, cpos, session):
        """
        Get all attribute values for a span of corpus positions

        Returns a dictionary with one key per positional attribute (value as
        returned by cl_cpos2str) and one key per structural attribute (set of
        the values returned by cl_struc2str).

        Keyword arguments:
        cpos -- pair of start and end corpus position, the end is exclusive
        session -- connected session the lookups are sent with
        """
        positions = range(cpos[0], cpos[1])
        match_dict = {}
        for name, attr in self.attr_strings['positional_attrs'].items():
            match_dict[name] = session.cl_cpos2str(attr, positions)

        struct_attrs = self.attr_strings['struct_attrs']
        if struct_attrs:
            # The structure ids do not depend on the attribute that is read,
            # so they are requested only once.
            struct_entry = session.cl_cpos2struc(
                struct_attrs[self.meta_struct_element], positions)
            for name, attr in struct_attrs.items():
                match_dict[name] = set(session.cl_struc2str(attr,
                                                            struct_entry))
        return match_dict

    def __get_matches(self, i, index_pair, corpus_name, return_dict):
        """
        Get matches as readable output

        Gets the actual match strings of cpos match indexes. Private helper
        method used in show_results. Opens its own session, because it is
        run in a separate process, and always closes it again.

        Keyword arguments:
        i -- serial number for match at given cpos
        index_pair -- match start and match end cpos (end is inclusive)
        corpus_name -- name of the parent corpus (currently unused)
        return_dict -- dictionary created with manager.dict() that holds the
        extracted strings tags etc.
        """
        start = index_pair[0]
        end = index_pair[1] + 1  # exclusive end of the match
        word_attr = self.attr_strings['positional_attrs']['word']
        before_index = max(0, start - self.context_len)
        # Valid corpus positions are 0 .. corpus_max_len - 1, so this is an
        # exclusive upper bound as well.
        after_index = min(self.corpus_max_len, end + self.context_len)

        tmp_session = CQiWrapper(username=self.username,
                                 password=self.password,
                                 host=self.host, port=self.port)
        tmp_session.connect()
        try:
            match = self.get_cpos_info([start, end], tmp_session)
            context_before = tmp_session.cl_cpos2str(word_attr,
                                                     range(before_index,
                                                           start))
            # Starts directly behind the match, no token is skipped.
            context_after = tmp_session.cl_cpos2str(word_attr,
                                                    range(end, after_index))
        finally:
            tmp_session.disconnect()

        match.update({'context_before': context_before,
                      'context_after': context_after,
                      'cpos_start': start,
                      'cpos_end': end})
        return_dict[i] = match
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`from .CQiClient import CQiClient`
			`import multiprocessing`
			`import collections`


			`class CQiWrapper(CQiClient):`
			`"""`
			`CQIiWrapper object`

			`High level wrapper that groups and renames some functions of CQiClient`
			`for ease of use. Also structures recieved data into python dictionaries.`

			`Keyword arguments:`
			`username -- username used to connect to the cqp server`
			`password -- password of the user to connect to the cqp server`
			`"""`

			`SUBCORPUS_NAMES = []`

			`def __init__(self, host='127.0.0.1', port=4877, username='opaque',`
			`password='opaque'):`
			`super(CQiWrapper, self).__init__(host=host, port=port)`
			`self.username = username`
			`self.password = password`

			`def connect(self):`
			`"""`
			`Connect with CQP server`

			`Connects via socket to the CQP server using the given username and`
			`password from class initiation.`
			`"""`
			`self.ctrl_connect(self.username, self.password)`

Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`def create_attribute_strings(self):`
			`p_attrs = self.corpus_positional_attributes(self.corpus_name)`
			`struct_attrs = self.corpus_structural_attributes(self.corpus_name)`
			`self.meta_struct_element = struct_attrs[0]`
			`print(p_attrs)`
			`print(struct_attrs)`
			`self.attr_strings = {}`
			`self.attr_strings['positional_attrs'] = {}`
			`self.attr_strings['struct_attrs'] = {}`
			`for p_attr in p_attrs:`
			`self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name`
			`+ '.'`
			`+ p_attr)`
			`for struct_attr in struct_attrs[:-1]:`
			`self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name`
			`+ '.'`
			`+ struct_attr)`
			`# self.word_str = corpus_name + '.word'`
			`# self.lemma_str = corpus_name + '.lemma'`
			`# self.pos_str = corpus_name + '.pos'`
			`# self.sem_str = corpus_name + '.sem'`
			`# self.entry_str = corpus_name + '.entry'`
			`# self.entry_author_str = self.entry_str + '_author'`
			`# self.entry_title_str = self.entry_str + '_title'`
			`# self.attributes = [self.word_str,`
			`# self.lemma_str,`
			`# self.pos_str,`
			`# self.sem_str,`
			`# self.entry_str,`
			`# self.entry_author_str,`
			`# self.entry_title_str]`
			`# print(self.attributes)`

			`def set_corpus_name(self, corpus_name):`
			`self.corpus_name = corpus_name`
Add CQiWrapper 2019-11-07 14:48:47 +00:00
			`def disconnect(self):`
			`"""`
			`Disconnect from CQP server`

			`Disconnects from the CQP server. Closes used socket after disconnect.`
			`"""`
			`self.ctrl_bye()`
			`self.connection.close()`

Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`def query_subcorpus(self, result_subcorpus_name, query):`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`"""`
			`Create subcorpus`

			`Input query will be used to create a subcorpus holding all cpos match`
			`positions for that query.`

			`Keyword arguments:`
			`result_subcorpus_name -- user set name of the subcorpus which holds all`
			`cpos match positions, produced by the query`
			`query -- query written in cqp query language`
			`"""`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`self.cqp_query(self.corpus_name, result_subcorpus_name, query)`
			`self.result_subcorpus_ns = (self.corpus_name`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`+ ':'`
			`+ result_subcorpus_name)`
			`self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)`
			`self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns)`
			`print('Nr of all matches is:', self.nr_matches)`

			`def show_subcorpora(self):`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`return self.cqp_list_subcorpora(self.corpus_name)`
Add CQiWrapper 2019-11-07 14:48:47 +00:00
			`def show_results(self,`
			`result_start_count=0,`
			`result_max_count=50,`
			`context_len=10,):`
			`"""`
			`Show query results`

			`Shows the actual matched strings produce by the query. Uses the cpos`
			`match indexes to grab those strings. saves them into an orderd`
			`dictionary. Also saves coresponding tags, lemmas and context:`
			`OrderedDict([`
			`(0,`
			`{`
			`'tokens': ['Big', 'Brother', 'himself'],`
			`'lemmas': ['big', 'brother', 'himself'],`
			`'pos_tags': ['JJ', 'NN1', 'PPX1'],`
			`'sem_tags': ['\|A11.1+\|N3.2+\|N5+\|', '\|S2.2m\|S4m\|S9/S2.2m\|',`
			`'\|Z8m\|'],`
			`'context_before': ['figures', 'of', 'the', 'Party', ',',`
			`'almost', 'on', 'a', 'level', 'with'],`
			`'context_after': [',', 'and', 'then', 'had', 'engaged',`
			`'in', 'counter-revolu-', 'tionary',`
			`'activities', ','],`
			`'entry_title': '1984', 'entry_author':`
			`'george_orwell',`
			`'cpos_start': 110490,`
			`'cpos_end': 110492`
			`}`
			`)`
			`])`

			`Keyword arguments:`
			`result_start_count -- start position of the dumped subcorpus.`
			`(default 0) If it is 0 matches 0 to 50 will be shown. If it is 50`
			`matches 50 to 100 will be shown.`
			`result_max_count -- defines how many matches at once will be shown.`
			`(default 50)`
			`context_len -- defines how many words before and after a match will be`
			`shown (default 10)`
			`"""`
			`self.context_len = context_len`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word'])`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`if self.nr_matches == 0:`
			`print('Query resulted in 0 matches.')`
			`else:`
			`if self.nr_matches <= 50:`
			`matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,`
			`0x10,`
			`0,`
			`self.nr_matches - 1)`
			`matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,`
			`0x11,`
			`0, self.nr_matches - 1)`
			`else:`
			`matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,`
			`0x10,`
			`result_start_count,`
			`result_max_count - 1)`
			`matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,`
			`0x11,`
			`result_start_count,`
			`result_max_count - 1)`
			`match_indexes = zip(matches_start, matches_end)`

			`matches = []`
			`manager = multiprocessing.Manager()`
			`return_dict = manager.dict()`
			`for i, index_pair in enumerate(match_indexes):`
			`match = multiprocessing.Process(target=self.__get_matches,`
			`args=(i,`
			`index_pair,`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`self.corpus_name,`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`return_dict))`
			`matches.append(match)`
			`match.start()`
			`for match in matches:`
			`match.join()`
			`# sort matches into ordered dict`
			`ordered_results = collections.OrderedDict()`
			`for key in sorted(return_dict.keys()):`
			`ordered_results[key] = return_dict[key]`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`return ordered_results`

			`def get_cpos_info(self, cpos, session):`
			`match_dict = {}`
			`for attr_dict in self.attr_strings:`
			`# print(self.attr_strings[attr_dict])`
			`if attr_dict == 'positional_attrs':`
			`for p_attr_key in self.attr_strings[attr_dict].keys():`
			`# print(p_attr_key)`
			`match_str = session.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], range(cpos[0], cpos[1]))`
			`match_dict[p_attr_key] = match_str`
			`elif attr_dict == 'struct_attrs':`
			`for struct_attr_key in self.attr_strings[attr_dict].keys():`
			`# print(struct_attr_key)`
			`struct_entry = session.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element],`
			`range(cpos[0], cpos[1]))`
			`match_str = session.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry)`
			`match_dict[struct_attr_key] = set(match_str)`
			`return match_dict`
Add CQiWrapper 2019-11-07 14:48:47 +00:00
			`def __get_matches(self, i, index_pair, corpus_name, return_dict):`
			`"""`
			`Get matches as readable output`

			`Gets the actual match strings of cpos match indexes. Private helper`
			`method used in show_results.`

			`Keyword arguments:`
			`i -- serial number for match at given cpos`
			`index_pair -- match start and match end cpos`
			`corpus_name -- name of the parent corpus`
			`return_dict -- dictionary created with manager.dict() that holds the`
			`extracted strings tags etc.`
			`"""`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`# print('START:', index_pair[0])`
			`# print('END:', index_pair[1])`
			`# print('=============================')`
			`index_pair = [index_pair[0], index_pair[1] + 1]`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`tmp_session = CQiWrapper(username=self.username, password=self.password,`
			`host=self.host, port=self.port)`
			`tmp_session.connect()`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`match = self.get_cpos_info(index_pair, tmp_session)`
			`# tokens = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],`
			`# range(index_pair[0],`
			`# index_pair[1] + 1))`
			`# lemmas = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['lemma'],`
			`# range(index_pair[0],`
			`# index_pair[1] + 1))`
			`# pos_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['pos'],`
			`# range(index_pair[0],`
			`# index_pair[1] + 1))`
			`# sem_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['sem'],`
			`# range(index_pair[0],`
			`# index_pair[1] + 1))`
			`# struc_entry = tmp_session.cl_cpos2struc(self.attr_strings['struct_attrs']['entry'],`
			`# range(index_pair[0],`
			`# index_pair[1] + 1))`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`before_index = max([0, index_pair[0] - self.context_len])`
			`after_index = min([self.corpus_max_len,`
			`index_pair[1] + self.context_len])`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`range(before_index,`
			`index_pair[0]))`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`range(index_pair[1] + 1,`
			`after_index + 1))`
Add new CQiWrapper 2019-11-11 14:35:37 +00:00			`# entry_titles = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_title'],`
			`# struc_entry)`
			`# entry_authors = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_author'],`
			`# struc_entry)`
			`tmp_dict = {'context_before': context_before,`
			`'context_after': context_after,`
			`'cpos_start': index_pair[0],`
			`'cpos_end': index_pair[1]}`
			`match.update(tmp_dict)`
			`return_dict[i] = match`
Add CQiWrapper 2019-11-07 14:48:47 +00:00			`tmp_session.disconnect()`