diff --git a/app/corpora/cqi/api/client.py b/app/corpora/cqi/api/client.py index 30896494..6a9460db 100644 --- a/app/corpora/cqi/api/client.py +++ b/app/corpora/cqi/api/client.py @@ -421,13 +421,8 @@ class APIClient: self.port = port self.socket = socket.socket() - def setup(self): - self.socket.connect((self.host, self.port)) - - def teardown(self): - self.socket.close() - def ctrl_connect(self, username, password): + self.socket.connect((self.host, self.port)) # INPUT: (STRING username, STRING password) # OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED self.__send_WORD(CTRL_CONNECT) @@ -439,7 +434,9 @@ class APIClient: # INPUT: () # OUTPUT: CQI_STATUS_BYE_OK self.__send_WORD(CTRL_BYE) - return self.__recv_response() + response = self.__recv_response() + self.socket.close() + return response def ctrl_user_abort(self): # INPUT: () @@ -575,7 +572,7 @@ class APIClient: # INPUT: (STRING attribute) # OUTPUT: CQI_STATUS_OK # unload attribute from memory - self.__send_WORD(CL_LEXICON_SIZE) + self.__send_WORD(CL_DROP_ATTRIBUTE) self.__send_STRING(attribute) return self.__recv_response() @@ -589,7 +586,7 @@ class APIClient: # OUTPUT: CQI_DATA_INT_LIST # returns -1 for every string in that is not found in the # lexicon - self.__send_WORD(CL_LEXICON_SIZE) + self.__send_WORD(CL_STR2ID) self.__send_STRING(attribute) self.__send_STRING_LIST(strings) return self.__recv_response() diff --git a/app/corpora/cqi/client.py b/app/corpora/cqi/client.py index 5917f2fa..50de46b4 100644 --- a/app/corpora/cqi/client.py +++ b/app/corpora/cqi/client.py @@ -1,31 +1,15 @@ from .api import APIClient -from .constants import MAJOR_VERSION, MINOR_VERSION from .models.corpora import CorpusCollection -class CQiClient(APIClient): +class CQiClient: def __init__(self, host, port=4877): - super(CQiClient, self).__init__(host, port=port) + self.api = APIClient(host, port=port) def connect(self, username='anonymous', password=''): - super(CQiClient, self).setup() - self.ctrl_connect(username, password) - self.__load() + self.api.ctrl_connect(username, password) + self.corpora = CorpusCollection(self) def disconnect(self): - self.ctrl_bye() - super(CQiClient, self).teardown() - - def __load(self): - self.corpora = CorpusCollection(self) - self.info = {'version': '{}.{}'.format(MAJOR_VERSION, MINOR_VERSION)} - - def features(self): - features = [] - if self.ask_feature_cqi_1_0(): - features.append('cqi_1_0') - if self.ask_feature_cl_2_3(): - features.append('cl_2_3') - if self.ask_feature_cqp_2_3(): - features.append('cqp_2_3') - return features + del self.corpora + self.api.ctrl_bye() diff --git a/app/corpora/cqi/models/attributes.py b/app/corpora/cqi/models/attributes.py new file mode 100644 index 00000000..8c8e0a72 --- /dev/null +++ b/app/corpora/cqi/models/attributes.py @@ -0,0 +1,139 @@ +class AttributeCollection: + def __init__(self, client, corpus): + self.client = client + self.corpus = corpus + self.alignment = AlignmentAttributeCollection(client, corpus) + self.positional = PositionalAttributeCollection(client, corpus) + self.structural = StructuralAttributeCollection(client, corpus) + + +class Attribute: + def __init__(self, client, corpus, name): + self.client = client + self.corpus = corpus + self.name = name + self._name = '{}.{}'.format(corpus.name, name) + self.size = client.api.cl_attribute_size(self._name) + + def drop(self): + return self.client.api.cl_drop_attribute(self._name) + + +class AlignmentAttributeCollection: + def __init__(self, client, corpus): + self.client = client + self.corpus = corpus + + def get(self, name): + return AlignmentAttribute(self.client, self.corpus, name) + + def list(self): + return [AlignmentAttribute(self.client, self.corpus, attr) for attr in + self.client.api.corpus_alignment_attributes(self.corpus.name)] + + +class AlignmentAttribute(Attribute): + def cpos_by_ids(self, id_list): + return self.client.api.cl_alg2cpos(self._name, id_list) + + def ids_by_cpos(self, cpos_list): + return self.client.api.cl_cpos2alg(self._name, cpos_list) + + +class PositionalAttributeCollection: + def __init__(self, client, corpus): + self.client = client + self.corpus = corpus + + def get(self, name): + return PositionalAttribute(self.client, self.corpus, name) + + def list(self): + return [PositionalAttribute(self.client, self.corpus, attr) for attr in + self.client.api.corpus_positional_attributes(self.corpus.name)] + + +class PositionalAttribute(Attribute): + def __init__(self, client, corpus, name): + super(PositionalAttribute, self).__init__(client, corpus, name) + self.lexicon_size = client.api.cl_lexicon_size(self._name) + + def cpos_by_id(self, id): + return self.client.api.cl_id2cpos(self._name, id) + + def cpos_by_ids(self, id_list): + return self.client.api.cl_idlist2cpos(self._name, id_list) + + def freqs_by_ids(self, id_list): + return self.client.api.cl_id2freq(self._name, id_list) + + def ids_by_cpos(self, cpos_list): + return self.client.api.cl_cpos2id(self._name, cpos_list) + + def ids_by_regex(self, regex): + return self.client.api.cl_regex2id(self._name, regex) + + def ids_by_values(self, value_list): + return self.client.api.cl_str2id(self._name, value_list) + + def values_by_cpos(self, cpos_list): + return self.client.api.cl_cpos2str(self._name, cpos_list) + + def values_by_ids(self, id_list): + return self.client.api.cl_id2str(self._name, id_list) + + +class StructuralAttributeCollection: + def __init__(self, client, corpus): + self.client = client + self.corpus = corpus + + def get(self, name): + return StructuralAttribute(self.client, self.corpus, name) + + def list(self, filters={}): + attrs = [StructuralAttribute(self.client, self.corpus, attr) for attr + in self.client.api.corpus_structural_attributes( + self.corpus.name)] + for k, v in filters.items(): + if k == 'part_of': + attrs = list(filter(lambda x: x.name.startswith(v.name + '_'), + attrs)) + return attrs + + +class StructuralAttribute(Attribute): + def __init__(self, client, corpus, name): + super(StructuralAttribute, self).__init__(client, corpus, name) + self.has_values = client.api.corpus_structural_attribute_has_values( + self._name + ) + + def cpos_by_id(self, id): + return self.client.api.cl_struc2cpos(self._name, id) + + def ids_by_cpos(self, cpos_list): + return self.client.api.cl_cpos2struc(self._name, cpos_list) + + def lbound_by_cpos(self, cpos_list): + return self.client.api.cl_cpos2lbound(self._name, cpos_list) + + def rbound_by_cpos(self, cpos_list): + return self.client.api.cl_cpos2rbound(self._name, cpos_list) + + def values_by_ids(self, id_list): + return self.client.api.cl_struc2str(self._name, id_list) + + def lrcontext_by_cpos(self, cpos_first, cpos_last=None, context=3): + if cpos_last is None: + cpos_last = cpos_first + id_l, id_r = self.ids_by_cpos([cpos_first, cpos_last]) + id_lc = max(0, id_l - context) + id_rc = min(id_r + context, self.size - 1) + ids = {id: list(range(*self.cpos_by_id(id))) for id + in range(id_lc, id_rc)} + cpos_list = [cpos for cpos_list in ids.values() for cpos in cpos_list] + lookups = self.corpus.lookups_by_cpos(cpos_list) + return {'ids': ids, + 'match_cpos_list': list(range(cpos_first, cpos_last)), + **lookups} diff --git a/app/corpora/cqi/models/corpora.py b/app/corpora/cqi/models/corpora.py index 4d25318d..f9537a9c 100644 --- a/app/corpora/cqi/models/corpora.py +++ b/app/corpora/cqi/models/corpora.py @@ -1,3 +1,4 @@ +from .attributes import AttributeCollection from .subcorpora import SubcorpusCollection @@ -9,94 +10,59 @@ class CorpusCollection: return Corpus(self.client, name) def list(self): - return [Corpus(self.client, corpus) for corpus - in self.client.corpus_list_coprora()] + return [Corpus(self.client, corpus) for corpus in + self.client.api.corpus_list_coprora()] class Corpus: def __init__(self, client, name): self.client = client self.name = name - self.__load() + self.size = client.api.cl_attribute_size('{}.word'.format(name)) + # self.info = client.api.corpus_info(name) + self.charset = client.api.corpus_charset(name) + # self.full_name = client.api.corpus_full_name(name) + self.properties = client.api.corpus_properties(name) + self.attributes = AttributeCollection(client, self) + self.subcorpora = SubcorpusCollection(client, self) - def __load(self): - self.size = self.client.cl_attribute_size('{}.word'.format(self.name)) - # self.info = client.corpus_info(self.name) - self.charset = self.client.corpus_charset(self.name) - # self.full_name = self.client.corpus_full_name(self.name) - self.properties = self.client.corpus_properties(self.name) - self.alignment_attributes = \ - self.client.corpus_alignment_attributes(self.name) - self.structural_attributes = \ - self.client.corpus_structural_attributes(self.name) - self.positional_attributes = \ - self.client.corpus_positional_attributes(self.name) - self.subcorpora = SubcorpusCollection(self.client, self) - - def alg2cpos(self, attribute, alg): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_alg2cpos(__attribute, alg) - - def cpos2alg(self, attribute, cpos_list): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_cpos2alg(__attribute, cpos_list) - - def cpos2id(self, attribute, cpos_list): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_cpos2id(__attribute, cpos_list) - - def cpos2lbound(self, attribute, cpos_list): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_cpos2lbound(__attribute, cpos_list) - - def cpos2rbound(self, attribute, cpos_list): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_cpos2rbound(__attribute, cpos_list) - - def cpos2str(self, attribute, cpos_list): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_cpos2str(__attribute, cpos_list) - - def cpos2struc(self, attribute, cpos_list): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_cpos2struc(__attribute, cpos_list) - - def id2cpos(self, attribute, id): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_id2cpos(__attribute, id) - - def idlist2cpos(self, attribute, ids): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_idlist2cpos(__attribute, ids) - - def id2freq(self, attribute, ids): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_id2freq(__attribute, ids) - - def id2str(self, attribute, ids): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_id2str(__attribute, ids) + def lookups_by_cpos(self, cpos_list): + cpos_list = list(set(cpos_list)) + lookups = {} + if cpos_list: + lookups['cpos_lookup'] = {} + for cpos in cpos_list: + lookups['cpos_lookup'][cpos] = {} + for attr in self.attributes.positional.list(): + cpos_attr_values = attr.values_by_cpos(cpos_list) + for i, cpos in enumerate(cpos_list): + lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_values[i] + for attr in self.attributes.structural.list(): + if attr.has_values: + continue + cpos_attr_ids = attr.ids_by_cpos(cpos_list) + for i, cpos in enumerate(cpos_list): + if cpos_attr_ids[i] != -1: + lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_ids[i] + occured_attr_ids = list(filter(lambda x: x != -1, + set(cpos_attr_ids))) + if not occured_attr_ids: + continue + subattrs = self.attributes.structural.list( + filters={'part_of': attr}) + if not subattrs: + continue + lookup_name = '{}_lookup'.format(attr.name) + lookups[lookup_name] = {} + for attr_id in occured_attr_ids: + lookups[lookup_name][attr_id] = {} + for subattr in subattrs: + subattr_values = subattr.values_by_ids(occured_attr_ids) + for i, subattr_value in enumerate(subattr_values): + lookups[lookup_name][occured_attr_ids[i]][subattr.name] = \ + subattr_value + return lookups def query(self, query, subcorpus_name='Results'): - self.client.cqp_query(self.name, subcorpus_name, query) + self.client.api.cqp_query(self.name, subcorpus_name, query) return self.subcorpora.get('Results') - - def regex2id(self, attribute, regex): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_regex2id(__attribute, regex) - - def structural_attribute_has_values(self, attribute): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.corpus_structural_attribute_has_values(__attribute) - - def str2id(self, attribute, strings): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_str2id(__attribute, strings) - - def struc2cpos(self, attribute, struc): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_struc2cpos(__attribute, struc) - - def struc2str(self, attribute, strucs): - __attribute = '{}.{}'.format(self.name, attribute) - return self.client.cl_struc2str(__attribute, strucs) diff --git a/app/corpora/cqi/models/subcorpora.py b/app/corpora/cqi/models/subcorpora.py index 6606639b..38ba3957 100644 --- a/app/corpora/cqi/models/subcorpora.py +++ b/app/corpora/cqi/models/subcorpora.py @@ -3,47 +3,42 @@ from ..constants import (CONST_FIELD_KEYWORD, CONST_FIELD_MATCH, class SubcorpusCollection: - def __init__(self, client, parent_corpus): + def __init__(self, client, corpus): self.client = client - self.parent_corpus = parent_corpus + self.corpus = corpus def get(self, name): - return Subcorpus(self.client, self.parent_corpus, name) + return Subcorpus(self.client, self.corpus, name) def list(self): - return [Subcorpus(self.client, self.parent_corpus, subcorpus) - for subcorpus - in self.client.cqp_list_subcorpora(self.parent_corpus.name)] + return [Subcorpus(self.client, self.corpus, subcorpus) for subcorpus in + self.client.api.cqp_list_subcorpora(self.corpus.name)] class Subcorpus: - def __init__(self, client, parent_corpus, name): + def __init__(self, client, corpus, name): self.client = client - self.parent_corpus = parent_corpus + self.corpus = corpus self.name = name - self.__name = '{}:{}'.format(parent_corpus.name, name) - self.__load() - - def __load(self): + self._name = '{}:{}'.format(corpus.name, name) self.fields = {} - if self.client.cqp_subcorpus_has_field(self.__name, CONST_FIELD_MATCH): + if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_MATCH): self.fields['match'] = CONST_FIELD_MATCH - if self.client.cqp_subcorpus_has_field(self.__name, - CONST_FIELD_MATCHEND): + if client.api.cqp_subcorpus_has_field(self._name, + CONST_FIELD_MATCHEND): self.fields['matchend'] = CONST_FIELD_MATCHEND - if self.client.cqp_subcorpus_has_field(self.__name, - CONST_FIELD_TARGET): + if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_TARGET): self.fields['target'] = CONST_FIELD_TARGET - if self.client.cqp_subcorpus_has_field(self.__name, - CONST_FIELD_KEYWORD): + if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_KEYWORD): self.fields['keyword'] = CONST_FIELD_KEYWORD - self.size = self.client.cqp_subcorpus_size(self.__name) + self.size = client.api.cqp_subcorpus_size(self._name) def drop(self): - return self.client.cqp_drop_subcorpus(self.__name) + return self.client.api.cqp_drop_subcorpus(self._name) def dump(self, field, first, last): - return self.client.cqp_dump_subcorpus(self.__name, field, first, last) + return self.client.api.cqp_dump_subcorpus(self._name, field, first, + last) def dump_values(self, context=25, first_result=0, num_results=float('inf')): @@ -58,57 +53,21 @@ class Subcorpus: last_result)) for match_start, match_end in match_boundaries: left_start = max(0, match_start - context) - right_end = min(self.parent_corpus.size, (match_end + 1 + context)) + right_end = min(self.corpus.size, (match_end + 1 + context)) matches.append({'lc': list(range(left_start, match_start)), 'hit': list(range(match_start, match_end + 1)), 'rc': list(range(match_end + 1, right_end))}) cpos_list = [] for match in matches: cpos_list += match['lc'] + match['hit'] + match['rc'] - cpos_list = list(set(cpos_list)) - lookups = {} - if len(cpos_list) > 0: - lookups['cpos_lookup'] = {} - for cpos in cpos_list: - lookups['cpos_lookup'][cpos] = {} - for attr in self.parent_corpus.positional_attributes: - cpos_attr_values = self.parent_corpus.cpos2str(attr, cpos_list) - for i, cpos in enumerate(cpos_list): - lookups['cpos_lookup'][cpos][attr] = cpos_attr_values[i] - for attr in self.parent_corpus.structural_attributes: - if self.parent_corpus.structural_attribute_has_values(attr): - continue - cpos_attr_ids = self.parent_corpus.cpos2struc(attr, cpos_list) - for i, cpos in enumerate(cpos_list): - if cpos_attr_ids[i] != -1: - lookups['cpos_lookup'][cpos][attr] = cpos_attr_ids[i] - occured_attr_ids = list(set(cpos_attr_ids)) - occured_attr_ids = list(filter(lambda x: x != -1, - occured_attr_ids)) - if len(occured_attr_ids) == 0: - continue - attr_subattrs = \ - list(filter(lambda x: x.startswith(attr + '_'), - self.parent_corpus.structural_attributes)) - attr_subattrs = list(map(lambda x: x.split('_', 1)[1], - attr_subattrs)) - if len(attr_subattrs) == 0: - continue - lookups[attr + '_lookup'] = {} - for attr_id in occured_attr_ids: - lookups[attr + '_lookup'][attr_id] = {} - for subattr in attr_subattrs: - __subattr = attr + '_' + subattr - subattr_values = \ - self.parent_corpus.struc2str(__subattr, occured_attr_ids) - for i, value in enumerate(subattr_values): - lookups[attr + '_lookup'][occured_attr_ids[i]][subattr] = \ - value + lookups = self.corpus.lookups_by_cpos(cpos_list) return {'matches': matches, **lookups} def fdist_1(self, cutoff, field, attribute): - return self.client.cqp_fdist_1(self.__name, cutoff, field, attribute) + return self.client.api.cqp_fdist_1(self._name, cutoff, field, + attribute._name) def fdist_2(self, cutoff, field_1, attribute_1, field_2, attribute_2): - return self.client.cqp_fdist_2(self.__name, cutoff, field_1, - attribute_1, field_2, attribute_2) + return self.client.api.cqp_fdist_2(self._name, cutoff, + field_1, attribute_1._name, + field_2, attribute_2._name) diff --git a/app/corpora/cqi/wrapper.py b/app/corpora/cqi/wrapper.py index 0b970f11..3dcfc2d5 100644 --- a/app/corpora/cqi/wrapper.py +++ b/app/corpora/cqi/wrapper.py @@ -32,7 +32,6 @@ class CQiWrapper(APIClient): Connects via socket to the CQP server using the given username and password from class initiation. ''' - super(CQiWrapper, self).setup() self.ctrl_connect(self.username, self.password) def __create_attribute_strings(self): @@ -79,7 +78,6 @@ class CQiWrapper(APIClient): Disconnects from the CQP server. Closes used socket after disconnect. ''' self.ctrl_bye() - super(CQiWrapper, self).teardown() print('Disconnected from cqp server.') def query_subcorpus(self, query, result_subcorpus_name='Query-results'):