Merge branch 'development' of gitlab.ub.uni-bielefeld.de:sfb1288inf/opaque into development

This commit is contained in:
Stephan Porada 2020-03-26 15:28:36 +01:00
commit 257600a2a8
6 changed files with 222 additions and 179 deletions

View File

@ -421,13 +421,8 @@ class APIClient:
self.port = port self.port = port
self.socket = socket.socket() self.socket = socket.socket()
def setup(self):
self.socket.connect((self.host, self.port))
def teardown(self):
self.socket.close()
def ctrl_connect(self, username, password): def ctrl_connect(self, username, password):
self.socket.connect((self.host, self.port))
# INPUT: (STRING username, STRING password) # INPUT: (STRING username, STRING password)
# OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED # OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED
self.__send_WORD(CTRL_CONNECT) self.__send_WORD(CTRL_CONNECT)
@ -439,7 +434,9 @@ class APIClient:
# INPUT: () # INPUT: ()
# OUTPUT: CQI_STATUS_BYE_OK # OUTPUT: CQI_STATUS_BYE_OK
self.__send_WORD(CTRL_BYE) self.__send_WORD(CTRL_BYE)
return self.__recv_response() response = self.__recv_response()
self.socket.close()
return response
def ctrl_user_abort(self): def ctrl_user_abort(self):
# INPUT: () # INPUT: ()
@ -575,7 +572,7 @@ class APIClient:
# INPUT: (STRING attribute) # INPUT: (STRING attribute)
# OUTPUT: CQI_STATUS_OK # OUTPUT: CQI_STATUS_OK
# unload attribute from memory # unload attribute from memory
self.__send_WORD(CL_LEXICON_SIZE) self.__send_WORD(CL_DROP_ATTRIBUTE)
self.__send_STRING(attribute) self.__send_STRING(attribute)
return self.__recv_response() return self.__recv_response()
@ -589,7 +586,7 @@ class APIClient:
# OUTPUT: CQI_DATA_INT_LIST # OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every string in <strings> that is not found in the # returns -1 for every string in <strings> that is not found in the
# lexicon # lexicon
self.__send_WORD(CL_LEXICON_SIZE) self.__send_WORD(CL_STR2ID)
self.__send_STRING(attribute) self.__send_STRING(attribute)
self.__send_STRING_LIST(strings) self.__send_STRING_LIST(strings)
return self.__recv_response() return self.__recv_response()

View File

@ -1,31 +1,15 @@
from .api import APIClient from .api import APIClient
from .constants import MAJOR_VERSION, MINOR_VERSION
from .models.corpora import CorpusCollection from .models.corpora import CorpusCollection
class CQiClient(APIClient): class CQiClient:
def __init__(self, host, port=4877): def __init__(self, host, port=4877):
super(CQiClient, self).__init__(host, port=port) self.api = APIClient(host, port=port)
def connect(self, username='anonymous', password=''): def connect(self, username='anonymous', password=''):
super(CQiClient, self).setup() self.api.ctrl_connect(username, password)
self.ctrl_connect(username, password) self.corpora = CorpusCollection(self)
self.__load()
def disconnect(self): def disconnect(self):
self.ctrl_bye() del self.corpora
super(CQiClient, self).teardown() self.api.ctrl_bye()
def __load(self):
self.corpora = CorpusCollection(self)
self.info = {'version': '{}.{}'.format(MAJOR_VERSION, MINOR_VERSION)}
def features(self):
features = []
if self.ask_feature_cqi_1_0():
features.append('cqi_1_0')
if self.ask_feature_cl_2_3():
features.append('cl_2_3')
if self.ask_feature_cqp_2_3():
features.append('cqp_2_3')
return features

View File

@ -0,0 +1,139 @@
class AttributeCollection:
    """Group the attribute collections of one corpus by attribute kind.

    Exposes an alignment, a positional and a structural collection as the
    ``alignment``, ``positional`` and ``structural`` attributes. All three
    are built from the same ``client`` and ``corpus``.
    """

    def __init__(self, client, corpus):
        self.client, self.corpus = client, corpus
        shared_args = (client, corpus)
        self.alignment = AlignmentAttributeCollection(*shared_args)
        self.positional = PositionalAttributeCollection(*shared_args)
        self.structural = StructuralAttributeCollection(*shared_args)
class Attribute:
    """Common base for a single attribute of a corpus.

    ``name`` is the short attribute name, ``_name`` the qualified
    ``<corpus name>.<attribute name>`` form that is handed to the API.
    The attribute size is requested once, at construction time, and kept
    in ``size``.
    """

    def __init__(self, client, corpus, name):
        self.client = client
        self.corpus = corpus
        self.name = name
        qualified_name = '{}.{}'.format(corpus.name, name)
        self._name = qualified_name
        self.size = client.api.cl_attribute_size(qualified_name)

    def drop(self):
        """Call ``cl_drop_attribute`` for this attribute; return its result."""
        api = self.client.api
        return api.cl_drop_attribute(self._name)
class AlignmentAttributeCollection:
    """Access to the alignment attributes of one corpus."""

    def __init__(self, client, corpus):
        self.client, self.corpus = client, corpus

    def get(self, name):
        """Return an ``AlignmentAttribute`` wrapper for ``name``."""
        return AlignmentAttribute(self.client, self.corpus, name)

    def list(self):
        """Return one ``AlignmentAttribute`` per name reported by the API."""
        names = self.client.api.corpus_alignment_attributes(self.corpus.name)
        attributes = []
        for attr_name in names:
            attributes.append(
                AlignmentAttribute(self.client, self.corpus, attr_name))
        return attributes
class AlignmentAttribute(Attribute):
    """An alignment attribute; forwards lookups to the ``cl_*alg*`` calls."""

    def cpos_by_ids(self, id_list):
        """Pass ``id_list`` to ``cl_alg2cpos`` and return the API result."""
        api = self.client.api
        return api.cl_alg2cpos(self._name, id_list)

    def ids_by_cpos(self, cpos_list):
        """Pass ``cpos_list`` to ``cl_cpos2alg`` and return the API result."""
        api = self.client.api
        return api.cl_cpos2alg(self._name, cpos_list)
class PositionalAttributeCollection:
    """Access to the positional attributes of one corpus."""

    def __init__(self, client, corpus):
        self.client, self.corpus = client, corpus

    def get(self, name):
        """Return a ``PositionalAttribute`` wrapper for ``name``."""
        return PositionalAttribute(self.client, self.corpus, name)

    def list(self):
        """Return one ``PositionalAttribute`` per name reported by the API."""
        names = self.client.api.corpus_positional_attributes(self.corpus.name)
        attributes = []
        for attr_name in names:
            attributes.append(
                PositionalAttribute(self.client, self.corpus, attr_name))
        return attributes
class PositionalAttribute(Attribute):
    """A positional attribute of a corpus.

    Besides the base ``size`` the lexicon size is requested once at
    construction time and stored in ``lexicon_size``. Every method is a
    thin forwarder to the API call named in its docstring and returns
    that call's result unchanged.
    """

    def __init__(self, client, corpus, name):
        super().__init__(client, corpus, name)
        self.lexicon_size = client.api.cl_lexicon_size(self._name)

    def cpos_by_id(self, id):
        """Forward a single id to ``cl_id2cpos``."""
        api = self.client.api
        return api.cl_id2cpos(self._name, id)

    def cpos_by_ids(self, id_list):
        """Forward ``id_list`` to ``cl_idlist2cpos``."""
        api = self.client.api
        return api.cl_idlist2cpos(self._name, id_list)

    def freqs_by_ids(self, id_list):
        """Forward ``id_list`` to ``cl_id2freq``."""
        api = self.client.api
        return api.cl_id2freq(self._name, id_list)

    def ids_by_cpos(self, cpos_list):
        """Forward ``cpos_list`` to ``cl_cpos2id``."""
        api = self.client.api
        return api.cl_cpos2id(self._name, cpos_list)

    def ids_by_regex(self, regex):
        """Forward ``regex`` to ``cl_regex2id``."""
        api = self.client.api
        return api.cl_regex2id(self._name, regex)

    def ids_by_values(self, value_list):
        """Forward ``value_list`` to ``cl_str2id``."""
        api = self.client.api
        return api.cl_str2id(self._name, value_list)

    def values_by_cpos(self, cpos_list):
        """Forward ``cpos_list`` to ``cl_cpos2str``."""
        api = self.client.api
        return api.cl_cpos2str(self._name, cpos_list)

    def values_by_ids(self, id_list):
        """Forward ``id_list`` to ``cl_id2str``."""
        api = self.client.api
        return api.cl_id2str(self._name, id_list)
class StructuralAttributeCollection:
    """Access to the structural attributes of one corpus."""

    def __init__(self, client, corpus):
        self.client = client
        self.corpus = corpus

    def get(self, name):
        """Return a ``StructuralAttribute`` wrapper for ``name``."""
        return StructuralAttribute(self.client, self.corpus, name)

    def list(self, filters=None):
        """Return the corpus' structural attributes, optionally filtered.

        ``filters`` is an optional mapping of filter name to filter value.
        The only filter that is evaluated is ``'part_of'``: its value must
        be an object with a ``name`` attribute, and only attributes whose
        name starts with ``<value.name>_`` are kept. Unknown filter names
        are ignored.
        """
        # A ``None`` sentinel replaces the former ``{}`` default so that no
        # dict instance is shared between calls.
        if filters is None:
            filters = {}
        attrs = [StructuralAttribute(self.client, self.corpus, attr) for attr
                 in self.client.api.corpus_structural_attributes(
                     self.corpus.name)]
        for key, value in filters.items():
            if key == 'part_of':
                prefix = value.name + '_'
                attrs = [attr for attr in attrs
                         if attr.name.startswith(prefix)]
        return attrs
class StructuralAttribute(Attribute):
    """A structural attribute of a corpus.

    Besides the base ``size``, construction asks the API once whether the
    attribute carries values and stores the answer in ``has_values``.
    """

    def __init__(self, client, corpus, name):
        super(StructuralAttribute, self).__init__(client, corpus, name)
        self.has_values = client.api.corpus_structural_attribute_has_values(
            self._name
        )

    def cpos_by_id(self, id):
        """Forward a single structure id to ``cl_struc2cpos``."""
        return self.client.api.cl_struc2cpos(self._name, id)

    def ids_by_cpos(self, cpos_list):
        """Forward ``cpos_list`` to ``cl_cpos2struc``."""
        return self.client.api.cl_cpos2struc(self._name, cpos_list)

    def lbound_by_cpos(self, cpos_list):
        """Forward ``cpos_list`` to ``cl_cpos2lbound``."""
        return self.client.api.cl_cpos2lbound(self._name, cpos_list)

    def rbound_by_cpos(self, cpos_list):
        """Forward ``cpos_list`` to ``cl_cpos2rbound``."""
        return self.client.api.cl_cpos2rbound(self._name, cpos_list)

    def values_by_ids(self, id_list):
        """Forward ``id_list`` to ``cl_struc2str``."""
        return self.client.api.cl_struc2str(self._name, id_list)

    def lrcontext_by_cpos(self, cpos_first, cpos_last=None, context=3):
        """Collect the structures around a corpus position range.

        ``cpos_first`` and ``cpos_last`` delimit the match, both inclusive;
        ``cpos_last`` defaults to ``cpos_first``. ``context`` is the number
        of structures added on each side of the structures that contain
        the two boundary positions.

        Returns a dict with:
          * ``'ids'``: structure id -> list of corpus positions it covers,
          * ``'match_cpos_list'``: the corpus positions of the match,
          * everything returned by ``corpus.lookups_by_cpos`` for all
            collected corpus positions.
        """
        if cpos_last is None:
            cpos_last = cpos_first
        # NOTE(review): ids_by_cpos presumably yields -1 for a position that
        # lies outside every structure; that case is not handled here.
        id_l, id_r = self.ids_by_cpos([cpos_first, cpos_last])
        id_lc = max(0, id_l - context)
        # Clamped to the last valid structure id, i.e. an inclusive bound.
        id_rc = min(id_r + context, self.size - 1)
        ids = {}
        # "+ 1": id_rc itself belongs to the context window.
        for struc_id in range(id_lc, id_rc + 1):
            lbound, rbound = self.cpos_by_id(struc_id)
            # "+ 1": CQi reports the end position of a region inclusively.
            ids[struc_id] = list(range(lbound, rbound + 1))
        cpos_list = [cpos for region in ids.values() for cpos in region]
        lookups = self.corpus.lookups_by_cpos(cpos_list)
        return {'ids': ids,
                # "+ 1": cpos_last is part of the match; without it a single
                # position match would yield an empty list.
                'match_cpos_list': list(range(cpos_first, cpos_last + 1)),
                **lookups}

View File

@ -1,3 +1,4 @@
from .attributes import AttributeCollection
from .subcorpora import SubcorpusCollection from .subcorpora import SubcorpusCollection
@ -9,94 +10,59 @@ class CorpusCollection:
return Corpus(self.client, name) return Corpus(self.client, name)
def list(self): def list(self):
return [Corpus(self.client, corpus) for corpus return [Corpus(self.client, corpus) for corpus in
in self.client.corpus_list_coprora()] self.client.api.corpus_list_coprora()]
class Corpus: class Corpus:
def __init__(self, client, name): def __init__(self, client, name):
self.client = client self.client = client
self.name = name self.name = name
self.__load() self.size = client.api.cl_attribute_size('{}.word'.format(name))
# self.info = client.api.corpus_info(name)
self.charset = client.api.corpus_charset(name)
# self.full_name = client.api.corpus_full_name(name)
self.properties = client.api.corpus_properties(name)
self.attributes = AttributeCollection(client, self)
self.subcorpora = SubcorpusCollection(client, self)
def __load(self): def lookups_by_cpos(self, cpos_list):
self.size = self.client.cl_attribute_size('{}.word'.format(self.name)) cpos_list = list(set(cpos_list))
# self.info = client.corpus_info(self.name) lookups = {}
self.charset = self.client.corpus_charset(self.name) if cpos_list:
# self.full_name = self.client.corpus_full_name(self.name) lookups['cpos_lookup'] = {}
self.properties = self.client.corpus_properties(self.name) for cpos in cpos_list:
self.alignment_attributes = \ lookups['cpos_lookup'][cpos] = {}
self.client.corpus_alignment_attributes(self.name) for attr in self.attributes.positional.list():
self.structural_attributes = \ cpos_attr_values = attr.values_by_cpos(cpos_list)
self.client.corpus_structural_attributes(self.name) for i, cpos in enumerate(cpos_list):
self.positional_attributes = \ lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_values[i]
self.client.corpus_positional_attributes(self.name) for attr in self.attributes.structural.list():
self.subcorpora = SubcorpusCollection(self.client, self) if attr.has_values:
continue
def alg2cpos(self, attribute, alg): cpos_attr_ids = attr.ids_by_cpos(cpos_list)
__attribute = '{}.{}'.format(self.name, attribute) for i, cpos in enumerate(cpos_list):
return self.client.cl_alg2cpos(__attribute, alg) if cpos_attr_ids[i] != -1:
lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_ids[i]
def cpos2alg(self, attribute, cpos_list): occured_attr_ids = list(filter(lambda x: x != -1,
__attribute = '{}.{}'.format(self.name, attribute) set(cpos_attr_ids)))
return self.client.cl_cpos2alg(__attribute, cpos_list) if not occured_attr_ids:
continue
def cpos2id(self, attribute, cpos_list): subattrs = self.attributes.structural.list(
__attribute = '{}.{}'.format(self.name, attribute) filters={'part_of': attr})
return self.client.cl_cpos2id(__attribute, cpos_list) if not subattrs:
continue
def cpos2lbound(self, attribute, cpos_list): lookup_name = '{}_lookup'.format(attr.name)
__attribute = '{}.{}'.format(self.name, attribute) lookups[lookup_name] = {}
return self.client.cl_cpos2lbound(__attribute, cpos_list) for attr_id in occured_attr_ids:
lookups[lookup_name][attr_id] = {}
def cpos2rbound(self, attribute, cpos_list): for subattr in subattrs:
__attribute = '{}.{}'.format(self.name, attribute) subattr_values = subattr.values_by_ids(occured_attr_ids)
return self.client.cl_cpos2rbound(__attribute, cpos_list) for i, subattr_value in enumerate(subattr_values):
lookups[lookup_name][occured_attr_ids[i]][subattr.name] = \
def cpos2str(self, attribute, cpos_list): subattr_value
__attribute = '{}.{}'.format(self.name, attribute) return lookups
return self.client.cl_cpos2str(__attribute, cpos_list)
def cpos2struc(self, attribute, cpos_list):
__attribute = '{}.{}'.format(self.name, attribute)
return self.client.cl_cpos2struc(__attribute, cpos_list)
def id2cpos(self, attribute, id):
__attribute = '{}.{}'.format(self.name, attribute)
return self.client.cl_id2cpos(__attribute, id)
def idlist2cpos(self, attribute, ids):
__attribute = '{}.{}'.format(self.name, attribute)
return self.client.cl_idlist2cpos(__attribute, ids)
def id2freq(self, attribute, ids):
__attribute = '{}.{}'.format(self.name, attribute)
return self.client.cl_id2freq(__attribute, ids)
def id2str(self, attribute, ids):
__attribute = '{}.{}'.format(self.name, attribute)
return self.client.cl_id2str(__attribute, ids)
def query(self, query, subcorpus_name='Results'): def query(self, query, subcorpus_name='Results'):
self.client.cqp_query(self.name, subcorpus_name, query) self.client.api.cqp_query(self.name, subcorpus_name, query)
return self.subcorpora.get('Results') return self.subcorpora.get('Results')
def regex2id(self, attribute, regex):
__attribute = '{}.{}'.format(self.name, attribute)
return self.client.cl_regex2id(__attribute, regex)
def structural_attribute_has_values(self, attribute):
__attribute = '{}.{}'.format(self.name, attribute)
return self.client.corpus_structural_attribute_has_values(__attribute)
def str2id(self, attribute, strings):
__attribute = '{}.{}'.format(self.name, attribute)
return self.client.cl_str2id(__attribute, strings)
def struc2cpos(self, attribute, struc):
__attribute = '{}.{}'.format(self.name, attribute)
return self.client.cl_struc2cpos(__attribute, struc)
def struc2str(self, attribute, strucs):
__attribute = '{}.{}'.format(self.name, attribute)
return self.client.cl_struc2str(__attribute, strucs)

View File

@ -3,47 +3,42 @@ from ..constants import (CONST_FIELD_KEYWORD, CONST_FIELD_MATCH,
class SubcorpusCollection: class SubcorpusCollection:
def __init__(self, client, parent_corpus): def __init__(self, client, corpus):
self.client = client self.client = client
self.parent_corpus = parent_corpus self.corpus = corpus
def get(self, name): def get(self, name):
return Subcorpus(self.client, self.parent_corpus, name) return Subcorpus(self.client, self.corpus, name)
def list(self): def list(self):
return [Subcorpus(self.client, self.parent_corpus, subcorpus) return [Subcorpus(self.client, self.corpus, subcorpus) for subcorpus in
for subcorpus self.client.api.cqp_list_subcorpora(self.corpus.name)]
in self.client.cqp_list_subcorpora(self.parent_corpus.name)]
class Subcorpus: class Subcorpus:
def __init__(self, client, parent_corpus, name): def __init__(self, client, corpus, name):
self.client = client self.client = client
self.parent_corpus = parent_corpus self.corpus = corpus
self.name = name self.name = name
self.__name = '{}:{}'.format(parent_corpus.name, name) self._name = '{}:{}'.format(corpus.name, name)
self.__load()
def __load(self):
self.fields = {} self.fields = {}
if self.client.cqp_subcorpus_has_field(self.__name, CONST_FIELD_MATCH): if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_MATCH):
self.fields['match'] = CONST_FIELD_MATCH self.fields['match'] = CONST_FIELD_MATCH
if self.client.cqp_subcorpus_has_field(self.__name, if client.api.cqp_subcorpus_has_field(self._name,
CONST_FIELD_MATCHEND): CONST_FIELD_MATCHEND):
self.fields['matchend'] = CONST_FIELD_MATCHEND self.fields['matchend'] = CONST_FIELD_MATCHEND
if self.client.cqp_subcorpus_has_field(self.__name, if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_TARGET):
CONST_FIELD_TARGET):
self.fields['target'] = CONST_FIELD_TARGET self.fields['target'] = CONST_FIELD_TARGET
if self.client.cqp_subcorpus_has_field(self.__name, if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_KEYWORD):
CONST_FIELD_KEYWORD):
self.fields['keyword'] = CONST_FIELD_KEYWORD self.fields['keyword'] = CONST_FIELD_KEYWORD
self.size = self.client.cqp_subcorpus_size(self.__name) self.size = client.api.cqp_subcorpus_size(self._name)
def drop(self): def drop(self):
return self.client.cqp_drop_subcorpus(self.__name) return self.client.api.cqp_drop_subcorpus(self._name)
def dump(self, field, first, last): def dump(self, field, first, last):
return self.client.cqp_dump_subcorpus(self.__name, field, first, last) return self.client.api.cqp_dump_subcorpus(self._name, field, first,
last)
def dump_values(self, context=25, first_result=0, def dump_values(self, context=25, first_result=0,
num_results=float('inf')): num_results=float('inf')):
@ -58,57 +53,21 @@ class Subcorpus:
last_result)) last_result))
for match_start, match_end in match_boundaries: for match_start, match_end in match_boundaries:
left_start = max(0, match_start - context) left_start = max(0, match_start - context)
right_end = min(self.parent_corpus.size, (match_end + 1 + context)) right_end = min(self.corpus.size, (match_end + 1 + context))
matches.append({'lc': list(range(left_start, match_start)), matches.append({'lc': list(range(left_start, match_start)),
'hit': list(range(match_start, match_end + 1)), 'hit': list(range(match_start, match_end + 1)),
'rc': list(range(match_end + 1, right_end))}) 'rc': list(range(match_end + 1, right_end))})
cpos_list = [] cpos_list = []
for match in matches: for match in matches:
cpos_list += match['lc'] + match['hit'] + match['rc'] cpos_list += match['lc'] + match['hit'] + match['rc']
cpos_list = list(set(cpos_list)) lookups = self.corpus.lookups_by_cpos(cpos_list)
lookups = {}
if len(cpos_list) > 0:
lookups['cpos_lookup'] = {}
for cpos in cpos_list:
lookups['cpos_lookup'][cpos] = {}
for attr in self.parent_corpus.positional_attributes:
cpos_attr_values = self.parent_corpus.cpos2str(attr, cpos_list)
for i, cpos in enumerate(cpos_list):
lookups['cpos_lookup'][cpos][attr] = cpos_attr_values[i]
for attr in self.parent_corpus.structural_attributes:
if self.parent_corpus.structural_attribute_has_values(attr):
continue
cpos_attr_ids = self.parent_corpus.cpos2struc(attr, cpos_list)
for i, cpos in enumerate(cpos_list):
if cpos_attr_ids[i] != -1:
lookups['cpos_lookup'][cpos][attr] = cpos_attr_ids[i]
occured_attr_ids = list(set(cpos_attr_ids))
occured_attr_ids = list(filter(lambda x: x != -1,
occured_attr_ids))
if len(occured_attr_ids) == 0:
continue
attr_subattrs = \
list(filter(lambda x: x.startswith(attr + '_'),
self.parent_corpus.structural_attributes))
attr_subattrs = list(map(lambda x: x.split('_', 1)[1],
attr_subattrs))
if len(attr_subattrs) == 0:
continue
lookups[attr + '_lookup'] = {}
for attr_id in occured_attr_ids:
lookups[attr + '_lookup'][attr_id] = {}
for subattr in attr_subattrs:
__subattr = attr + '_' + subattr
subattr_values = \
self.parent_corpus.struc2str(__subattr, occured_attr_ids)
for i, value in enumerate(subattr_values):
lookups[attr + '_lookup'][occured_attr_ids[i]][subattr] = \
value
return {'matches': matches, **lookups} return {'matches': matches, **lookups}
def fdist_1(self, cutoff, field, attribute): def fdist_1(self, cutoff, field, attribute):
return self.client.cqp_fdist_1(self.__name, cutoff, field, attribute) return self.client.api.cqp_fdist_1(self._name, cutoff, field,
attribute._name)
def fdist_2(self, cutoff, field_1, attribute_1, field_2, attribute_2): def fdist_2(self, cutoff, field_1, attribute_1, field_2, attribute_2):
return self.client.cqp_fdist_2(self.__name, cutoff, field_1, return self.client.api.cqp_fdist_2(self._name, cutoff,
attribute_1, field_2, attribute_2) field_1, attribute_1._name,
field_2, attribute_2._name)

View File

@ -32,7 +32,6 @@ class CQiWrapper(APIClient):
Connects via socket to the CQP server using the given username and Connects via socket to the CQP server using the given username and
password from class initiation. password from class initiation.
''' '''
super(CQiWrapper, self).setup()
self.ctrl_connect(self.username, self.password) self.ctrl_connect(self.username, self.password)
def __create_attribute_strings(self): def __create_attribute_strings(self):
@ -79,7 +78,6 @@ class CQiWrapper(APIClient):
Disconnects from the CQP server. Closes used socket after disconnect. Disconnects from the CQP server. Closes used socket after disconnect.
''' '''
self.ctrl_bye() self.ctrl_bye()
super(CQiWrapper, self).teardown()
print('Disconnected from cqp server.') print('Disconnected from cqp server.')
def query_subcorpus(self, query, result_subcorpus_name='Query-results'): def query_subcorpus(self, query, result_subcorpus_name='Query-results'):