Add package implementation of cqi

This commit is contained in:
Patrick Jentsch
2020-03-23 09:10:35 +01:00
parent 7752e7fb57
commit acfcc0321b
13 changed files with 353 additions and 33 deletions

View File

View File

@ -0,0 +1,102 @@
from .subcorpora import SubcorpusCollection
class CorpusCollection:
    """Entry point for obtaining ``Corpus`` wrappers from a client."""

    def __init__(self, client):
        self.client = client

    def get(self, name):
        """Return a ``Corpus`` wrapper for the corpus called *name*."""
        return Corpus(self.client, name)

    def list(self):
        """Return a ``Corpus`` wrapper for every name the client reports."""
        # NOTE(review): "coprora" looks like a typo of "corpora", but the
        # name has to match the client's method - confirm before renaming.
        corpus_names = self.client.corpus_list_coprora()
        corpora = []
        for corpus_name in corpus_names:
            corpora.append(Corpus(self.client, corpus_name))
        return corpora
class Corpus:
    """Wrapper around a single corpus that is accessed through a client.

    On construction the corpus metadata (size, charset, properties and
    attribute lists) is fetched from the client. The attribute methods
    below all build the fully qualified ``<corpus>.<attribute>`` name and
    forward the call to the client method of the same name (prefixed with
    ``cl_`` or ``corpus_``).
    """

    def __init__(self, client, name):
        self.client = client
        self.name = name
        self.__load()

    def __load(self):
        """Fetch the corpus metadata from the client."""
        # The corpus size is read as the size of the "word" attribute.
        self.size = self.client.cl_attribute_size(self._qualify('word'))
        # NOTE: the corpus_info and corpus_full_name lookups are currently
        # disabled.
        self.charset = self.client.corpus_charset(self.name)
        self.properties = self.client.corpus_properties(self.name)
        self.alignment_attributes = \
            self.client.corpus_alignment_attributes(self.name)
        self.structural_attributes = \
            self.client.corpus_structural_attributes(self.name)
        self.positional_attributes = \
            self.client.corpus_positional_attributes(self.name)
        self.subcorpora = SubcorpusCollection(self.client, self)

    def _qualify(self, attribute):
        """Return *attribute* prefixed with the corpus name and a dot."""
        return '{}.{}'.format(self.name, attribute)

    def alg2cpos(self, attribute, alg):
        """Forward to ``client.cl_alg2cpos`` for the qualified attribute."""
        return self.client.cl_alg2cpos(self._qualify(attribute), alg)

    def cpos2alg(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2alg`` for the qualified attribute."""
        return self.client.cl_cpos2alg(self._qualify(attribute), cpos_list)

    def cpos2id(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2id`` for the qualified attribute."""
        return self.client.cl_cpos2id(self._qualify(attribute), cpos_list)

    def cpos2lbound(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2lbound`` for the qualified attribute."""
        return self.client.cl_cpos2lbound(self._qualify(attribute), cpos_list)

    def cpos2rbound(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2rbound`` for the qualified attribute."""
        return self.client.cl_cpos2rbound(self._qualify(attribute), cpos_list)

    def cpos2str(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2str`` for the qualified attribute."""
        return self.client.cl_cpos2str(self._qualify(attribute), cpos_list)

    def cpos2struc(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2struc`` for the qualified attribute."""
        return self.client.cl_cpos2struc(self._qualify(attribute), cpos_list)

    def id2cpos(self, attribute, id):
        """Forward to ``client.cl_id2cpos`` for the qualified attribute."""
        return self.client.cl_id2cpos(self._qualify(attribute), id)

    def idlist2cpos(self, attribute, ids):
        """Forward to ``client.cl_idlist2cpos`` for the qualified attribute."""
        return self.client.cl_idlist2cpos(self._qualify(attribute), ids)

    def id2freq(self, attribute, ids):
        """Forward to ``client.cl_id2freq`` for the qualified attribute."""
        return self.client.cl_id2freq(self._qualify(attribute), ids)

    def id2str(self, attribute, ids):
        """Forward to ``client.cl_id2str`` for the qualified attribute."""
        return self.client.cl_id2str(self._qualify(attribute), ids)

    def query(self, query, subcorpus_name='Results'):
        """Run *query* on this corpus and return the resulting subcorpus.

        The result is stored by the client under *subcorpus_name* and the
        subcorpus of that same name is returned.
        """
        self.client.cqp_query(self.name, subcorpus_name, query)
        # Return the subcorpus that was just created, not a fixed 'Results'.
        return self.subcorpora.get(subcorpus_name)

    def regex2id(self, attribute, regex):
        """Forward to ``client.cl_regex2id`` for the qualified attribute."""
        return self.client.cl_regex2id(self._qualify(attribute), regex)

    def structural_attribute_has_values(self, attribute):
        """Forward to ``client.corpus_structural_attribute_has_values``."""
        return self.client.corpus_structural_attribute_has_values(
            self._qualify(attribute))

    def str2id(self, attribute, strings):
        """Forward to ``client.cl_str2id`` for the qualified attribute."""
        return self.client.cl_str2id(self._qualify(attribute), strings)

    def struc2cpos(self, attribute, struc):
        """Forward to ``client.cl_struc2cpos`` for the qualified attribute."""
        return self.client.cl_struc2cpos(self._qualify(attribute), struc)

    def struc2str(self, attribute, strucs):
        """Forward to ``client.cl_struc2str`` for the qualified attribute."""
        return self.client.cl_struc2str(self._qualify(attribute), strucs)

View File

@ -0,0 +1,112 @@
from ..constants import (CONST_FIELD_KEYWORD, CONST_FIELD_MATCH,
CONST_FIELD_MATCHEND, CONST_FIELD_TARGET)
class SubcorpusCollection:
    """Access point for the subcorpora that belong to one parent corpus."""

    def __init__(self, client, parent_corpus):
        self.parent_corpus = parent_corpus
        self.client = client

    def get(self, name):
        """Return a ``Subcorpus`` wrapper for the subcorpus called *name*."""
        return Subcorpus(self.client, self.parent_corpus, name)

    def list(self):
        """Return a ``Subcorpus`` wrapper for every name the client lists."""
        subcorpus_names = \
            self.client.cqp_list_subcorpora(self.parent_corpus.name)
        subcorpora = []
        for subcorpus_name in subcorpus_names:
            subcorpora.append(
                Subcorpus(self.client, self.parent_corpus, subcorpus_name))
        return subcorpora
class Subcorpus:
    """Wrapper around one named subcorpus of a parent corpus.

    On construction the available fields and the size of the subcorpus are
    fetched from the client.
    """

    def __init__(self, client, parent_corpus, name):
        self.client = client
        self.parent_corpus = parent_corpus
        self.name = name
        # "<corpus>:<subcorpus>" form that is passed to every client call.
        self.__name = '{}:{}'.format(parent_corpus.name, name)
        self.__load()

    def __load(self):
        """Ask the client which fields the subcorpus has and how big it is."""
        candidate_fields = (('match', CONST_FIELD_MATCH),
                            ('matchend', CONST_FIELD_MATCHEND),
                            ('target', CONST_FIELD_TARGET),
                            ('keyword', CONST_FIELD_KEYWORD))
        self.fields = {}
        for key, field in candidate_fields:
            if self.client.cqp_subcorpus_has_field(self.__name, field):
                self.fields[key] = field
        self.size = self.client.cqp_subcorpus_size(self.__name)

    def drop(self):
        """Forward to ``client.cqp_drop_subcorpus`` for this subcorpus."""
        return self.client.cqp_drop_subcorpus(self.__name)

    def dump(self, field, first, last):
        """Forward to ``client.cqp_dump_subcorpus`` for this subcorpus."""
        return self.client.cqp_dump_subcorpus(self.__name, field, first, last)

    def dump_values(self, context=25, first_result=0,
                    num_results=float('inf')):
        """Return matches with context plus lookup tables for their tokens.

        Args:
            context: Number of corpus positions to include on each side of
                a match (clipped to the bounds of the parent corpus).
            first_result: Index of the first match to include; negative
                values are treated as 0.
            num_results: Maximum number of matches to include.

        Returns:
            A dict with the key ``'matches'`` (a list of dicts holding the
            corpus positions of the ``'left'`` context, the ``'hit'`` and
            the ``'right'`` context). If any positions were collected it
            also contains ``'cpos_lookup'`` and one ``'<attr>_lookup'`` per
            structural attribute that has sub-attributes.
        """
        first_result = max(0, first_result)
        # The dump range is inclusive on both ends, so a window of
        # num_results matches ends at first_result + num_results - 1.
        last_result = min(first_result + num_results - 1, self.size - 1)
        if last_result < first_result:
            # Empty subcorpus or empty window: do not send an invalid range.
            return {'matches': []}
        match_starts = self.dump(self.fields['match'], first_result,
                                 last_result)
        match_ends = self.dump(self.fields['matchend'], first_result,
                               last_result)
        corpus_size = self.parent_corpus.size
        matches = []
        cpos_set = set()
        for match_start, match_end in zip(match_starts, match_ends):
            left_start = max(0, match_start - context)
            right_end = min(corpus_size, match_end + 1 + context)
            left = list(range(left_start, match_start))
            hit = list(range(match_start, match_end + 1))
            right = list(range(match_end + 1, right_end))
            matches.append({'left': left, 'hit': hit, 'right': right})
            cpos_set.update(left, hit, right)
        if not cpos_set:
            return {'matches': matches}
        # Sorted so that the positions sent to the client are deterministic.
        return {'matches': matches,
                **self._build_lookups(sorted(cpos_set))}

    def _build_lookups(self, cpos_list):
        """Build the attribute lookup tables for the given corpus positions."""
        corpus = self.parent_corpus
        cpos_lookup = {cpos: {} for cpos in cpos_list}
        lookups = {'cpos_lookup': cpos_lookup}
        for attr in corpus.positional_attributes:
            values = corpus.cpos2str(attr, cpos_list)
            for cpos, value in zip(cpos_list, values):
                cpos_lookup[cpos][attr] = value
        structural_attributes = corpus.structural_attributes
        for attr in structural_attributes:
            # Attributes that carry values are not mapped per position;
            # they are only read as sub-attributes ("<attr>_<sub>") below.
            if corpus.structural_attribute_has_values(attr):
                continue
            struc_ids = corpus.cpos2struc(attr, cpos_list)
            for cpos, struc_id in zip(cpos_list, struc_ids):
                # -1 is skipped (presumably "position is not inside such a
                # structure" - confirm against the CQi specification).
                if struc_id != -1:
                    cpos_lookup[cpos][attr] = struc_id
            occurring_ids = sorted(set(struc_ids) - {-1})
            if not occurring_ids:
                continue
            # Strip the known prefix instead of splitting on the first "_",
            # so parent attributes that contain "_" themselves still work.
            prefix = attr + '_'
            subattrs = [name[len(prefix):] for name in structural_attributes
                        if name.startswith(prefix)]
            if not subattrs:
                continue
            attr_lookup = {struc_id: {} for struc_id in occurring_ids}
            for subattr in subattrs:
                values = corpus.struc2str(prefix + subattr, occurring_ids)
                for struc_id, value in zip(occurring_ids, values):
                    attr_lookup[struc_id][subattr] = value
            lookups[attr + '_lookup'] = attr_lookup
        return lookups

    def fdist_1(self, cutoff, field, attribute):
        """Forward to ``client.cqp_fdist_1`` for this subcorpus."""
        return self.client.cqp_fdist_1(self.__name, cutoff, field, attribute)

    def fdist_2(self, cutoff, field_1, attribute_1, field_2, attribute_2):
        """Forward to ``client.cqp_fdist_2`` for this subcorpus."""
        return self.client.cqp_fdist_2(self.__name, cutoff, field_1,
                                       attribute_1, field_2, attribute_2)