mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-06-21 21:40:34 +00:00
Add package implementation of cqi
This commit is contained in:
0
app/corpora/cqi/models/__init__.py
Normal file
0
app/corpora/cqi/models/__init__.py
Normal file
102
app/corpora/cqi/models/corpora.py
Normal file
102
app/corpora/cqi/models/corpora.py
Normal file
@ -0,0 +1,102 @@
|
||||
from .subcorpora import SubcorpusCollection
|
||||
|
||||
|
||||
class CorpusCollection:
    """Access point for the corpora reachable through a client."""

    def __init__(self, client):
        """Remember the ``client`` used for all later corpus requests."""
        self.client = client

    def get(self, name):
        """Return a ``Corpus`` wrapper for the corpus called ``name``."""
        return Corpus(self.client, name)

    def list(self):
        """Return a ``Corpus`` wrapper for every name the client reports."""
        # NOTE(review): "coprora" looks like a misspelling of "corpora". The
        # spelling is kept because it has to match the client method, which
        # is defined outside this file -- confirm before renaming.
        corpora = []
        for corpus_name in self.client.corpus_list_coprora():
            corpora.append(Corpus(self.client, corpus_name))
        return corpora
|
||||
|
||||
|
||||
class Corpus:
    """Wrapper around one corpus that is accessed through a client.

    On construction the corpus metadata is fetched from the client and
    stored on the instance (``size``, ``charset``, ``properties`` and the
    three attribute lists). Most methods forward to the client call of the
    same name prefixed with ``cl_``, after qualifying the attribute name
    as ``"<corpus name>.<attribute>"``.
    """

    def __init__(self, client, name):
        """Store ``client`` and ``name``, then load the corpus metadata."""
        self.client = client
        self.name = name
        self.__load()

    def __load(self):
        """Fetch the corpus metadata from the client."""
        # The corpus size is taken from the size of its ``word`` attribute.
        self.size = self.client.cl_attribute_size(self._qualify('word'))
        # self.info = client.corpus_info(self.name)
        self.charset = self.client.corpus_charset(self.name)
        # self.full_name = self.client.corpus_full_name(self.name)
        self.properties = self.client.corpus_properties(self.name)
        self.alignment_attributes = \
            self.client.corpus_alignment_attributes(self.name)
        self.structural_attributes = \
            self.client.corpus_structural_attributes(self.name)
        self.positional_attributes = \
            self.client.corpus_positional_attributes(self.name)
        self.subcorpora = SubcorpusCollection(self.client, self)

    def _qualify(self, attribute):
        """Return ``attribute`` prefixed with this corpus' name."""
        return '{}.{}'.format(self.name, attribute)

    def alg2cpos(self, attribute, alg):
        """Forward to ``client.cl_alg2cpos`` for the qualified attribute."""
        return self.client.cl_alg2cpos(self._qualify(attribute), alg)

    def cpos2alg(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2alg`` for the qualified attribute."""
        return self.client.cl_cpos2alg(self._qualify(attribute), cpos_list)

    def cpos2id(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2id`` for the qualified attribute."""
        return self.client.cl_cpos2id(self._qualify(attribute), cpos_list)

    def cpos2lbound(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2lbound`` for the qualified attribute."""
        return self.client.cl_cpos2lbound(self._qualify(attribute), cpos_list)

    def cpos2rbound(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2rbound`` for the qualified attribute."""
        return self.client.cl_cpos2rbound(self._qualify(attribute), cpos_list)

    def cpos2str(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2str`` for the qualified attribute."""
        return self.client.cl_cpos2str(self._qualify(attribute), cpos_list)

    def cpos2struc(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2struc`` for the qualified attribute."""
        return self.client.cl_cpos2struc(self._qualify(attribute), cpos_list)

    def id2cpos(self, attribute, id):
        """Forward to ``client.cl_id2cpos`` for the qualified attribute."""
        return self.client.cl_id2cpos(self._qualify(attribute), id)

    def idlist2cpos(self, attribute, ids):
        """Forward to ``client.cl_idlist2cpos`` for the qualified attribute."""
        return self.client.cl_idlist2cpos(self._qualify(attribute), ids)

    def id2freq(self, attribute, ids):
        """Forward to ``client.cl_id2freq`` for the qualified attribute."""
        return self.client.cl_id2freq(self._qualify(attribute), ids)

    def id2str(self, attribute, ids):
        """Forward to ``client.cl_id2str`` for the qualified attribute."""
        return self.client.cl_id2str(self._qualify(attribute), ids)

    def query(self, query, subcorpus_name='Results'):
        """Run ``query`` on this corpus and return the resulting subcorpus.

        The client is asked to store the result under ``subcorpus_name``;
        the subcorpus of that same name is then looked up and returned.
        """
        self.client.cqp_query(self.name, subcorpus_name, query)
        # Look up the name that was just written. The previous code always
        # fetched 'Results', which was wrong for any non-default name.
        return self.subcorpora.get(subcorpus_name)

    def regex2id(self, attribute, regex):
        """Forward to ``client.cl_regex2id`` for the qualified attribute."""
        return self.client.cl_regex2id(self._qualify(attribute), regex)

    def structural_attribute_has_values(self, attribute):
        """Forward to ``client.corpus_structural_attribute_has_values``."""
        return self.client.corpus_structural_attribute_has_values(
            self._qualify(attribute))

    def str2id(self, attribute, strings):
        """Forward to ``client.cl_str2id`` for the qualified attribute."""
        return self.client.cl_str2id(self._qualify(attribute), strings)

    def struc2cpos(self, attribute, struc):
        """Forward to ``client.cl_struc2cpos`` for the qualified attribute."""
        return self.client.cl_struc2cpos(self._qualify(attribute), struc)

    def struc2str(self, attribute, strucs):
        """Forward to ``client.cl_struc2str`` for the qualified attribute."""
        return self.client.cl_struc2str(self._qualify(attribute), strucs)
|
112
app/corpora/cqi/models/subcorpora.py
Normal file
112
app/corpora/cqi/models/subcorpora.py
Normal file
@ -0,0 +1,112 @@
|
||||
from ..constants import (CONST_FIELD_KEYWORD, CONST_FIELD_MATCH,
|
||||
CONST_FIELD_MATCHEND, CONST_FIELD_TARGET)
|
||||
|
||||
|
||||
class SubcorpusCollection:
    """Access point for the subcorpora that belong to one parent corpus."""

    def __init__(self, client, parent_corpus):
        """Remember the ``client`` and the corpus the subcorpora belong to."""
        self.client = client
        self.parent_corpus = parent_corpus

    def get(self, name):
        """Return a ``Subcorpus`` wrapper for the subcorpus called ``name``."""
        return Subcorpus(self.client, self.parent_corpus, name)

    def list(self):
        """Return a ``Subcorpus`` wrapper for every name the client reports."""
        subcorpus_names = \
            self.client.cqp_list_subcorpora(self.parent_corpus.name)
        subcorpora = []
        for subcorpus_name in subcorpus_names:
            subcorpora.append(
                Subcorpus(self.client, self.parent_corpus, subcorpus_name))
        return subcorpora
|
||||
|
||||
|
||||
class Subcorpus:
    """Wrapper around one named subcorpus of a parent corpus.

    On construction the client is asked which anchor fields the subcorpus
    has (stored in ``fields``) and how many entries it contains (stored in
    ``size``). The client is always addressed with the qualified name
    ``"<parent corpus name>:<subcorpus name>"``.
    """

    def __init__(self, client, parent_corpus, name):
        """Store the arguments and load the subcorpus metadata."""
        self.client = client
        self.parent_corpus = parent_corpus
        self.name = name
        # Qualified name that is passed to every client call.
        self.__name = '{}:{}'.format(parent_corpus.name, name)
        self.__load()

    def __load(self):
        """Fetch the available anchor fields and the size from the client."""
        candidates = (('match', CONST_FIELD_MATCH),
                      ('matchend', CONST_FIELD_MATCHEND),
                      ('target', CONST_FIELD_TARGET),
                      ('keyword', CONST_FIELD_KEYWORD))
        self.fields = {}
        for key, field in candidates:
            if self.client.cqp_subcorpus_has_field(self.__name, field):
                self.fields[key] = field
        self.size = self.client.cqp_subcorpus_size(self.__name)

    def drop(self):
        """Forward to ``client.cqp_drop_subcorpus`` for this subcorpus."""
        return self.client.cqp_drop_subcorpus(self.__name)

    def dump(self, field, first, last):
        """Forward to ``client.cqp_dump_subcorpus`` for this subcorpus."""
        return self.client.cqp_dump_subcorpus(self.__name, field, first, last)

    def dump_values(self, context=25, first_result=0,
                    num_results=float('inf')):
        """Return a window of matches plus lookup tables for their tokens.

        Parameters:
            context: number of corpus positions to include on each side of
                a match, clamped to the bounds of the parent corpus.
            first_result: index of the first match to return; negative
                values are treated as 0.
            num_results: maximum number of matches to return.

        Returns:
            A dict with the key ``'matches'`` (a list of dicts holding the
            corpus positions of ``'left'``, ``'hit'`` and ``'right'``) and,
            when at least one position was collected, ``'cpos_lookup'``
            plus one ``'<attr>_lookup'`` per structural attribute that has
            sub-attributes occurring in the window.

        Raises:
            KeyError: if the subcorpus has no ``match``/``matchend`` field.
        """
        first_result = max(0, first_result)
        # ``last_result`` is an inclusive index (it is clamped to
        # ``size - 1``), hence the ``- 1`` so that exactly ``num_results``
        # matches are requested.
        last_result = min(first_result + num_results - 1, self.size - 1)
        if last_result < first_result:
            # Empty subcorpus, empty window or window past the end: there
            # is nothing to ask the client for.
            return {'matches': []}
        # After clamping the value is finite; normalise a float such as
        # ``10.0`` to an int before it is sent to the client.
        last_result = int(last_result)
        matches = self._collect_matches(context, first_result, last_result)
        cpos_list = sorted({cpos
                            for match in matches
                            for part in ('left', 'hit', 'right')
                            for cpos in match[part]})
        return {'matches': matches, **self._build_lookups(cpos_list)}

    def _collect_matches(self, context, first_result, last_result):
        """Return left/hit/right position lists for the requested matches."""
        starts = self.dump(self.fields['match'], first_result, last_result)
        ends = self.dump(self.fields['matchend'], first_result, last_result)
        corpus_size = self.parent_corpus.size
        matches = []
        for match_start, match_end in zip(starts, ends):
            left_start = max(0, match_start - context)
            right_end = min(corpus_size, match_end + 1 + context)
            matches.append({'left': list(range(left_start, match_start)),
                            'hit': list(range(match_start, match_end + 1)),
                            'right': list(range(match_end + 1, right_end))})
        return matches

    def _build_lookups(self, cpos_list):
        """Return the attribute lookup tables for the given positions."""
        if not cpos_list:
            # No positions: skip the client calls, no lookup is produced.
            return {}
        corpus = self.parent_corpus
        cpos_lookup = {cpos: {} for cpos in cpos_list}
        lookups = {'cpos_lookup': cpos_lookup}
        for attr in corpus.positional_attributes:
            values = corpus.cpos2str(attr, cpos_list)
            for cpos, value in zip(cpos_list, values):
                cpos_lookup[cpos][attr] = value
        for attr in corpus.structural_attributes:
            # Attributes that carry values are not resolved per position;
            # they are picked up below as sub-attributes of their parent.
            if corpus.structural_attribute_has_values(attr):
                continue
            struc_ids = corpus.cpos2struc(attr, cpos_list)
            for cpos, struc_id in zip(cpos_list, struc_ids):
                # -1 is treated as "position is not inside this structure".
                if struc_id != -1:
                    cpos_lookup[cpos][attr] = struc_id
            occurred_ids = sorted({struc_id for struc_id in struc_ids
                                   if struc_id != -1})
            if not occurred_ids:
                continue
            prefix = attr + '_'
            # Strip the full prefix instead of splitting on the first '_',
            # so parent attributes whose own name contains '_' still work.
            subattrs = [name[len(prefix):]
                        for name in corpus.structural_attributes
                        if name.startswith(prefix)]
            if not subattrs:
                continue
            attr_lookup = {struc_id: {} for struc_id in occurred_ids}
            for subattr in subattrs:
                values = corpus.struc2str(prefix + subattr, occurred_ids)
                for struc_id, value in zip(occurred_ids, values):
                    attr_lookup[struc_id][subattr] = value
            lookups[attr + '_lookup'] = attr_lookup
        return lookups

    def fdist_1(self, cutoff, field, attribute):
        """Forward to ``client.cqp_fdist_1`` for this subcorpus."""
        return self.client.cqp_fdist_1(self.__name, cutoff, field, attribute)

    def fdist_2(self, cutoff, field_1, attribute_1, field_2, attribute_2):
        """Forward to ``client.cqp_fdist_2`` for this subcorpus."""
        return self.client.cqp_fdist_2(self.__name, cutoff, field_1,
                                       attribute_1, field_2, attribute_2)
|
Reference in New Issue
Block a user