Update cqi package

This commit is contained in:
Patrick Jentsch 2020-03-28 11:56:01 +01:00
parent 97fb8ded9a
commit a51394dddd
3 changed files with 74 additions and 36 deletions

View File

@ -105,9 +105,8 @@ class StructuralAttributeCollection:
class StructuralAttribute(Attribute): class StructuralAttribute(Attribute):
def __init__(self, client, corpus, name): def __init__(self, client, corpus, name):
super(StructuralAttribute, self).__init__(client, corpus, name) super(StructuralAttribute, self).__init__(client, corpus, name)
self.has_values = client.api.corpus_structural_attribute_has_values( self.has_values = \
self._name client.api.corpus_structural_attribute_has_values(self._name)
)
def cpos_by_id(self, id): def cpos_by_id(self, id):
return self.client.api.cl_struc2cpos(self._name, id) return self.client.api.cl_struc2cpos(self._name, id)
@ -124,16 +123,34 @@ class StructuralAttribute(Attribute):
def values_by_ids(self, id_list): def values_by_ids(self, id_list):
return self.client.api.cl_struc2str(self._name, id_list) return self.client.api.cl_struc2str(self._name, id_list)
def lrcontext_by_cpos(self, cpos_first, cpos_last=None, context=3): def export(self, first_cpos, last_cpos, context=0, expand_lists=False):
if cpos_last is None: first_id, last_id = self.ids_by_cpos([first_cpos, last_cpos])
cpos_last = cpos_first c = (first_cpos, last_cpos)
id_l, id_r = self.ids_by_cpos([cpos_first, cpos_last]) lc = rc = None
id_lc = max(0, id_l - context) if context == 0:
id_rc = min(id_r + context, self.size - 1) cpos_list = list(range(first_cpos, (last_cpos + 1)))
ids = {id: list(range(*self.cpos_by_id(id))) for id else:
in range(id_lc, id_rc)} lc_lbound = self.cpos_by_id(max(0, (first_id - context)))[0]
cpos_list = [cpos for cpos_list in ids.values() for cpos in cpos_list] if lc_lbound != first_cpos:
lc_rbound = max(0, (first_cpos - 1))
lc = (lc_lbound, lc_rbound)
cpos_list_lbound = lc_lbound
else:
cpos_list_lbound = first_cpos
rc_rbound = \
self.cpos_by_id(min((last_id + context), (self.size - 1)))[1]
if rc_rbound != last_cpos:
rc_lbound = min((last_cpos + 1), (self.size - 1))
rc = (rc_lbound, rc_rbound)
cpos_list_rbound = rc_rbound
else:
cpos_list_rbound = last_cpos
cpos_list = list(range(cpos_list_lbound, (cpos_list_rbound + 1)))
if expand_lists:
match = {'lc': list(range(lc[0], (lc[1] + 1))),
'c': list(range(c[0], (c[1] + 1))),
'rc': list(range(rc[0], (rc[1] + 1)))}
else:
match = {'lc': lc, 'c': c, 'rc': rc}
lookups = self.corpus.lookups_by_cpos(cpos_list) lookups = self.corpus.lookups_by_cpos(cpos_list)
return {'ids': ids, return {'match': match, **lookups}
'match_cpos_list': list(range(cpos_first, cpos_last)),
**lookups}

View File

@ -48,8 +48,8 @@ class Corpus:
set(cpos_attr_ids))) set(cpos_attr_ids)))
if not occured_attr_ids: if not occured_attr_ids:
continue continue
subattrs = self.attributes.structural.list( subattrs = \
filters={'part_of': attr}) self.attributes.structural.list(filters={'part_of': attr})
if not subattrs: if not subattrs:
continue continue
lookup_name = '{}_lookup'.format(attr.name) lookup_name = '{}_lookup'.format(attr.name)

View File

@ -40,32 +40,53 @@ class Subcorpus:
return self.client.api.cqp_dump_subcorpus(self._name, field, first, return self.client.api.cqp_dump_subcorpus(self._name, field, first,
last) last)
def dump_values(self, context=25, first_result=0, def export(self, context=25, cutoff=float('inf'), expand_lists=False,
num_results=float('inf')): offset=0):
if self.size == 0: if self.size == 0:
return {"matches": []} return {"matches": []}
first_result = max(0, first_result) first_match = max(0, offset)
last_result = min((first_result + num_results - 1), (self.size - 1)) last_match = min((offset + cutoff - 1), (self.size - 1))
matches = [] match_boundaries = \
match_boundaries = zip(self.dump(self.fields['match'], first_result, zip(self.dump(self.fields['match'], first_match, last_match),
last_result), self.dump(self.fields['matchend'], first_match, last_match))
self.dump(self.fields['matchend'], first_result,
last_result))
for match_start, match_end in match_boundaries:
left_start = max(0, match_start - context)
right_end = min(self.corpus.size, (match_end + 1 + context))
matches.append({'lc': list(range(left_start, match_start)),
'hit': list(range(match_start, match_end + 1)),
'rc': list(range(match_end + 1, right_end))})
cpos_list = [] cpos_list = []
for match in matches: matches = []
cpos_list += match['lc'] + match['hit'] + match['rc'] for match_start, match_end in match_boundaries:
c = (match_start, match_end)
lc = rc = None
if context == 0:
cpos_list += list(range(match_start, (match_end + 1)))
else:
lc_rbound = max(0, (match_start - 1))
if lc_rbound != match_start:
lc_lbound = max(0, match_start - context)
lc = (lc_lbound, lc_rbound)
cpos_list_lbound = lc_lbound
else:
cpos_list_lbound = match_start
rc_lbound = min((match_end + 1), (self.corpus.size - 1))
if rc_lbound != match_end:
rc_rbound = min((match_end + context),
(self.corpus.size - 1))
rc = (rc_lbound, rc_rbound)
cpos_list_rbound = rc_rbound
else:
cpos_list_rbound = match_end
cpos_list += list(range(cpos_list_lbound,
(cpos_list_rbound + 1)))
if expand_lists:
match = {'lc': list(range(lc[0], (lc[1] + 1))),
'c': list(range(c[0], (c[1] + 1))),
'rc': list(range(rc[0], (rc[1] + 1)))}
else:
match = {'lc': lc, 'c': c, 'rc': rc}
matches.append(match)
lookups = self.corpus.lookups_by_cpos(cpos_list) lookups = self.corpus.lookups_by_cpos(cpos_list)
return {'matches': matches, **lookups} return {'matches': matches, **lookups}
def fdist_1(self, cutoff, field, attribute): def fdist_1(self, cutoff, field, attribute):
return self.client.api.cqp_fdist_1(self._name, cutoff, field, return self.client.api.cqp_fdist_1(self._name, cutoff,
attribute._name) field, attribute._name)
def fdist_2(self, cutoff, field_1, attribute_1, field_2, attribute_2): def fdist_2(self, cutoff, field_1, attribute_1, field_2, attribute_2):
return self.client.api.cqp_fdist_2(self._name, cutoff, return self.client.api.cqp_fdist_2(self._name, cutoff,