Merge branch 'development' of gitlab.ub.uni-bielefeld.de:sfb1288inf/opaque into development

This commit is contained in:
Stephan Porada 2020-04-08 09:46:38 +02:00
commit b0104fe606
10 changed files with 269 additions and 485 deletions

View File

@ -1,7 +1,6 @@
# flake8: noqa
from .api import APIClient
from .client import CQiClient
from .wrapper import CQiWrapper
from .version import version, version_info

View File

@ -20,10 +20,11 @@ class APIClient:
>>> client.ctrl_bye()
{'code': 259, 'msg': 'CQI_STATUS_BYE_OK'}
Args:
Attributes:
host (str): URL to the CQP server. For example,
``cqpserver.localhost`` or ``127.0.0.1``.
port (int): Port the CQP server listens on. Default: ``4877``
socket (socket.socket): Socket for communicating with a CQP server.
"""
def __init__(self, host, port=4877):

View File

@ -15,8 +15,9 @@ class CQiClient:
{'code': 260, 'msg': 'CQI_STATUS_PING_OK'}
>>> client.disconnect()
{'code': 259, 'msg': 'CQI_STATUS_BYE_OK'}
Attributes:
api (APIClient): A client pointing to the specified to the CQP server.
api (APIClient): An API client pointing to the specified CQP server.
"""
def __init__(self, host, port=4877):
@ -32,12 +33,14 @@ class CQiClient:
def connect(self, username='anonymous', password=''):
status = self.api.ctrl_connect(username, password)
self.corpora = CorpusCollection(self)
return status
def disconnect(self):
del self.corpora
return self.api.ctrl_bye()
def ping(self):
return self.api.ctrl_ping()
@property
def corpora(self):
return CorpusCollection(client=self)

View File

@ -1,139 +1,119 @@
class AttributeCollection:
def __init__(self, client, corpus):
self.client = client
self.corpus = corpus
self.alignment = AlignmentAttributeCollection(client, corpus)
self.positional = PositionalAttributeCollection(client, corpus)
self.structural = StructuralAttributeCollection(client, corpus)
from .ressource import Collection, Model
class Attribute:
class Attribute(Model):
"""
This is a class representing an attribute. Attributes denote the general
category of information. A specific occurrence is identified by an Id.
Attributes:
client (CQiClient): A connected client pointing at the server that this
object is on.
corpus (Corpus): The corpus, this attribute belongs to.
name (str): The name of the Attribute.
size (int): The number of occurrences of this attribute within the corpus.
"""
def __init__(self, client, corpus, name):
self.client = client
self.corpus = corpus
self.name = name
self._name = '{}.{}'.format(corpus.name, name)
self.size = client.api.cl_attribute_size(self._name)
id_attribute = 'api_name'
@staticmethod
def _attrs(client, corpus, name):
api_name = '{}.{}'.format(corpus.attrs['api_name'], name)
return {'api_name': api_name,
'name': name,
'size': client.api.cl_attribute_size(api_name)}
def drop(self):
return self.client.api.cl_drop_attribute(self._name)
return self.client.api.cl_drop_attribute(self.attrs['api_name'])
class AlignmentAttributeCollection:
def __init__(self, client, corpus):
self.client = client
class AttributeCollection(Collection):
model = Attribute
def __init__(self, client=None, corpus=None):
super(AttributeCollection, self).__init__(client=client)
self.corpus = corpus
def get(self, name):
return AlignmentAttribute(self.client, self.corpus, name)
def get(self, attribute_name):
return self.prepare_model(self.model._attrs(self.client, self.corpus,
attribute_name))
def list(self):
return [AlignmentAttribute(self.client, self.corpus, attr) for attr in
self.client.api.corpus_alignment_attributes(self.corpus.name)]
raise NotImplementedError
class AlignmentAttribute(Attribute):
def cpos_by_ids(self, id_list):
return self.client.api.cl_alg2cpos(self._name, id_list)
return self.client.api.cl_alg2cpos(self.attrs['api_name'], id_list)
def ids_by_cpos(self, cpos_list):
return self.client.api.cl_cpos2alg(self._name, cpos_list)
return self.client.api.cl_cpos2alg(self.attrs['api_name'], cpos_list)
class PositionalAttributeCollection:
def __init__(self, client, corpus):
self.client = client
self.corpus = corpus
def get(self, name):
return PositionalAttribute(self.client, self.corpus, name)
class AlignmentAttributeCollection(AttributeCollection):
model = AlignmentAttribute
def list(self):
return [PositionalAttribute(self.client, self.corpus, attr) for attr in
self.client.api.corpus_positional_attributes(self.corpus.name)]
return [self.prepare_model(self.model._attrs(self.client, self.corpus, attr)) # noqa
for attr in self.client.api.corpus_alignment_attributes(self.corpus.attrs['api_name'])] # noqa
class PositionalAttribute(Attribute):
def __init__(self, client, corpus, name):
super(PositionalAttribute, self).__init__(client, corpus, name)
self.lexicon_size = client.api.cl_lexicon_size(self._name)
@staticmethod
def _attrs(client, corpus, name):
attrs = super(PositionalAttribute, PositionalAttribute)._attrs(client, corpus, name) # noqa
attrs['lexicon_size'] = client.api.cl_lexicon_size(attrs['api_name'])
return attrs
def cpos_by_id(self, id):
return self.client.api.cl_id2cpos(self._name, id)
return self.client.api.cl_id2cpos(self.attrs['api_name'], id)
def cpos_by_ids(self, id_list):
return self.client.api.cl_idlist2cpos(self._name, id_list)
return self.client.api.cl_idlist2cpos(self.attrs['api_name'], id_list)
def freqs_by_ids(self, id_list):
return self.client.api.cl_id2freq(self._name, id_list)
return self.client.api.cl_id2freq(self.attrs['api_name'], id_list)
def ids_by_cpos(self, cpos_list):
return self.client.api.cl_cpos2id(self._name, cpos_list)
return self.client.api.cl_cpos2id(self.attrs['api_name'], cpos_list)
def ids_by_regex(self, regex):
return self.client.api.cl_regex2id(self._name, regex)
return self.client.api.cl_regex2id(self.attrs['api_name'], regex)
def ids_by_values(self, value_list):
return self.client.api.cl_str2id(self._name, value_list)
return self.client.api.cl_str2id(self.attrs['api_name'], value_list)
def values_by_cpos(self, cpos_list):
return self.client.api.cl_cpos2str(self._name, cpos_list)
return self.client.api.cl_cpos2str(self.attrs['api_name'], cpos_list)
def values_by_ids(self, id_list):
return self.client.api.cl_id2str(self._name, id_list)
return self.client.api.cl_id2str(self.attrs['api_name'], id_list)
class StructuralAttributeCollection:
def __init__(self, client, corpus):
self.client = client
self.corpus = corpus
class PositionalAttributeCollection(AttributeCollection):
model = PositionalAttribute
def get(self, name):
return StructuralAttribute(self.client, self.corpus, name)
def list(self, filters={}):
attrs = [StructuralAttribute(self.client, self.corpus, attr) for attr
in self.client.api.corpus_structural_attributes(
self.corpus.name)]
for k, v in filters.items():
if k == 'part_of':
attrs = list(filter(lambda x: x.name.startswith(v.name + '_'),
attrs))
return attrs
def list(self):
return [self.prepare_model(self.model._attrs(self.client, self.corpus, attr)) # noqa
for attr in self.client.api.corpus_positional_attributes(self.corpus.attrs['api_name'])] # noqa
class StructuralAttribute(Attribute):
def __init__(self, client, corpus, name):
super(StructuralAttribute, self).__init__(client, corpus, name)
self.has_values = \
client.api.corpus_structural_attribute_has_values(self._name)
@staticmethod
def _attrs(client, corpus, name):
attrs = super(StructuralAttribute, StructuralAttribute)._attrs(client, corpus, name) # noqa
attrs['has_values'] = client.api.corpus_structural_attribute_has_values(attrs['api_name']) # noqa
return attrs
def cpos_by_id(self, id):
return self.client.api.cl_struc2cpos(self._name, id)
return self.client.api.cl_struc2cpos(self.attrs['api_name'], id)
def ids_by_cpos(self, cpos_list):
return self.client.api.cl_cpos2struc(self._name, cpos_list)
return self.client.api.cl_cpos2struc(self.attrs['api_name'], cpos_list)
def lbound_by_cpos(self, cpos_list):
return self.client.api.cl_cpos2lbound(self._name, cpos_list)
return self.client.api.cl_cpos2lbound(self.attrs['api_name'],
cpos_list)
def rbound_by_cpos(self, cpos_list):
return self.client.api.cl_cpos2rbound(self._name, cpos_list)
return self.client.api.cl_cpos2rbound(self.attrs['api_name'],
cpos_list)
def values_by_ids(self, id_list):
return self.client.api.cl_struc2str(self._name, id_list)
return self.client.api.cl_struc2str(self.attrs['api_name'], id_list)
def export(self, first_cpos, last_cpos, context=0, expand_lists=False):
first_id, last_id = self.ids_by_cpos([first_cpos, last_cpos])
@ -150,9 +130,10 @@ class StructuralAttribute(Attribute):
else:
cpos_list_lbound = first_cpos
rc_rbound = \
self.cpos_by_id(min((last_id + context), (self.size - 1)))[1]
self.cpos_by_id(min((last_id + context), (self.attrs['size'] - 1)))[1]
if rc_rbound != last_cpos:
rc_lbound = min((last_cpos + 1), (self.corpus.size - 1))
rc_lbound = min((last_cpos + 1),
(self.collection.corpus.attrs['size'] - 1))
rc = (rc_lbound, rc_rbound)
cpos_list_rbound = rc_rbound
else:
@ -164,5 +145,17 @@ class StructuralAttribute(Attribute):
'rc': list(range(rc[0], (rc[1] + 1)))}
else:
match = {'lc': lc, 'c': c, 'rc': rc}
lookups = self.corpus.lookups_by_cpos(cpos_list)
lookups = self.collection.corpus.lookups_by_cpos(cpos_list)
return {'match': match, **lookups}
class StructuralAttributeCollection(AttributeCollection):
    """
    Collection of the structural attributes of one corpus.
    """
    model = StructuralAttribute

    def list(self, filters=None):
        """
        List the structural attributes of the corpus.

        Args:
            filters (dict): Optional filters. Only the key ``part_of`` is
                acted on; its value must be a model whose ``attrs['name']``
                is used as a prefix: only attributes whose name starts with
                ``<name>_`` are kept. Other keys are ignored.

        Returns:
            (list): One model per name returned by the API's
                ``corpus_structural_attributes`` call, after filtering.
        """
        # ``None`` instead of ``{}`` as default: a dict default would be one
        # shared object across all calls.
        filters = {} if filters is None else filters
        names = self.client.api.corpus_structural_attributes(
            self.corpus.attrs['api_name'])
        attrs = [
            self.prepare_model(
                self.model._attrs(self.client, self.corpus, name))
            for name in names
        ]
        if 'part_of' in filters:
            # Compute the prefix once instead of inside a per-item lambda.
            prefix = filters['part_of'].attrs['name'] + '_'
            attrs = [attr for attr in attrs
                     if attr.attrs['name'].startswith(prefix)]
        return attrs

View File

@ -1,30 +1,24 @@
from .attributes import AttributeCollection
from .attributes import (AlignmentAttributeCollection,
PositionalAttributeCollection,
StructuralAttributeCollection)
from .ressource import Collection, Model
from .subcorpora import SubcorpusCollection
class CorpusCollection:
def __init__(self, client):
self.client = client
class Corpus(Model):
id_attribute = 'api_name'
def get(self, name):
return Corpus(self.client, name)
def list(self):
return [Corpus(self.client, corpus) for corpus in
self.client.api.corpus_list_coprora()]
class Corpus:
def __init__(self, client, name):
self.client = client
self.name = name
self.size = client.api.cl_attribute_size('{}.word'.format(name))
# self.info = client.api.corpus_info(name)
self.charset = client.api.corpus_charset(name)
# self.full_name = client.api.corpus_full_name(name)
self.properties = client.api.corpus_properties(name)
self.attributes = AttributeCollection(client, self)
self.subcorpora = SubcorpusCollection(client, self)
@staticmethod
def _attrs(client, name):
api_name = name
return {'api_name': api_name,
'name': name,
'size': client.api.cl_attribute_size(
'{}.word'.format(api_name)),
# 'info': client.api.corpus_info(name),
'charset': client.api.corpus_charset(api_name),
# 'full_name' = client.api.corpus_full_name(name),
'properties': client.api.corpus_properties(api_name)}
def lookups_by_cpos(self, cpos_list):
cpos_list = list(set(cpos_list))
@ -33,35 +27,64 @@ class Corpus:
lookups['cpos_lookup'] = {}
for cpos in cpos_list:
lookups['cpos_lookup'][cpos] = {}
for attr in self.attributes.positional.list():
for attr in self.positional_attributes.list():
cpos_attr_values = attr.values_by_cpos(cpos_list)
for i, cpos in enumerate(cpos_list):
lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_values[i]
for attr in self.attributes.structural.list():
if attr.has_values:
lookups['cpos_lookup'][cpos][attr.attrs['name']] = cpos_attr_values[i]
for attr in self.structural_attributes.list():
if attr.attrs['has_values']:
continue
cpos_attr_ids = attr.ids_by_cpos(cpos_list)
for i, cpos in enumerate(cpos_list):
if cpos_attr_ids[i] != -1:
lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_ids[i]
lookups['cpos_lookup'][cpos][attr.attrs['name']] = cpos_attr_ids[i]
occured_attr_ids = list(filter(lambda x: x != -1,
set(cpos_attr_ids)))
if not occured_attr_ids:
continue
subattrs = \
self.attributes.structural.list(filters={'part_of': attr})
self.structural_attributes.list(filters={'part_of': attr})
if not subattrs:
continue
lookup_name = '{}_lookup'.format(attr.name)
lookup_name = '{}_lookup'.format(attr.attrs['name'])
lookups[lookup_name] = {}
for attr_id in occured_attr_ids:
lookups[lookup_name][attr_id] = {}
for subattr in subattrs:
subattr_values = subattr.values_by_ids(occured_attr_ids)
for i, subattr_value in enumerate(subattr_values):
lookups[lookup_name][occured_attr_ids[i]][subattr.name] = \
subattr_name = subattr.attrs['name'][(len(attr.attrs['name']) + 1):]
lookups[lookup_name][occured_attr_ids[i]][subattr_name] = \
subattr_value
return lookups
def query(self, query, subcorpus_name='Results'):
return self.client.api.cqp_query(self.name, subcorpus_name, query)
return self.client.api.cqp_query(self.attrs['api_name'],
subcorpus_name, query)
@property
def alignment_attributes(self):
    """
    An AlignmentAttributeCollection bound to this corpus and its client.
    A new collection object is created on every access.
    """
    collection = AlignmentAttributeCollection(client=self.client,
                                              corpus=self)
    return collection
@property
def positional_attributes(self):
    """
    A PositionalAttributeCollection bound to this corpus and its client.
    A new collection object is created on every access.
    """
    collection = PositionalAttributeCollection(client=self.client,
                                               corpus=self)
    return collection
@property
def structural_attributes(self):
    """
    A StructuralAttributeCollection bound to this corpus and its client.
    A new collection object is created on every access.
    """
    collection = StructuralAttributeCollection(client=self.client,
                                               corpus=self)
    return collection
@property
def subcorpora(self):
    """
    A SubcorpusCollection bound to this corpus and its client.
    A new collection object is created on every access.
    """
    collection = SubcorpusCollection(client=self.client, corpus=self)
    return collection
class CorpusCollection(Collection):
    """
    Collection of Corpus models, built from the client's API.
    """
    model = Corpus

    def get(self, corpus_name):
        """
        Build the Corpus model for the corpus named ``corpus_name``.

        Args:
            corpus_name (str): Name passed on to ``Corpus._attrs``.

        Returns:
            (Corpus): The prepared model.
        """
        attrs = self.model._attrs(self.client, corpus_name)
        return self.prepare_model(attrs)

    def list(self):
        """
        Build one Corpus model per name returned by the API's
        ``corpus_list_coprora`` call.

        Returns:
            (list of Corpus): The prepared models, in API order.
        """
        corpora = []
        for corpus_name in self.client.api.corpus_list_coprora():
            attrs = self.model._attrs(self.client, corpus_name)
            corpora.append(self.prepare_model(attrs))
        return corpora

View File

@ -0,0 +1,78 @@
class Model:
    """
    A base class for representing a single object on the server.

    Subclasses set ``id_attribute`` to the key in ``attrs`` that identifies
    the object and implement ``_attrs`` to build the attribute dictionary.
    """
    #: Key in ``attrs`` whose value is returned by ``id``.
    id_attribute = 'Id'

    def __init__(self, attrs=None, client=None, collection=None):
        #: A client pointing at the server that this object is on.
        self.client = client
        #: The collection that this model is part of.
        self.collection = collection
        #: The raw representation of this object from the API
        self.attrs = attrs or {}

    @staticmethod
    def _attrs(client, key):
        """
        Build the ``attrs`` dictionary for the object identified by ``key``.
        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def __repr__(self):
        return "<{}: {}>".format(self.__class__.__name__, self.id)

    def __eq__(self, other):
        return isinstance(other, self.__class__) and self.id == other.id

    def __hash__(self):
        return hash("{}:{}".format(self.__class__.__name__, self.id))

    @property
    def id(self):
        """
        The ID of the object, or ``None`` if ``attrs`` has no entry for
        ``id_attribute``.
        """
        return self.attrs.get(self.id_attribute)

    def reload(self):
        """
        Load this object from the server again and update ``attrs`` with the
        new data.
        """
        # Collections look objects up by their plain name and add any corpus
        # prefix themselves. Passing an already prefixed id would prefix it
        # twice, so prefer ``attrs['name']`` and fall back to the id for
        # models that carry no name.
        lookup_key = self.attrs.get('name', self.id)
        new_model = self.collection.get(lookup_key)
        self.attrs = new_model.attrs
class Collection:
    """
    A base class for representing all objects of a particular type on the
    server.
    """
    #: The type of object this collection represents, set by subclasses
    model = None

    def __init__(self, client=None):
        #: The client pointing at the server that this collection of objects
        #: is on.
        self.client = client

    def list(self):
        """
        Return all objects of this collection. Implemented by subclasses.
        """
        raise NotImplementedError

    def get(self, key):
        """
        Return the object identified by ``key``. Implemented by subclasses.
        """
        raise NotImplementedError

    def prepare_model(self, attrs):
        """
        Create a model from a set of attributes.

        An existing Model is rebound to this collection and its client and
        returned as is; a dict is wrapped in a new ``self.model`` instance.
        Anything else raises an Exception.
        """
        if isinstance(attrs, Model):
            attrs.client = self.client
            attrs.collection = self
            return attrs
        if not isinstance(attrs, dict):
            message = "Can't create {} from {}".format(self.model.__name__,
                                                       attrs)
            raise Exception(message)
        return self.model(attrs=attrs, client=self.client, collection=self)

View File

@ -1,54 +1,45 @@
from .ressource import Collection, Model
from ..api.specification import (CONST_FIELD_KEYWORD, CONST_FIELD_MATCH,
CONST_FIELD_MATCHEND, CONST_FIELD_TARGET)
class SubcorpusCollection:
def __init__(self, client, corpus):
self.client = client
self.corpus = corpus
class Subcorpus(Model):
id_attribute = 'api_name'
def get(self, name):
return Subcorpus(self.client, self.corpus, name)
def list(self):
return [Subcorpus(self.client, self.corpus, subcorpus) for subcorpus in
self.client.api.cqp_list_subcorpora(self.corpus.name)]
class Subcorpus:
def __init__(self, client, corpus, name):
self.client = client
self.corpus = corpus
self.name = name
self._name = '{}:{}'.format(corpus.name, name)
self.fields = {}
if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_MATCH):
self.fields['match'] = CONST_FIELD_MATCH
if client.api.cqp_subcorpus_has_field(self._name,
@staticmethod
def _attrs(client, corpus, name):
api_name = '{}:{}'.format(corpus.attrs['api_name'], name)
fields = {}
if client.api.cqp_subcorpus_has_field(api_name, CONST_FIELD_MATCH):
fields['match'] = CONST_FIELD_MATCH
if client.api.cqp_subcorpus_has_field(api_name,
CONST_FIELD_MATCHEND):
self.fields['matchend'] = CONST_FIELD_MATCHEND
if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_TARGET):
self.fields['target'] = CONST_FIELD_TARGET
if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_KEYWORD):
self.fields['keyword'] = CONST_FIELD_KEYWORD
self.size = client.api.cqp_subcorpus_size(self._name)
fields['matchend'] = CONST_FIELD_MATCHEND
if client.api.cqp_subcorpus_has_field(api_name, CONST_FIELD_TARGET):
fields['target'] = CONST_FIELD_TARGET
if client.api.cqp_subcorpus_has_field(api_name, CONST_FIELD_KEYWORD):
fields['keyword'] = CONST_FIELD_KEYWORD
return {'api_name': api_name,
'name': name,
'fields': fields,
'size': client.api.cqp_subcorpus_size(api_name)}
def drop(self):
return self.client.api.cqp_drop_subcorpus(self._name)
return self.client.api.cqp_drop_subcorpus(self.attrs['api_name'])
def dump(self, field, first, last):
return self.client.api.cqp_dump_subcorpus(self._name, field, first,
last)
return self.client.api.cqp_dump_subcorpus(self.attrs['api_name'],
field, first, last)
def export(self, context=25, cutoff=float('inf'), expand_lists=False,
offset=0):
if self.size == 0:
if self.attrs['size'] == 0:
return {"matches": []}
first_match = max(0, offset)
last_match = min((offset + cutoff - 1), (self.size - 1))
last_match = min((offset + cutoff - 1), (self.attrs['size'] - 1))
match_boundaries = \
zip(self.dump(self.fields['match'], first_match, last_match),
self.dump(self.fields['matchend'], first_match, last_match))
zip(self.dump(self.attrs['fields']['match'], first_match, last_match), # noqa
self.dump(self.attrs['fields']['matchend'], first_match, last_match)) # noqa
cpos_list = []
matches = []
for match_start, match_end in match_boundaries:
@ -64,10 +55,11 @@ class Subcorpus:
cpos_list_lbound = lc_lbound
else:
cpos_list_lbound = match_start
rc_lbound = min((match_end + 1), (self.corpus.size - 1))
rc_lbound = min((match_end + 1),
(self.collection.corpus.attrs['size'] - 1))
if rc_lbound != match_end:
rc_rbound = min((match_end + 1 + context),
(self.corpus.size - 1))
(self.collection.corpus.attrs['size'] - 1))
rc = (rc_lbound, rc_rbound)
cpos_list_rbound = rc_rbound
else:
@ -81,14 +73,30 @@ class Subcorpus:
else:
match = {'lc': lc, 'c': c, 'rc': rc}
matches.append(match)
lookups = self.corpus.lookups_by_cpos(cpos_list)
lookups = self.collection.corpus.lookups_by_cpos(cpos_list)
return {'matches': matches, **lookups}
def fdist_1(self, cutoff, field, attribute):
    """
    Call the API's ``cqp_fdist_1`` for this subcorpus.

    Args:
        cutoff: Passed through to ``cqp_fdist_1`` unchanged.
        field: Passed through to ``cqp_fdist_1`` unchanged.
        attribute: Attribute model; its ``attrs['api_name']`` is sent.

    Returns:
        Whatever ``cqp_fdist_1`` returns.
    """
    # Models keep their server-side name in ``attrs['api_name']``; they no
    # longer have the former ``_name`` instance attribute.
    return self.client.api.cqp_fdist_1(self.attrs['api_name'], cutoff,
                                       field, attribute.attrs['api_name'])
def fdist_2(self, cutoff, field_1, attribute_1, field_2, attribute_2):
    """
    Call the API's ``cqp_fdist_2`` for this subcorpus.

    Args:
        cutoff: Passed through to ``cqp_fdist_2`` unchanged.
        field_1: Passed through to ``cqp_fdist_2`` unchanged.
        attribute_1: Attribute model; its ``attrs['api_name']`` is sent.
        field_2: Passed through to ``cqp_fdist_2`` unchanged.
        attribute_2: Attribute model; its ``attrs['api_name']`` is sent.

    Returns:
        Whatever ``cqp_fdist_2`` returns.
    """
    # Models keep their server-side name in ``attrs['api_name']``; they no
    # longer have the former ``_name`` instance attribute.
    return self.client.api.cqp_fdist_2(
        self.attrs['api_name'], cutoff,
        field_1, attribute_1.attrs['api_name'],
        field_2, attribute_2.attrs['api_name'])
class SubcorpusCollection(Collection):
    """
    Collection of the subcorpora of one corpus.
    """
    model = Subcorpus

    def __init__(self, client=None, corpus=None):
        super().__init__(client=client)
        #: The corpus whose subcorpora this collection represents.
        self.corpus = corpus

    def get(self, subcorpus_name):
        """
        Build the Subcorpus model named ``subcorpus_name`` within the
        corpus of this collection.
        """
        attrs = self.model._attrs(self.client, self.corpus, subcorpus_name)
        return self.prepare_model(attrs)

    def list(self):
        """
        Build one Subcorpus model per name returned by the API's
        ``cqp_list_subcorpora`` call for the corpus of this collection.
        """
        corpus_api_name = self.corpus.attrs['api_name']
        subcorpora = []
        for name in self.client.api.cqp_list_subcorpora(corpus_api_name):
            attrs = self.model._attrs(self.client, self.corpus, name)
            subcorpora.append(self.prepare_model(attrs))
        return subcorpora

View File

@ -1,321 +0,0 @@
from .api import APIClient
from .api.specification import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
import time
class CQiWrapper(APIClient):
'''
CQiWrapper object
High level wrapper that groups and renames some functions of CQiClient
for ease of use. Also structures received data into python dictionaries.
Keyword arguments:
host -- host IP address or hostname where the cqp server is running
port -- port of the cqp server
username -- username used to connect to the cqp server
password -- password of the user to connect to the cqp server
'''
SUBCORPUS_NAMES = []
def __init__(self, host='127.0.0.1', port=4877, username='anonymous',
password=''):
super(CQiWrapper, self).__init__(host, port=port)
self.username = username
self.password = password
def connect(self):
'''
Connect with CQP server
Connects via socket to the CQP server using the given username and
password from class initiation.
'''
self.ctrl_connect(self.username, self.password)
def __create_attribute_strings(self):
'''
Creates all needed attribute strings to query for word, lemma etc. in
the given corpus.
For example: CORPUS_NAME.word to query words
Automaticalle creates strings for all pre defined tags.
'''
p_attrs = self.corpus_positional_attributes(self.corpus_name)
struct_attrs = self.corpus_structural_attributes(self.corpus_name)
self.attr_strings = {}
self.attr_strings['positional_attrs'] = {}
self.attr_strings['struct_attrs'] = {}
for p_attr in p_attrs:
self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name
+ '.'
+ p_attr)
for struct_attr in struct_attrs:
self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
+ '.'
+ struct_attr)
print(('All positional and '
'structural attributes: {}').format(self.attr_strings))
def select_corpus(self, corpus_name):
'''
Checks if given copus name exists. If it exists set it as the main
corpus name used to create the needed query attribute strings like
CORPUS_NAME.word.
'''
if corpus_name in self.corpus_list_coprora():
self.corpus_name = corpus_name
self.__create_attribute_strings()
print('{} does exist.'.format(corpus_name))
else:
print('{} does not exist.'.format(corpus_name))
raise Exception('Given Corpus Name is not in corpora list.')
def disconnect(self):
'''
Disconnect from CQP server
Disconnects from the CQP server. Closes used socket after disconnect.
'''
self.ctrl_bye()
print('Disconnected from cqp server.')
def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
'''
Create subcorpus
Input query will be used to create a subcorpus holding all cpos match
positions for that query.
Keyword arguments:
result_subcorpus_name -- set name of the subcorpus which holds all
cpos match positions, produced by the query
query -- query written in cqp query language
'''
self.query = query
self.cqp_query(self.corpus_name, result_subcorpus_name, query)
self.result_subcorpus = (self.corpus_name
+ ':'
+ result_subcorpus_name)
self.SUBCORPUS_NAMES.append(self.result_subcorpus)
self.match_count = self.cqp_subcorpus_size(self.result_subcorpus)
print('Nr of all matches is: {}'.format(self.match_count))
def show_subcorpora(self):
'''
Show all subcorpora currently saved by the cqp server.
'''
return self.cqp_list_subcorpora(self.corpus_name)
def show_query_results(self,
context_len=10,
result_len=1000,
result_offset=0):
'''
Show query results
Shows the actual matched strings produce by the query. Uses the cpos
match indexes to grab those strings. saves them into an orderd
dictionary. Also saves coresponding tags, lemmas and context. Gets those
informations using the corresponding cpos.
Keyword arguments:
context_len -- defines how many words before and after a match will be
shown (default 10)
result_len -- defines for how many matches all informations like lemma
and POS are being grabbed
result_offset -- defines the offset of the matches being requested. If
the offset is 100 informations for matches 100 to result_len are being
grabbed
'''
t0 = time.time()
self.context_len = context_len
self.corpus_max_len = self.cl_attribute_size(
self.attr_strings['positional_attrs']['word']
)
self.nr_matches = min(result_len, self.match_count)
if self.match_count == 0:
print('Query resulted in 0 matches.')
self.results = {'code': 0,
'result': {'matches': [],
'match_count': self.match_count,
'cpos_lookup': {},
'text_lookup': {}}
}
return self.results
else:
# Get match cpos boundries
# match_boundries shows the start and end cpos of one match as a
# pair of cpositions
# [(1355, 1357), (1477, 1479)] Example for two boundry pairs
offset_start = 0 if result_offset == 0 else result_offset
print('Offset start is: {}'.format(offset_start))
offset_end = min((self.nr_matches + result_offset - 1), self.match_count - 1)
print('Offset end is: {}'.format(offset_end))
match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus,
CONST_FIELD_MATCH,
offset_start,
offset_end),
self.cqp_dump_subcorpus(self.result_subcorpus,
CONST_FIELD_MATCHEND,
offset_start,
offset_end))
# Generate all cpos between match boundries including start and end
# boundries.
# Also generate cpos for left and right context.
# Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
# Also collect all cpos together in one list for the final request of
# all cpos informations
all_matches = []
all_cpos = []
for start, end in match_boundaries:
end += 1
lc_cpos = list(range(max([0, start - self.context_len]), start))
lc = {'lc': lc_cpos}
match_cpos = list(range(start, end))
match = {'hit': match_cpos}
rc_cpos = list(range(end, min([self.corpus_max_len,
end + self.context_len])))
rc = {'rc': rc_cpos}
lc.update(match)
lc.update(rc)
all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
all_matches.append(lc)
all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
len_all_cpos = len(all_cpos)
t1 = time.time()
t_total = t1 - t0
print('Time to create all CPOS for query: {}'.format(t_total))
print('Requesting {} CPOS with one query.'.format(len_all_cpos))
# Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
# all cpos entries in all_cpos_list
# Also saves these informations into self.results dict
t2 = time.time()
all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
t3 = time.time()
t_final = t3 - t2
print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
t_final))
self.results = {'code': 0,
'result': {'matches': all_matches,
'match_count': self.match_count,
'cpos_lookup': all_cpos_infos,
'text_lookup': text_lookup}
}
return self.results
def get_cpos_infos(self, all_cpos):
'''
Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
all cpos entries specified in the parameter all_cpos.
'''
# Get all positional attribute informations
cpos_infos = {}
for p_attr_key in self.attr_strings['positional_attrs'].keys():
match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos)
cpos_infos[p_attr_key] = match_strs
# Get all strucutural attribute informations
tmp_info = {}
structs_to_check = []
for struct_attr_key in self.attr_strings['struct_attrs'].keys():
key = self.attr_strings['struct_attrs'][struct_attr_key]
has_value = self.corpus_structural_attribute_has_values(key)
struct_ids = self.cl_cpos2struc(key, all_cpos)
if has_value is False: # Get IDs of strucutural elements without values (this means get IDs of XML tags. Struct elements only have values if they are XML attributes)
tmp_info[struct_attr_key] = []
for id in struct_ids:
tmp_info[struct_attr_key].append(id)
else:
structs_to_check.append({key: struct_attr_key})
print('Structs to check: {}'.format(structs_to_check))
struct_attr_values = list(tmp_info.values())
# print('Struct attr value list: {}'.format(struct_attr_values))
struct_attr_keys = list(tmp_info.keys())
# print('Struct attr key list: {}'.format(struct_attr_keys))
# Build textlookup dictionary
text_lookup_ids = list(set(struct_attr_values[0])) # every CPOS is associated with one text id. A set is build to only gather text_lookup informations for every unique text id
text_lookup = {} # final dict containing all info of one text identified by its id
for d in structs_to_check:
s_key, s_value = zip(*d.items())
print('dict entries: {}: {}'.format(s_key, s_value))
s_value = s_value[0].split('_', 1)[-1]
print('S_VALUE: {}'.format(s_value))
struct_values = self.cl_struc2str(s_key[0], text_lookup_ids)
print('Extracted Value with key {}: {}'.format(s_key[0], struct_values))
zipped = dict(zip(text_lookup_ids, struct_values))
for zip_key, zip_value in zipped.items():
print('Text id as key is: {}'.format(zip_key))
print('Value of this text is: {}'.format(zip_value))
check = text_lookup.get(zip_key)
print('check: {}'.format(check))
if check is None:
text_lookup[zip_key] = {s_value: zip_value}
else:
text_lookup[zip_key].update({s_value: zip_value})
# zip keys and values together
attr_values_list = []
attr_keys_list = []
for key in cpos_infos.keys():
attr_values_list.append(cpos_infos[key])
attr_keys_list.append(key)
attr_keys_list.extend(struct_attr_keys)
attr_values_list.extend(struct_attr_values)
joined_cpos_infos = zip(all_cpos, *attr_values_list)
dict_cpos_infos = {}
for info in joined_cpos_infos:
dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
return dict_cpos_infos, text_lookup
def get_sentences(self,
match_cpos_list,
get_surrounding_s=False,
l_r_s_context_additional_len=1):
'''
Get sentence informations for one match also set if and how much left
right context sentences should be grabbed surrounding the given CPOS.
'''
t0 = time.time()
key = self.corpus_name + '.s'
first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
context_sentences = {}
s_ids = self.cl_cpos2struc(key, [first_cpos, last_cpos])
print('s id match: {}'.format(s_ids))
for s_id in s_ids:
s_start, s_end = self.cl_struc2cpos(key, s_id)
s_cpos = list(range(s_start, s_end + 1))
context_sentences[s_id] = s_cpos
if get_surrounding_s:
max_s_id = self.cl_attribute_size(key) - 1
print('max sid: {}'.format(max_s_id))
additional_s_ids = []
additional_s = list(range(max(s_ids[0]
- l_r_s_context_additional_len,
0),
min(s_ids[-1]
+ l_r_s_context_additional_len,
max_s_id) + 1))
additional_s_ids.extend(additional_s)
for s_id in additional_s_ids:
print('s id additional: {}'.format(s_id))
s_start, s_end = self.cl_struc2cpos(key, s_id)
s_cpos = list(range(s_start, s_end + 1))
context_sentences[s_id] = s_cpos
all_cpos = []
for key in context_sentences.keys():
all_cpos.extend(context_sentences[key])
all_cpos = list(set(all_cpos))
all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
t1 = time.time()
t_total = t1 - t0
print('Got all sentences informations in {} seconds'. format(t_total))
match_context = {'context_s_cpos': context_sentences,
'cpos_lookup': all_cpos_infos,
'text_lookup': text_lookup,
'match_cpos_list': match_cpos_list}
return match_context

View File

@ -67,23 +67,23 @@ def corpus_analysis_query(query):
socketio.emit('corpus_analysis_query', response, room=request.sid)
return
response = {'code': 200, 'desc': None, 'msg': 'OK',
'payload': {**query_status, 'match_count': results.size}}
'payload': {**query_status, 'match_count': results.attrs['size']}}
socketio.emit('corpus_analysis_query', response, room=request.sid)
chunk_size = 100
chunk_start = 0
context = 100
progress = 0
client.status = 'running'
while chunk_start <= results.size:
while chunk_start <= results.attrs['size']:
if client.status == 'abort':
break
chunk = results.export(context=context, cutoff=chunk_size,
expand_lists=False, offset=chunk_start)
chunk['cpos_ranges'] = True
if (results.size == 0):
if (results.attrs['size'] == 0):
progress = 100
else:
progress = ((chunk_start + chunk_size) / results.size) * 100
progress = ((chunk_start + chunk_size) / results.attrs['size']) * 100
progress = min(100, int(math.ceil(progress)))
response = {'code': 200, 'desc': None, 'msg': 'OK',
'payload': {'chunk': chunk, 'progress': progress}}
@ -104,12 +104,12 @@ def corpus_analysis_inspect_match(payload):
return
# Get more context for given match CPOS
corpus = client.corpora.get('CORPUS')
s = corpus.attributes.structural.get('s')
s = corpus.structural_attributes.get('s')
match_context = s.export(payload['first_cpos'], payload['last_cpos'],
context=3, expand_lists=False)
match_context['cpos_ranges'] = True
socketio.emit('match_context',
{'payload': match_context}, room=request.sid)
socketio.emit('match_context', {'payload': match_context},
room=request.sid)
def corpus_analysis_session_handler(app, corpus_id, user_id, session_id):

View File

@ -381,7 +381,7 @@ class ResultsList extends List {
token = chunk["cpos_lookup"][cpos];
hitCellElement.insertAdjacentHTML("beforeend", `<span class="token" data-cpos="${cpos}">${token["word"]} </span>`);
// get text titles of every hit cpos token
textTitles.add(chunk["text_lookup"][token["text"]]["text_title"]);
textTitles.add(chunk["text_lookup"][token["text"]]["title"]);
// add button to trigger more context to every match td
var inspectBtn = document.createElement("a");
inspectBtn.setAttribute("class", "btn-floating btn-flat waves-effect waves-light grey right inspect disabled");