mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-12-24 10:34:17 +00:00
Merge branch 'development' of gitlab.ub.uni-bielefeld.de:sfb1288inf/opaque into development
This commit is contained in:
commit
b0104fe606
@ -1,7 +1,6 @@
|
|||||||
# flake8: noqa
|
# flake8: noqa
|
||||||
from .api import APIClient
|
from .api import APIClient
|
||||||
from .client import CQiClient
|
from .client import CQiClient
|
||||||
from .wrapper import CQiWrapper
|
|
||||||
from .version import version, version_info
|
from .version import version, version_info
|
||||||
|
|
||||||
|
|
||||||
|
@ -20,10 +20,11 @@ class APIClient:
|
|||||||
>>> client.ctrl_bye()
|
>>> client.ctrl_bye()
|
||||||
{'code': 259, 'msg': 'CQI_STATUS_BYE_OK'}
|
{'code': 259, 'msg': 'CQI_STATUS_BYE_OK'}
|
||||||
|
|
||||||
Args:
|
Attributes:
|
||||||
host (str): URL to the CQP server. For example,
|
host (str): URL to the CQP server. For example,
|
||||||
``cqpserver.localhost`` or ``127.0.0.1``.
|
``cqpserver.localhost`` or ``127.0.0.1``.
|
||||||
port (int): Port the CQP server listens on. Default: ``4877``
|
port (int): Port the CQP server listens on. Default: ``4877``
|
||||||
|
socket (socket.socket): Socket for communicating with a CQP server.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, host, port=4877):
|
def __init__(self, host, port=4877):
|
||||||
|
@ -15,8 +15,9 @@ class CQiClient:
|
|||||||
{'code': 260, 'msg': 'CQI_STATUS_PING_OK'}
|
{'code': 260, 'msg': 'CQI_STATUS_PING_OK'}
|
||||||
>>> client.disconnect()
|
>>> client.disconnect()
|
||||||
{'code': 259, 'msg': 'CQI_STATUS_BYE_OK'}
|
{'code': 259, 'msg': 'CQI_STATUS_BYE_OK'}
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
api (APIClient): A client pointing to the specified to the CQP server.
|
api (APIClient): An API client pointing to the specified CQP server.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, host, port=4877):
|
def __init__(self, host, port=4877):
|
||||||
@ -32,12 +33,14 @@ class CQiClient:
|
|||||||
|
|
||||||
def connect(self, username='anonymous', password=''):
    """
    Open a session on the CQP server.

    Args:
        username (str): User name handed to ``ctrl_connect``.
        password (str): Password handed to ``ctrl_connect``.

    Returns:
        The status returned by ``self.api.ctrl_connect``.
    """
    # No corpus collection is stored on the client any more; the
    # ``corpora`` property builds one on demand.
    status = self.api.ctrl_connect(username, password)
    return status
|
||||||
|
|
||||||
def disconnect(self):
    """
    Close the session on the CQP server.

    Returns:
        The status returned by ``self.api.ctrl_bye``.
    """
    return self.api.ctrl_bye()
|
||||||
|
|
||||||
def ping(self):
    """
    Ping the CQP server.

    Returns:
        The status returned by ``self.api.ctrl_ping``.
    """
    return self.api.ctrl_ping()
|
||||||
|
|
||||||
|
@property
def corpora(self):
    """
    A ``CorpusCollection`` bound to this client.

    A new collection object is created on every access.
    """
    return CorpusCollection(client=self)
|
||||||
|
@ -1,139 +1,119 @@
|
|||||||
class AttributeCollection:
|
from .ressource import Collection, Model
|
||||||
def __init__(self, client, corpus):
|
|
||||||
self.client = client
|
|
||||||
self.corpus = corpus
|
|
||||||
self.alignment = AlignmentAttributeCollection(client, corpus)
|
|
||||||
self.positional = PositionalAttributeCollection(client, corpus)
|
|
||||||
self.structural = StructuralAttributeCollection(client, corpus)
|
|
||||||
|
|
||||||
|
|
||||||
class Attribute(Model):
    """
    This is a class representing an attribute. Attributes denote the general
    category of information. A specific occurrence is identified by an Id.
    """

    # Key of ``attrs`` that serves as this model's identifier.
    id_attribute = 'api_name'

    @staticmethod
    def _attrs(client, corpus, name):
        """
        Build the ``attrs`` dict for the attribute ``name`` of ``corpus``.

        Args:
            client: Client whose ``api.cl_attribute_size`` is queried.
            corpus: Object whose ``attrs['api_name']`` prefixes the name.
            name (str): Attribute name without the corpus prefix.

        Returns:
            dict: ``api_name`` (``'<corpus api_name>.<name>'``), ``name`` and
            ``size`` (the result of ``cl_attribute_size`` for ``api_name``).
        """
        api_name = '{}.{}'.format(corpus.attrs['api_name'], name)
        return {'api_name': api_name,
                'name': name,
                'size': client.api.cl_attribute_size(api_name)}

    def drop(self):
        """
        Call ``cl_drop_attribute`` for this attribute and return its result.
        """
        return self.client.api.cl_drop_attribute(self.attrs['api_name'])
|
||||||
|
|
||||||
|
|
||||||
class AlignmentAttributeCollection:
|
class AttributeCollection(Collection):
|
||||||
def __init__(self, client, corpus):
|
model = Attribute
|
||||||
self.client = client
|
|
||||||
|
def __init__(self, client=None, corpus=None):
|
||||||
|
super(AttributeCollection, self).__init__(client=client)
|
||||||
self.corpus = corpus
|
self.corpus = corpus
|
||||||
|
|
||||||
def get(self, name):
|
def get(self, attribute_name):
|
||||||
return AlignmentAttribute(self.client, self.corpus, name)
|
return self.prepare_model(self.model._attrs(self.client, self.corpus,
|
||||||
|
attribute_name))
|
||||||
|
|
||||||
def list(self):
|
def list(self):
|
||||||
return [AlignmentAttribute(self.client, self.corpus, attr) for attr in
|
raise NotImplementedError
|
||||||
self.client.api.corpus_alignment_attributes(self.corpus.name)]
|
|
||||||
|
|
||||||
|
|
||||||
class AlignmentAttribute(Attribute):
    """
    An attribute model that forwards ``cl_alg2cpos`` / ``cl_cpos2alg`` calls
    to the API client, using ``attrs['api_name']`` as the attribute name.
    """

    def cpos_by_ids(self, id_list):
        """
        Pass ``id_list`` to ``cl_alg2cpos`` and return the result.
        """
        return self.client.api.cl_alg2cpos(self.attrs['api_name'], id_list)

    def ids_by_cpos(self, cpos_list):
        """
        Pass ``cpos_list`` to ``cl_cpos2alg`` and return the result.
        """
        return self.client.api.cl_cpos2alg(self.attrs['api_name'], cpos_list)
|
||||||
|
|
||||||
|
|
||||||
class PositionalAttributeCollection:
|
class AlignmentAttributeCollection(AttributeCollection):
|
||||||
def __init__(self, client, corpus):
|
model = AlignmentAttribute
|
||||||
self.client = client
|
|
||||||
self.corpus = corpus
|
|
||||||
|
|
||||||
def get(self, name):
|
|
||||||
return PositionalAttribute(self.client, self.corpus, name)
|
|
||||||
|
|
||||||
def list(self):
|
def list(self):
|
||||||
return [PositionalAttribute(self.client, self.corpus, attr) for attr in
|
return [self.prepare_model(self.model._attrs(self.client, self.corpus, attr)) # noqa
|
||||||
self.client.api.corpus_positional_attributes(self.corpus.name)]
|
for attr in self.client.api.corpus_alignment_attributes(self.corpus.attrs['api_name'])] # noqa
|
||||||
|
|
||||||
|
|
||||||
class PositionalAttribute(Attribute):
    """
    An attribute model that adds ``lexicon_size`` to ``attrs`` and forwards
    the ``cl_*`` lookups below to the API client, always passing
    ``attrs['api_name']`` as the attribute name.
    """

    @staticmethod
    def _attrs(client, corpus, name):
        """
        Extend the parent's ``attrs`` dict with ``lexicon_size``.

        Args:
            client: Client whose ``api`` is queried.
            corpus: Corpus model the attribute belongs to.
            name (str): Attribute name without the corpus prefix.

        Returns:
            dict: The dict built by the parent ``_attrs`` plus
            ``lexicon_size`` (the result of ``cl_lexicon_size``).
        """
        # Two-argument ``super`` is required here: a staticmethod has no
        # ``self``/``cls`` for the zero-argument form to bind to.
        attrs = super(PositionalAttribute, PositionalAttribute)._attrs(
            client, corpus, name)
        attrs['lexicon_size'] = client.api.cl_lexicon_size(attrs['api_name'])
        return attrs

    def cpos_by_id(self, id):
        """
        Pass ``id`` to ``cl_id2cpos`` and return the result.
        """
        return self.client.api.cl_id2cpos(self.attrs['api_name'], id)

    def cpos_by_ids(self, id_list):
        """
        Pass ``id_list`` to ``cl_idlist2cpos`` and return the result.
        """
        return self.client.api.cl_idlist2cpos(self.attrs['api_name'], id_list)

    def freqs_by_ids(self, id_list):
        """
        Pass ``id_list`` to ``cl_id2freq`` and return the result.
        """
        return self.client.api.cl_id2freq(self.attrs['api_name'], id_list)

    def ids_by_cpos(self, cpos_list):
        """
        Pass ``cpos_list`` to ``cl_cpos2id`` and return the result.
        """
        return self.client.api.cl_cpos2id(self.attrs['api_name'], cpos_list)

    def ids_by_regex(self, regex):
        """
        Pass ``regex`` to ``cl_regex2id`` and return the result.
        """
        return self.client.api.cl_regex2id(self.attrs['api_name'], regex)

    def ids_by_values(self, value_list):
        """
        Pass ``value_list`` to ``cl_str2id`` and return the result.
        """
        return self.client.api.cl_str2id(self.attrs['api_name'], value_list)

    def values_by_cpos(self, cpos_list):
        """
        Pass ``cpos_list`` to ``cl_cpos2str`` and return the result.
        """
        return self.client.api.cl_cpos2str(self.attrs['api_name'], cpos_list)

    def values_by_ids(self, id_list):
        """
        Pass ``id_list`` to ``cl_id2str`` and return the result.
        """
        return self.client.api.cl_id2str(self.attrs['api_name'], id_list)
|
||||||
|
|
||||||
|
|
||||||
class StructuralAttributeCollection:
|
class PositionalAttributeCollection(AttributeCollection):
|
||||||
def __init__(self, client, corpus):
|
model = PositionalAttribute
|
||||||
self.client = client
|
|
||||||
self.corpus = corpus
|
|
||||||
|
|
||||||
def get(self, name):
|
def list(self):
|
||||||
return StructuralAttribute(self.client, self.corpus, name)
|
return [self.prepare_model(self.model._attrs(self.client, self.corpus, attr)) # noqa
|
||||||
|
for attr in self.client.api.corpus_positional_attributes(self.corpus.attrs['api_name'])] # noqa
|
||||||
def list(self, filters={}):
|
|
||||||
attrs = [StructuralAttribute(self.client, self.corpus, attr) for attr
|
|
||||||
in self.client.api.corpus_structural_attributes(
|
|
||||||
self.corpus.name)]
|
|
||||||
for k, v in filters.items():
|
|
||||||
if k == 'part_of':
|
|
||||||
attrs = list(filter(lambda x: x.name.startswith(v.name + '_'),
|
|
||||||
attrs))
|
|
||||||
return attrs
|
|
||||||
|
|
||||||
|
|
||||||
class StructuralAttribute(Attribute):
|
class StructuralAttribute(Attribute):
|
||||||
def __init__(self, client, corpus, name):
|
@staticmethod
|
||||||
super(StructuralAttribute, self).__init__(client, corpus, name)
|
def _attrs(client, corpus, name):
|
||||||
self.has_values = \
|
attrs = super(StructuralAttribute, StructuralAttribute)._attrs(client, corpus, name) # noqa
|
||||||
client.api.corpus_structural_attribute_has_values(self._name)
|
attrs['has_values'] = client.api.corpus_structural_attribute_has_values(attrs['api_name']) # noqa
|
||||||
|
return attrs
|
||||||
|
|
||||||
def cpos_by_id(self, id):
    """
    Pass ``id`` to ``cl_struc2cpos`` for this attribute and return the
    result.
    """
    return self.client.api.cl_struc2cpos(self.attrs['api_name'], id)
|
||||||
|
|
||||||
def ids_by_cpos(self, cpos_list):
    """
    Pass ``cpos_list`` to ``cl_cpos2struc`` for this attribute and return
    the result.
    """
    return self.client.api.cl_cpos2struc(self.attrs['api_name'], cpos_list)
|
||||||
|
|
||||||
def lbound_by_cpos(self, cpos_list):
    """
    Pass ``cpos_list`` to ``cl_cpos2lbound`` for this attribute and return
    the result.
    """
    return self.client.api.cl_cpos2lbound(self.attrs['api_name'],
                                          cpos_list)
|
||||||
|
|
||||||
def rbound_by_cpos(self, cpos_list):
    """
    Pass ``cpos_list`` to ``cl_cpos2rbound`` for this attribute and return
    the result.
    """
    return self.client.api.cl_cpos2rbound(self.attrs['api_name'],
                                          cpos_list)
|
||||||
|
|
||||||
def values_by_ids(self, id_list):
    """
    Pass ``id_list`` to ``cl_struc2str`` for this attribute and return the
    result.
    """
    return self.client.api.cl_struc2str(self.attrs['api_name'], id_list)
|
||||||
|
|
||||||
def export(self, first_cpos, last_cpos, context=0, expand_lists=False):
|
def export(self, first_cpos, last_cpos, context=0, expand_lists=False):
|
||||||
first_id, last_id = self.ids_by_cpos([first_cpos, last_cpos])
|
first_id, last_id = self.ids_by_cpos([first_cpos, last_cpos])
|
||||||
@ -150,9 +130,10 @@ class StructuralAttribute(Attribute):
|
|||||||
else:
|
else:
|
||||||
cpos_list_lbound = first_cpos
|
cpos_list_lbound = first_cpos
|
||||||
rc_rbound = \
|
rc_rbound = \
|
||||||
self.cpos_by_id(min((last_id + context), (self.size - 1)))[1]
|
self.cpos_by_id(min((last_id + context), (self.attrs['size'] - 1)))[1]
|
||||||
if rc_rbound != last_cpos:
|
if rc_rbound != last_cpos:
|
||||||
rc_lbound = min((last_cpos + 1), (self.corpus.size - 1))
|
rc_lbound = min((last_cpos + 1),
|
||||||
|
(self.collection.corpus.attrs['size'] - 1))
|
||||||
rc = (rc_lbound, rc_rbound)
|
rc = (rc_lbound, rc_rbound)
|
||||||
cpos_list_rbound = rc_rbound
|
cpos_list_rbound = rc_rbound
|
||||||
else:
|
else:
|
||||||
@ -164,5 +145,17 @@ class StructuralAttribute(Attribute):
|
|||||||
'rc': list(range(rc[0], (rc[1] + 1)))}
|
'rc': list(range(rc[0], (rc[1] + 1)))}
|
||||||
else:
|
else:
|
||||||
match = {'lc': lc, 'c': c, 'rc': rc}
|
match = {'lc': lc, 'c': c, 'rc': rc}
|
||||||
lookups = self.corpus.lookups_by_cpos(cpos_list)
|
lookups = self.collection.corpus.lookups_by_cpos(cpos_list)
|
||||||
return {'match': match, **lookups}
|
return {'match': match, **lookups}
|
||||||
|
|
||||||
|
|
||||||
|
class StructuralAttributeCollection(AttributeCollection):
    """
    Collection of the structural attributes of one corpus.
    """

    model = StructuralAttribute

    def list(self, filters=None):
        """
        List the structural attributes of ``self.corpus``.

        Args:
            filters (dict): Optional filters. Only the key ``part_of`` is
                evaluated: its value is a model, and only attributes whose
                ``name`` starts with ``'<that model's name>_'`` are kept.
                Any other key is ignored.

        Returns:
            list: One prepared model per name returned by
            ``corpus_structural_attributes`` that passes the filters.
        """
        corpus_api_name = self.corpus.attrs['api_name']
        attrs = [
            self.prepare_model(
                self.model._attrs(self.client, self.corpus, attr))
            for attr in
            self.client.api.corpus_structural_attributes(corpus_api_name)
        ]
        # ``None`` replaces the former shared ``{}`` default argument.
        for key, value in (filters or {}).items():
            if key == 'part_of':
                prefix = value.attrs['name'] + '_'
                attrs = [attr for attr in attrs
                         if attr.attrs['name'].startswith(prefix)]
        return attrs
|
||||||
|
@ -1,30 +1,24 @@
|
|||||||
from .attributes import AttributeCollection
|
from .attributes import (AlignmentAttributeCollection,
|
||||||
|
PositionalAttributeCollection,
|
||||||
|
StructuralAttributeCollection)
|
||||||
|
from .ressource import Collection, Model
|
||||||
from .subcorpora import SubcorpusCollection
|
from .subcorpora import SubcorpusCollection
|
||||||
|
|
||||||
|
|
||||||
class CorpusCollection:
|
class Corpus(Model):
|
||||||
def __init__(self, client):
|
id_attribute = 'api_name'
|
||||||
self.client = client
|
|
||||||
|
|
||||||
def get(self, name):
|
@staticmethod
|
||||||
return Corpus(self.client, name)
|
def _attrs(client, name):
|
||||||
|
api_name = name
|
||||||
def list(self):
|
return {'api_name': api_name,
|
||||||
return [Corpus(self.client, corpus) for corpus in
|
'name': name,
|
||||||
self.client.api.corpus_list_coprora()]
|
'size': client.api.cl_attribute_size(
|
||||||
|
'{}.word'.format(api_name)),
|
||||||
|
# 'info': client.api.corpus_info(name),
|
||||||
class Corpus:
|
'charset': client.api.corpus_charset(api_name),
|
||||||
def __init__(self, client, name):
|
# 'full_name' = client.api.corpus_full_name(name),
|
||||||
self.client = client
|
'properties': client.api.corpus_properties(api_name)}
|
||||||
self.name = name
|
|
||||||
self.size = client.api.cl_attribute_size('{}.word'.format(name))
|
|
||||||
# self.info = client.api.corpus_info(name)
|
|
||||||
self.charset = client.api.corpus_charset(name)
|
|
||||||
# self.full_name = client.api.corpus_full_name(name)
|
|
||||||
self.properties = client.api.corpus_properties(name)
|
|
||||||
self.attributes = AttributeCollection(client, self)
|
|
||||||
self.subcorpora = SubcorpusCollection(client, self)
|
|
||||||
|
|
||||||
def lookups_by_cpos(self, cpos_list):
|
def lookups_by_cpos(self, cpos_list):
|
||||||
cpos_list = list(set(cpos_list))
|
cpos_list = list(set(cpos_list))
|
||||||
@ -33,35 +27,64 @@ class Corpus:
|
|||||||
lookups['cpos_lookup'] = {}
|
lookups['cpos_lookup'] = {}
|
||||||
for cpos in cpos_list:
|
for cpos in cpos_list:
|
||||||
lookups['cpos_lookup'][cpos] = {}
|
lookups['cpos_lookup'][cpos] = {}
|
||||||
for attr in self.attributes.positional.list():
|
for attr in self.positional_attributes.list():
|
||||||
cpos_attr_values = attr.values_by_cpos(cpos_list)
|
cpos_attr_values = attr.values_by_cpos(cpos_list)
|
||||||
for i, cpos in enumerate(cpos_list):
|
for i, cpos in enumerate(cpos_list):
|
||||||
lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_values[i]
|
lookups['cpos_lookup'][cpos][attr.attrs['name']] = cpos_attr_values[i]
|
||||||
for attr in self.attributes.structural.list():
|
for attr in self.structural_attributes.list():
|
||||||
if attr.has_values:
|
if attr.attrs['has_values']:
|
||||||
continue
|
continue
|
||||||
cpos_attr_ids = attr.ids_by_cpos(cpos_list)
|
cpos_attr_ids = attr.ids_by_cpos(cpos_list)
|
||||||
for i, cpos in enumerate(cpos_list):
|
for i, cpos in enumerate(cpos_list):
|
||||||
if cpos_attr_ids[i] != -1:
|
if cpos_attr_ids[i] != -1:
|
||||||
lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_ids[i]
|
lookups['cpos_lookup'][cpos][attr.attrs['name']] = cpos_attr_ids[i]
|
||||||
occured_attr_ids = list(filter(lambda x: x != -1,
|
occured_attr_ids = list(filter(lambda x: x != -1,
|
||||||
set(cpos_attr_ids)))
|
set(cpos_attr_ids)))
|
||||||
if not occured_attr_ids:
|
if not occured_attr_ids:
|
||||||
continue
|
continue
|
||||||
subattrs = \
|
subattrs = \
|
||||||
self.attributes.structural.list(filters={'part_of': attr})
|
self.structural_attributes.list(filters={'part_of': attr})
|
||||||
if not subattrs:
|
if not subattrs:
|
||||||
continue
|
continue
|
||||||
lookup_name = '{}_lookup'.format(attr.name)
|
lookup_name = '{}_lookup'.format(attr.attrs['name'])
|
||||||
lookups[lookup_name] = {}
|
lookups[lookup_name] = {}
|
||||||
for attr_id in occured_attr_ids:
|
for attr_id in occured_attr_ids:
|
||||||
lookups[lookup_name][attr_id] = {}
|
lookups[lookup_name][attr_id] = {}
|
||||||
for subattr in subattrs:
|
for subattr in subattrs:
|
||||||
subattr_values = subattr.values_by_ids(occured_attr_ids)
|
subattr_values = subattr.values_by_ids(occured_attr_ids)
|
||||||
for i, subattr_value in enumerate(subattr_values):
|
for i, subattr_value in enumerate(subattr_values):
|
||||||
lookups[lookup_name][occured_attr_ids[i]][subattr.name] = \
|
subattr_name = subattr.attrs['name'][(len(attr.attrs['name']) + 1):]
|
||||||
|
lookups[lookup_name][occured_attr_ids[i]][subattr_name] = \
|
||||||
subattr_value
|
subattr_value
|
||||||
return lookups
|
return lookups
|
||||||
|
|
||||||
def query(self, query, subcorpus_name='Results'):
    """
    Run ``query`` against this corpus through ``cqp_query``.

    Args:
        query (str): Query string handed to the API unchanged.
        subcorpus_name (str): Name handed to ``cqp_query`` as its second
            argument.

    Returns:
        The result of ``self.client.api.cqp_query``.
    """
    return self.client.api.cqp_query(self.attrs['api_name'],
                                     subcorpus_name, query)
|
||||||
|
|
||||||
|
@property
def alignment_attributes(self):
    """
    An ``AlignmentAttributeCollection`` for this corpus.

    A new collection object is created on every access.
    """
    return AlignmentAttributeCollection(client=self.client, corpus=self)
|
||||||
|
|
||||||
|
@property
def positional_attributes(self):
    """
    A ``PositionalAttributeCollection`` for this corpus.

    A new collection object is created on every access.
    """
    return PositionalAttributeCollection(client=self.client, corpus=self)
|
||||||
|
|
||||||
|
@property
def structural_attributes(self):
    """
    A ``StructuralAttributeCollection`` for this corpus.

    A new collection object is created on every access.
    """
    return StructuralAttributeCollection(client=self.client, corpus=self)
|
||||||
|
|
||||||
|
@property
def subcorpora(self):
    """
    A ``SubcorpusCollection`` for this corpus.

    A new collection object is created on every access.
    """
    return SubcorpusCollection(client=self.client, corpus=self)
|
||||||
|
|
||||||
|
|
||||||
|
class CorpusCollection(Collection):
    """
    Collection of the corpora reachable through ``self.client``.
    """

    model = Corpus

    def get(self, corpus_name):
        """
        Build the ``Corpus`` model for ``corpus_name``.

        Args:
            corpus_name (str): Name handed to ``Corpus._attrs``.

        Returns:
            The model produced by ``prepare_model``.
        """
        return self.prepare_model(self.model._attrs(self.client, corpus_name))

    def list(self):
        """
        Build one model per name returned by the API's corpus listing.
        """
        # ``corpus_list_coprora`` is spelled the way the API client spells it.
        return [self.prepare_model(self.model._attrs(self.client, corpus))
                for corpus in self.client.api.corpus_list_coprora()]
|
||||||
|
78
app/corpora/cqi/models/ressource.py
Normal file
78
app/corpora/cqi/models/ressource.py
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
class Model:
    """
    A base class for representing a single object on the server.
    """
    # Key of ``attrs`` that holds the identifier; subclasses override it.
    id_attribute = 'Id'

    def __init__(self, attrs=None, client=None, collection=None):
        #: A client pointing at the server that this object is on.
        self.client = client

        #: The collection that this model is part of.
        self.collection = collection

        #: The raw representation of this object from the API
        self.attrs = attrs or {}

    @staticmethod
    def _attrs(client, key):
        """
        Build the ``attrs`` dict for ``key``. Subclasses must override this.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def __repr__(self):
        return "<{}: {}>".format(self.__class__.__name__, self.id)

    def __eq__(self, other):
        # NOTE(review): a subclass instance with the same id compares equal
        # here, while ``__hash__`` includes the concrete class name. Confirm
        # that mixed-class comparisons never share a set or dict.
        return isinstance(other, self.__class__) and self.id == other.id

    def __hash__(self):
        return hash("{}:{}".format(self.__class__.__name__, self.id))

    @property
    def id(self):
        """
        The ID of the object.
        """
        # ``dict.get`` yields ``None`` when the identifier key is missing.
        return self.attrs.get(self.id_attribute)

    def reload(self):
        """
        Load this object from the server again and update ``attrs`` with the
        new data.
        """
        new_model = self.collection.get(self.id)
        self.attrs = new_model.attrs
|
||||||
|
|
||||||
|
|
||||||
|
class Collection:
    """
    A base class for representing all objects of a particular type on the
    server.
    """

    #: The type of object this collection represents, set by subclasses
    model = None

    def __init__(self, client=None):
        #: The client pointing at the server that this collection of objects
        #: is on.
        self.client = client

    def list(self):
        """
        List the objects of this collection. Subclasses must override this.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def get(self, key):
        """
        Fetch one object by ``key``. Subclasses must override this.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def prepare_model(self, attrs):
        """
        Create a model from a set of attributes.

        Args:
            attrs: Either an existing ``Model`` instance, which is re-bound
                to this collection and its client, or a ``dict`` from which
                a new ``self.model`` instance is built.

        Returns:
            The re-bound or newly created model.

        Raises:
            Exception: If ``attrs`` is neither a ``Model`` nor a ``dict``.
        """
        if isinstance(attrs, Model):
            attrs.client = self.client
            attrs.collection = self
            return attrs
        elif isinstance(attrs, dict):
            return self.model(attrs=attrs, client=self.client, collection=self)
        else:
            raise Exception("Can't create {} from {}".format(
                self.model.__name__, attrs))
|
@ -1,54 +1,45 @@
|
|||||||
|
from .ressource import Collection, Model
|
||||||
from ..api.specification import (CONST_FIELD_KEYWORD, CONST_FIELD_MATCH,
|
from ..api.specification import (CONST_FIELD_KEYWORD, CONST_FIELD_MATCH,
|
||||||
CONST_FIELD_MATCHEND, CONST_FIELD_TARGET)
|
CONST_FIELD_MATCHEND, CONST_FIELD_TARGET)
|
||||||
|
|
||||||
|
|
||||||
class SubcorpusCollection:
|
class Subcorpus(Model):
|
||||||
def __init__(self, client, corpus):
|
id_attribute = 'api_name'
|
||||||
self.client = client
|
|
||||||
self.corpus = corpus
|
|
||||||
|
|
||||||
def get(self, name):
|
@staticmethod
|
||||||
return Subcorpus(self.client, self.corpus, name)
|
def _attrs(client, corpus, name):
|
||||||
|
api_name = '{}:{}'.format(corpus.attrs['api_name'], name)
|
||||||
def list(self):
|
fields = {}
|
||||||
return [Subcorpus(self.client, self.corpus, subcorpus) for subcorpus in
|
if client.api.cqp_subcorpus_has_field(api_name, CONST_FIELD_MATCH):
|
||||||
self.client.api.cqp_list_subcorpora(self.corpus.name)]
|
fields['match'] = CONST_FIELD_MATCH
|
||||||
|
if client.api.cqp_subcorpus_has_field(api_name,
|
||||||
|
|
||||||
class Subcorpus:
|
|
||||||
def __init__(self, client, corpus, name):
|
|
||||||
self.client = client
|
|
||||||
self.corpus = corpus
|
|
||||||
self.name = name
|
|
||||||
self._name = '{}:{}'.format(corpus.name, name)
|
|
||||||
self.fields = {}
|
|
||||||
if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_MATCH):
|
|
||||||
self.fields['match'] = CONST_FIELD_MATCH
|
|
||||||
if client.api.cqp_subcorpus_has_field(self._name,
|
|
||||||
CONST_FIELD_MATCHEND):
|
CONST_FIELD_MATCHEND):
|
||||||
self.fields['matchend'] = CONST_FIELD_MATCHEND
|
fields['matchend'] = CONST_FIELD_MATCHEND
|
||||||
if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_TARGET):
|
if client.api.cqp_subcorpus_has_field(api_name, CONST_FIELD_TARGET):
|
||||||
self.fields['target'] = CONST_FIELD_TARGET
|
fields['target'] = CONST_FIELD_TARGET
|
||||||
if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_KEYWORD):
|
if client.api.cqp_subcorpus_has_field(api_name, CONST_FIELD_KEYWORD):
|
||||||
self.fields['keyword'] = CONST_FIELD_KEYWORD
|
fields['keyword'] = CONST_FIELD_KEYWORD
|
||||||
self.size = client.api.cqp_subcorpus_size(self._name)
|
return {'api_name': api_name,
|
||||||
|
'name': name,
|
||||||
|
'fields': fields,
|
||||||
|
'size': client.api.cqp_subcorpus_size(api_name)}
|
||||||
|
|
||||||
def drop(self):
    """
    Call ``cqp_drop_subcorpus`` for this subcorpus and return its result.
    """
    return self.client.api.cqp_drop_subcorpus(self.attrs['api_name'])
|
||||||
|
|
||||||
def dump(self, field, first, last):
    """
    Call ``cqp_dump_subcorpus`` for this subcorpus.

    Args:
        field: Field constant handed to the API unchanged.
        first: First index handed to the API unchanged.
        last: Last index handed to the API unchanged.

    Returns:
        The result of ``self.client.api.cqp_dump_subcorpus``.
    """
    return self.client.api.cqp_dump_subcorpus(self.attrs['api_name'],
                                              field, first, last)
|
||||||
|
|
||||||
def export(self, context=25, cutoff=float('inf'), expand_lists=False,
|
def export(self, context=25, cutoff=float('inf'), expand_lists=False,
|
||||||
offset=0):
|
offset=0):
|
||||||
if self.size == 0:
|
if self.attrs['size'] == 0:
|
||||||
return {"matches": []}
|
return {"matches": []}
|
||||||
first_match = max(0, offset)
|
first_match = max(0, offset)
|
||||||
last_match = min((offset + cutoff - 1), (self.size - 1))
|
last_match = min((offset + cutoff - 1), (self.attrs['size'] - 1))
|
||||||
match_boundaries = \
|
match_boundaries = \
|
||||||
zip(self.dump(self.fields['match'], first_match, last_match),
|
zip(self.dump(self.attrs['fields']['match'], first_match, last_match), # noqa
|
||||||
self.dump(self.fields['matchend'], first_match, last_match))
|
self.dump(self.attrs['fields']['matchend'], first_match, last_match)) # noqa
|
||||||
cpos_list = []
|
cpos_list = []
|
||||||
matches = []
|
matches = []
|
||||||
for match_start, match_end in match_boundaries:
|
for match_start, match_end in match_boundaries:
|
||||||
@ -64,10 +55,11 @@ class Subcorpus:
|
|||||||
cpos_list_lbound = lc_lbound
|
cpos_list_lbound = lc_lbound
|
||||||
else:
|
else:
|
||||||
cpos_list_lbound = match_start
|
cpos_list_lbound = match_start
|
||||||
rc_lbound = min((match_end + 1), (self.corpus.size - 1))
|
rc_lbound = min((match_end + 1),
|
||||||
|
(self.collection.corpus.attrs['size'] - 1))
|
||||||
if rc_lbound != match_end:
|
if rc_lbound != match_end:
|
||||||
rc_rbound = min((match_end + 1 + context),
|
rc_rbound = min((match_end + 1 + context),
|
||||||
(self.corpus.size - 1))
|
(self.collection.corpus.attrs['size'] - 1))
|
||||||
rc = (rc_lbound, rc_rbound)
|
rc = (rc_lbound, rc_rbound)
|
||||||
cpos_list_rbound = rc_rbound
|
cpos_list_rbound = rc_rbound
|
||||||
else:
|
else:
|
||||||
@ -81,14 +73,30 @@ class Subcorpus:
|
|||||||
else:
|
else:
|
||||||
match = {'lc': lc, 'c': c, 'rc': rc}
|
match = {'lc': lc, 'c': c, 'rc': rc}
|
||||||
matches.append(match)
|
matches.append(match)
|
||||||
lookups = self.corpus.lookups_by_cpos(cpos_list)
|
lookups = self.collection.corpus.lookups_by_cpos(cpos_list)
|
||||||
return {'matches': matches, **lookups}
|
return {'matches': matches, **lookups}
|
||||||
|
|
||||||
def fdist_1(self, cutoff, field, attribute):
    """
    Call ``cqp_fdist_1`` for this subcorpus.

    Args:
        cutoff: Handed to the API unchanged.
        field: Field constant handed to the API unchanged.
        attribute: Attribute model; its ``attrs['api_name']`` is passed on.

    Returns:
        The result of ``self.client.api.cqp_fdist_1``.
    """
    # Attribute models keep their qualified name in ``attrs['api_name']``;
    # the former ``_name`` instance attribute is no longer set on them.
    return self.client.api.cqp_fdist_1(self.attrs['api_name'], cutoff,
                                       field, attribute.attrs['api_name'])
|
||||||
|
|
||||||
def fdist_2(self, cutoff, field_1, attribute_1, field_2, attribute_2):
    """
    Call ``cqp_fdist_2`` for this subcorpus.

    Args:
        cutoff: Handed to the API unchanged.
        field_1: First field constant, handed to the API unchanged.
        attribute_1: Attribute model; its ``attrs['api_name']`` is passed on.
        field_2: Second field constant, handed to the API unchanged.
        attribute_2: Attribute model; its ``attrs['api_name']`` is passed on.

    Returns:
        The result of ``self.client.api.cqp_fdist_2``.
    """
    # Attribute models keep their qualified name in ``attrs['api_name']``;
    # the former ``_name`` instance attribute is no longer set on them.
    return self.client.api.cqp_fdist_2(self.attrs['api_name'], cutoff,
                                       field_1, attribute_1.attrs['api_name'],
                                       field_2, attribute_2.attrs['api_name'])
|
||||||
|
|
||||||
|
|
||||||
|
class SubcorpusCollection(Collection):
    """
    Collection of the subcorpora of one corpus.
    """

    model = Subcorpus

    def __init__(self, client=None, corpus=None):
        super(SubcorpusCollection, self).__init__(client=client)
        #: The corpus model whose subcorpora this collection represents.
        self.corpus = corpus

    def get(self, subcorpus_name):
        """
        Build the ``Subcorpus`` model for ``subcorpus_name``.

        Args:
            subcorpus_name (str): Name handed to ``Subcorpus._attrs``.

        Returns:
            The model produced by ``prepare_model``.
        """
        return self.prepare_model(self.model._attrs(self.client, self.corpus,
                                                    subcorpus_name))

    def list(self):
        """
        Build one model per name returned by ``cqp_list_subcorpora`` for
        ``self.corpus``.
        """
        corpus_api_name = self.corpus.attrs['api_name']
        return [
            self.prepare_model(
                self.model._attrs(self.client, self.corpus, subcorpus))
            for subcorpus in
            self.client.api.cqp_list_subcorpora(corpus_api_name)
        ]
|
||||||
|
@ -1,321 +0,0 @@
|
|||||||
from .api import APIClient
|
|
||||||
from .api.specification import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
|
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
class CQiWrapper(APIClient):
|
|
||||||
'''
|
|
||||||
CQIiWrapper object
|
|
||||||
|
|
||||||
High level wrapper that groups and renames some functions of CQiClient
|
|
||||||
for ease of use. Also structures recieved data into python dictionaries.
|
|
||||||
|
|
||||||
Keyword arguments:
|
|
||||||
host -- host IP adress or hostname wher the cqp server is running
|
|
||||||
port -- port of the cqp server
|
|
||||||
username -- username used to connect to the cqp server
|
|
||||||
password -- password of the user to connect to the cqp server
|
|
||||||
'''
|
|
||||||
|
|
||||||
SUBCORPUS_NAMES = []
|
|
||||||
|
|
||||||
def __init__(self, host='127.0.0.1', port=4877, username='anonymous',
|
|
||||||
password=''):
|
|
||||||
super(CQiWrapper, self).__init__(host, port=port)
|
|
||||||
self.username = username
|
|
||||||
self.password = password
|
|
||||||
|
|
||||||
def connect(self):
|
|
||||||
'''
|
|
||||||
Connect with CQP server
|
|
||||||
|
|
||||||
Connects via socket to the CQP server using the given username and
|
|
||||||
password from class initiation.
|
|
||||||
'''
|
|
||||||
self.ctrl_connect(self.username, self.password)
|
|
||||||
|
|
||||||
def __create_attribute_strings(self):
|
|
||||||
'''
|
|
||||||
Creates all needed attribute strings to query for word, lemma etc. in
|
|
||||||
the given corpus.
|
|
||||||
For example: CORPUS_NAME.word to query words
|
|
||||||
Automaticalle creates strings for all pre defined tags.
|
|
||||||
'''
|
|
||||||
p_attrs = self.corpus_positional_attributes(self.corpus_name)
|
|
||||||
struct_attrs = self.corpus_structural_attributes(self.corpus_name)
|
|
||||||
self.attr_strings = {}
|
|
||||||
self.attr_strings['positional_attrs'] = {}
|
|
||||||
self.attr_strings['struct_attrs'] = {}
|
|
||||||
for p_attr in p_attrs:
|
|
||||||
self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name
|
|
||||||
+ '.'
|
|
||||||
+ p_attr)
|
|
||||||
for struct_attr in struct_attrs:
|
|
||||||
self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
|
|
||||||
+ '.'
|
|
||||||
+ struct_attr)
|
|
||||||
print(('All positional and '
|
|
||||||
'structural attributes: {}').format(self.attr_strings))
|
|
||||||
|
|
||||||
def select_corpus(self, corpus_name):
|
|
||||||
'''
|
|
||||||
Checks if given copus name exists. If it exists set it as the main
|
|
||||||
corpus name used to create the needed query attribute strings like
|
|
||||||
CORPUS_NAME.word.
|
|
||||||
'''
|
|
||||||
if corpus_name in self.corpus_list_coprora():
|
|
||||||
self.corpus_name = corpus_name
|
|
||||||
self.__create_attribute_strings()
|
|
||||||
print('{} does exist.'.format(corpus_name))
|
|
||||||
else:
|
|
||||||
print('{} does not exist.'.format(corpus_name))
|
|
||||||
raise Exception('Given Corpus Name is not in corpora list.')
|
|
||||||
|
|
||||||
def disconnect(self):
    '''
    End the session with the CQP server.

    Sends ``ctrl_bye`` and prints a confirmation. Nothing is returned.
    NOTE(review): the socket is presumably closed inside ``ctrl_bye``;
    that is not visible in this method, so confirm it there.
    '''
    self.ctrl_bye()
    confirmation = 'Disconnected from cqp server.'
    print(confirmation)
|
|
||||||
|
|
||||||
def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
    '''
    Run a query and keep its matches as a subcorpus.

    The query is executed against ``self.corpus_name``. The resulting
    subcorpus is remembered as ``CORPUS_NAME:result_subcorpus_name`` in
    ``self.result_subcorpus``, appended to ``self.SUBCORPUS_NAMES``, and
    its size is stored in ``self.match_count`` and printed.

    Keyword arguments:
    query -- query written in cqp query language
    result_subcorpus_name -- name of the subcorpus that holds the match
    positions produced by the query (default 'Query-results')
    '''
    self.query = query
    corpus = self.corpus_name
    self.cqp_query(corpus, result_subcorpus_name, query)
    qualified_name = corpus + ':' + result_subcorpus_name
    self.result_subcorpus = qualified_name
    self.SUBCORPUS_NAMES.append(qualified_name)
    self.match_count = self.cqp_subcorpus_size(qualified_name)
    print('Nr of all matches is: {}'.format(self.match_count))
|
|
||||||
|
|
||||||
def show_subcorpora(self):
    '''
    Return what ``cqp_list_subcorpora`` reports for the selected corpus.
    '''
    subcorpora = self.cqp_list_subcorpora(self.corpus_name)
    return subcorpora
|
|
||||||
|
|
||||||
def show_query_results(self,
                       context_len=10,
                       result_len=1000,
                       result_offset=0):
    '''
    Collect the matches of the last query together with their context.

    For every requested match the corpus positions (cpos) of the left
    context, the hit itself and the right context are generated. All
    distinct cpos are then resolved with one call to ``get_cpos_infos``.
    The result dictionary is stored in ``self.results`` and returned.

    Keyword arguments:
    context_len -- number of tokens before and after a match that are
    included as context (default 10)
    result_len -- upper limit for the number of matches that are
    processed (default 1000)
    result_offset -- index of the first match that is requested
    (default 0)
    '''
    started = time.time()
    self.context_len = context_len
    word_attr = self.attr_strings['positional_attrs']['word']
    self.corpus_max_len = self.cl_attribute_size(word_attr)
    self.nr_matches = min(result_len, self.match_count)

    if self.match_count == 0:
        print('Query resulted in 0 matches.')
        empty_result = {'matches': [],
                        'match_count': self.match_count,
                        'cpos_lookup': {},
                        'text_lookup': {}}
        self.results = {'code': 0, 'result': empty_result}
        return self.results

    # A match is described by the cpos of its first and of its last token,
    # e.g. [(1355, 1357), (1477, 1479)] for two matches.
    offset_start = result_offset
    if result_offset == 0:
        # Any value comparing equal to zero is sent as the integer 0.
        offset_start = 0
    print('Offset start is: {}'.format(offset_start))
    offset_end = min(self.nr_matches + result_offset - 1,
                     self.match_count - 1)
    print('Offset end is: {}'.format(offset_end))
    first_tokens = self.cqp_dump_subcorpus(self.result_subcorpus,
                                           CONST_FIELD_MATCH,
                                           offset_start,
                                           offset_end)
    last_tokens = self.cqp_dump_subcorpus(self.result_subcorpus,
                                          CONST_FIELD_MATCHEND,
                                          offset_start,
                                          offset_end)

    # Per match: cpos lists under the keys 'lc', 'hit' and 'rc'. All cpos
    # are also gathered in one list for the single lookup request below.
    all_matches = []
    all_cpos = []
    for hit_first, hit_last in zip(first_tokens, last_tokens):
        hit_stop = hit_last + 1  # exclusive upper bound of the hit
        left = list(range(max(0, hit_first - self.context_len), hit_first))
        hit = list(range(hit_first, hit_stop))
        right_stop = min(self.corpus_max_len, hit_stop + self.context_len)
        right = list(range(hit_stop, right_stop))
        all_matches.append({'lc': left, 'hit': hit, 'rc': right})
        all_cpos.extend(left + hit + right)

    all_cpos = list(set(all_cpos))  # overlapping contexts share cpos
    cpos_count = len(all_cpos)
    elapsed = time.time() - started
    print('Time to create all CPOS for query: {}'.format(elapsed))
    print('Requesting {} CPOS with one query.'.format(cpos_count))

    # Resolve every cpos to its attribute values in one go.
    lookup_started = time.time()
    all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
    lookup_elapsed = time.time() - lookup_started
    print('Got infos for {} CPOS in {} seconds:'.format(cpos_count,
                                                        lookup_elapsed))
    self.results = {'code': 0,
                    'result': {'matches': all_matches,
                               'match_count': self.match_count,
                               'cpos_lookup': all_cpos_infos,
                               'text_lookup': text_lookup}
                    }
    return self.results
|
|
||||||
|
|
||||||
def get_cpos_infos(self, all_cpos):
    '''
    Resolve corpus positions to their attribute values.

    Parameters:
    all_cpos -- list of corpus positions (cpos) to look up

    Returns a tuple ``(cpos_lookup, text_lookup)``:
    cpos_lookup -- dict mapping every cpos to a dict with one entry per
    positional attribute (its string) and one entry per structural
    attribute without values (the id of the enclosing structure)
    text_lookup -- dict mapping a structure id to the values of all
    structural attributes that do carry values. The key of each value is
    the attribute name with everything up to the first '_' removed
    (``text_title`` becomes ``title``).

    The ids used for ``text_lookup`` are taken from the first structural
    attribute without values. NOTE(review): this presumably has to be
    the text attribute; the code does not check it. If the corpus has no
    such attribute, ``text_lookup`` stays empty.
    '''
    # Positional attributes: one list of strings per attribute.
    cpos_infos = {
        p_attr_key: self.cl_cpos2str(p_attr_name, all_cpos)
        for p_attr_key, p_attr_name
        in self.attr_strings['positional_attrs'].items()
    }

    # Structural attributes. Attributes without values correspond to XML
    # tags and are resolved to structure ids right away. Attributes with
    # values correspond to XML attributes and are resolved per structure
    # id further below.
    tmp_info = {}
    structs_to_check = []
    for struct_attr_key, key in self.attr_strings['struct_attrs'].items():
        has_value = self.corpus_structural_attribute_has_values(key)
        if has_value is False:
            # The ids are only needed in this branch, so the request is
            # only sent here.
            tmp_info[struct_attr_key] = list(
                self.cl_cpos2struc(key, all_cpos)
            )
        else:
            structs_to_check.append({key: struct_attr_key})
    print('Structs to check: {}'.format(structs_to_check))
    struct_attr_values = list(tmp_info.values())
    struct_attr_keys = list(tmp_info.keys())

    # Build the text lookup. Many cpos share one id, so every id is only
    # requested once. Without any value-less structural attribute there
    # are no ids to look up.
    if struct_attr_values:
        text_lookup_ids = list(set(struct_attr_values[0]))
    else:
        text_lookup_ids = []
    text_lookup = {}  # all infos of one text, identified by its id
    for d in structs_to_check:
        s_key, s_value = zip(*d.items())
        print('dict entries: {}: {}'.format(s_key, s_value))
        s_value = s_value[0].split('_', 1)[-1]
        print('S_VALUE: {}'.format(s_value))
        struct_values = self.cl_struc2str(s_key[0], text_lookup_ids)
        print('Extracted Value with key {}: {}'.format(s_key[0],
                                                       struct_values))
        zipped = dict(zip(text_lookup_ids, struct_values))
        for zip_key, zip_value in zipped.items():
            print('Text id as key is: {}'.format(zip_key))
            print('Value of this text is: {}'.format(zip_value))
            check = text_lookup.get(zip_key)
            print('check: {}'.format(check))
            if check is None:
                text_lookup[zip_key] = {s_value: zip_value}
            else:
                text_lookup[zip_key].update({s_value: zip_value})

    # Join the per-attribute value lists into one dict per cpos.
    attr_keys_list = list(cpos_infos.keys()) + struct_attr_keys
    attr_values_list = list(cpos_infos.values()) + struct_attr_values
    dict_cpos_infos = {
        info[0]: dict(zip(attr_keys_list, info[1:]))
        for info in zip(all_cpos, *attr_values_list)
    }
    return dict_cpos_infos, text_lookup
|
|
||||||
|
|
||||||
def get_sentences(self,
                  match_cpos_list,
                  get_surrounding_s=False,
                  l_r_s_context_additional_len=1):
    '''
    Collect the sentences a match lies in, optionally with neighbours.

    The sentences containing the first and the last cpos of
    ``match_cpos_list`` are always collected. With ``get_surrounding_s``
    set, all sentences from ``l_r_s_context_additional_len`` sentences
    before the first one up to the same number after the last one are
    collected as well (limited to the existing sentence ids).

    Returns a dict with the keys ``'context_s_cpos'`` (sentence id ->
    list of cpos), ``'cpos_lookup'`` and ``'text_lookup'`` (both from
    ``get_cpos_infos``) and ``'match_cpos_list'`` (the input list).
    '''
    started = time.time()
    s_attr = self.corpus_name + '.s'

    def sentence_cpos(sentence_id):
        # cl_struc2cpos yields the first and last cpos of the sentence.
        first, last = self.cl_struc2cpos(s_attr, sentence_id)
        return list(range(first, last + 1))

    boundary_cpos = [match_cpos_list[0], match_cpos_list[-1]]
    context_sentences = {}
    s_ids = self.cl_cpos2struc(s_attr, boundary_cpos)
    print('s id match: {}'.format(s_ids))
    for s_id in s_ids:
        context_sentences[s_id] = sentence_cpos(s_id)

    if get_surrounding_s:
        max_s_id = self.cl_attribute_size(s_attr) - 1
        print('max sid: {}'.format(max_s_id))
        lowest = max(s_ids[0] - l_r_s_context_additional_len, 0)
        highest = min(s_ids[-1] + l_r_s_context_additional_len, max_s_id)
        for s_id in list(range(lowest, highest + 1)):
            print('s id additional: {}'.format(s_id))
            context_sentences[s_id] = sentence_cpos(s_id)

    all_cpos = []
    for cpos_of_sentence in context_sentences.values():
        all_cpos.extend(cpos_of_sentence)
    all_cpos = list(set(all_cpos))
    all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
    elapsed = time.time() - started
    print('Got all sentences informations in {} seconds'.format(elapsed))
    return {'context_s_cpos': context_sentences,
            'cpos_lookup': all_cpos_infos,
            'text_lookup': text_lookup,
            'match_cpos_list': match_cpos_list}
|
|
@ -67,23 +67,23 @@ def corpus_analysis_query(query):
|
|||||||
socketio.emit('corpus_analysis_query', response, room=request.sid)
|
socketio.emit('corpus_analysis_query', response, room=request.sid)
|
||||||
return
|
return
|
||||||
response = {'code': 200, 'desc': None, 'msg': 'OK',
|
response = {'code': 200, 'desc': None, 'msg': 'OK',
|
||||||
'payload': {**query_status, 'match_count': results.size}}
|
'payload': {**query_status, 'match_count': results.attrs['size']}}
|
||||||
socketio.emit('corpus_analysis_query', response, room=request.sid)
|
socketio.emit('corpus_analysis_query', response, room=request.sid)
|
||||||
chunk_size = 100
|
chunk_size = 100
|
||||||
chunk_start = 0
|
chunk_start = 0
|
||||||
context = 100
|
context = 100
|
||||||
progress = 0
|
progress = 0
|
||||||
client.status = 'running'
|
client.status = 'running'
|
||||||
while chunk_start <= results.size:
|
while chunk_start <= results.attrs['size']:
|
||||||
if client.status == 'abort':
|
if client.status == 'abort':
|
||||||
break
|
break
|
||||||
chunk = results.export(context=context, cutoff=chunk_size,
|
chunk = results.export(context=context, cutoff=chunk_size,
|
||||||
expand_lists=False, offset=chunk_start)
|
expand_lists=False, offset=chunk_start)
|
||||||
chunk['cpos_ranges'] = True
|
chunk['cpos_ranges'] = True
|
||||||
if (results.size == 0):
|
if (results.attrs['size'] == 0):
|
||||||
progress = 100
|
progress = 100
|
||||||
else:
|
else:
|
||||||
progress = ((chunk_start + chunk_size) / results.size) * 100
|
progress = ((chunk_start + chunk_size) / results.attrs['size']) * 100
|
||||||
progress = min(100, int(math.ceil(progress)))
|
progress = min(100, int(math.ceil(progress)))
|
||||||
response = {'code': 200, 'desc': None, 'msg': 'OK',
|
response = {'code': 200, 'desc': None, 'msg': 'OK',
|
||||||
'payload': {'chunk': chunk, 'progress': progress}}
|
'payload': {'chunk': chunk, 'progress': progress}}
|
||||||
@ -104,12 +104,12 @@ def corpus_analysis_inspect_match(payload):
|
|||||||
return
|
return
|
||||||
# Get more context for given match CPOS
|
# Get more context for given match CPOS
|
||||||
corpus = client.corpora.get('CORPUS')
|
corpus = client.corpora.get('CORPUS')
|
||||||
s = corpus.attributes.structural.get('s')
|
s = corpus.structural_attributes.get('s')
|
||||||
match_context = s.export(payload['first_cpos'], payload['last_cpos'],
|
match_context = s.export(payload['first_cpos'], payload['last_cpos'],
|
||||||
context=3, expand_lists=False)
|
context=3, expand_lists=False)
|
||||||
match_context['cpos_ranges'] = True
|
match_context['cpos_ranges'] = True
|
||||||
socketio.emit('match_context',
|
socketio.emit('match_context', {'payload': match_context},
|
||||||
{'payload': match_context}, room=request.sid)
|
room=request.sid)
|
||||||
|
|
||||||
|
|
||||||
def corpus_analysis_session_handler(app, corpus_id, user_id, session_id):
|
def corpus_analysis_session_handler(app, corpus_id, user_id, session_id):
|
||||||
|
@ -381,7 +381,7 @@ class ResultsList extends List {
|
|||||||
token = chunk["cpos_lookup"][cpos];
|
token = chunk["cpos_lookup"][cpos];
|
||||||
hitCellElement.insertAdjacentHTML("beforeend", `<span class="token" data-cpos="${cpos}">${token["word"]} </span>`);
|
hitCellElement.insertAdjacentHTML("beforeend", `<span class="token" data-cpos="${cpos}">${token["word"]} </span>`);
|
||||||
// get text titles of every hit cpos token
|
// get text titles of every hit cpos token
|
||||||
textTitles.add(chunk["text_lookup"][token["text"]]["text_title"]);
|
textTitles.add(chunk["text_lookup"][token["text"]]["title"]);
|
||||||
// add button to trigger more context to every match td
|
// add button to trigger more context to every match td
|
||||||
var inspectBtn = document.createElement("a");
|
var inspectBtn = document.createElement("a");
|
||||||
inspectBtn.setAttribute("class", "btn-floating btn-flat waves-effect waves-light grey right inspect disabled");
|
inspectBtn.setAttribute("class", "btn-floating btn-flat waves-effect waves-light grey right inspect disabled");
|
||||||
|
Loading…
Reference in New Issue
Block a user