From 83a607728ddf5f051453f8a7a585ddecfbbd5a02 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 7 Apr 2020 16:25:44 +0200 Subject: [PATCH 1/4] cqi package update and changes to match the new functions --- app/corpora/cqi/client.py | 6 +- app/corpora/cqi/models/attributes.py | 151 +++++++++++++-------------- app/corpora/cqi/models/corpora.py | 87 +++++++++------ app/corpora/cqi/models/ressource.py | 78 ++++++++++++++ app/corpora/cqi/models/subcorpora.py | 88 +++++++++------- app/corpora/events.py | 14 +-- app/static/js/nopaque.lists.js | 2 +- 7 files changed, 265 insertions(+), 161 deletions(-) create mode 100644 app/corpora/cqi/models/ressource.py diff --git a/app/corpora/cqi/client.py b/app/corpora/cqi/client.py index 5cc867c1..1f965e15 100644 --- a/app/corpora/cqi/client.py +++ b/app/corpora/cqi/client.py @@ -32,12 +32,14 @@ class CQiClient: def connect(self, username='anonymous', password=''): status = self.api.ctrl_connect(username, password) - self.corpora = CorpusCollection(self) return status def disconnect(self): - del self.corpora return self.api.ctrl_bye() def ping(self): return self.api.ctrl_ping() + + @property + def corpora(self): + return CorpusCollection(client=self) diff --git a/app/corpora/cqi/models/attributes.py b/app/corpora/cqi/models/attributes.py index 20dd359b..ea92d5af 100644 --- a/app/corpora/cqi/models/attributes.py +++ b/app/corpora/cqi/models/attributes.py @@ -1,139 +1,119 @@ -class AttributeCollection: - def __init__(self, client, corpus): - self.client = client - self.corpus = corpus - self.alignment = AlignmentAttributeCollection(client, corpus) - self.positional = PositionalAttributeCollection(client, corpus) - self.structural = StructuralAttributeCollection(client, corpus) +from .ressource import Collection, Model -class Attribute: +class Attribute(Model): """ This is a class representing an attribute. Attributes denote the general category of information. A specific occurence is identified by an Id. - - Attributes: - client (CQiClient): A connected client pointing at the server that this - object is on. - corpus (Corpus): The corpus, this attribute belongs to. - name (str): The name of the Attribute. - size (int): The number of occurences of this attribute within the corpus. """ - def __init__(self, client, corpus, name): - self.client = client - self.corpus = corpus - self.name = name - self._name = '{}.{}'.format(corpus.name, name) - self.size = client.api.cl_attribute_size(self._name) + id_attribute = 'api_name' + + @staticmethod + def _attrs(client, corpus, name): + api_name = '{}.{}'.format(corpus.attrs['api_name'], name) + return {'api_name': api_name, + 'name': name, + 'size': client.api.cl_attribute_size(api_name)} def drop(self): - return self.client.api.cl_drop_attribute(self._name) + return self.client.api.cl_drop_attribute(self.attrs['api_name']) -class AlignmentAttributeCollection: - def __init__(self, client, corpus): - self.client = client +class AttributeCollection(Collection): + model = Attribute + + def __init__(self, client=None, corpus=None): + super(AttributeCollection, self).__init__(client=client) self.corpus = corpus - def get(self, name): - return AlignmentAttribute(self.client, self.corpus, name) + def get(self, attribute_name): + return self.prepare_model(self.model._attrs(self.client, self.corpus, + attribute_name)) def list(self): - return [AlignmentAttribute(self.client, self.corpus, attr) for attr in - self.client.api.corpus_alignment_attributes(self.corpus.name)] + raise NotImplementedError class AlignmentAttribute(Attribute): def cpos_by_ids(self, id_list): - return self.client.api.cl_alg2cpos(self._name, id_list) + return self.client.api.cl_alg2cpos(self.attrs['api_name'], id_list) def ids_by_cpos(self, cpos_list): - return self.client.api.cl_cpos2alg(self._name, cpos_list) + return self.client.api.cl_cpos2alg(self.attrs['api_name'], cpos_list) -class PositionalAttributeCollection: - def __init__(self, client, corpus): - self.client = client - self.corpus = corpus - - def get(self, name): - return PositionalAttribute(self.client, self.corpus, name) +class AlignmentAttributeCollection(AttributeCollection): + model = AlignmentAttribute def list(self): - return [PositionalAttribute(self.client, self.corpus, attr) for attr in - self.client.api.corpus_positional_attributes(self.corpus.name)] + return [self.prepare_model(self.model._attrs(self.client, self.corpus, attr)) # noqa + for attr in self.client.api.corpus_alignment_attributes(self.corpus.attrs['api_name'])] # noqa class PositionalAttribute(Attribute): - def __init__(self, client, corpus, name): - super(PositionalAttribute, self).__init__(client, corpus, name) - self.lexicon_size = client.api.cl_lexicon_size(self._name) + @staticmethod + def _attrs(client, corpus, name): + attrs = super(PositionalAttribute, PositionalAttribute)._attrs(client, corpus, name) # noqa + attrs['lexicon_size'] = client.api.cl_lexicon_size(attrs['api_name']) + return attrs def cpos_by_id(self, id): - return self.client.api.cl_id2cpos(self._name, id) + return self.client.api.cl_id2cpos(self.attrs['api_name'], id) def cpos_by_ids(self, id_list): - return self.client.api.cl_idlist2cpos(self._name, id_list) + return self.client.api.cl_idlist2cpos(self.attrs['api_name'], id_list) def freqs_by_ids(self, id_list): - return self.client.api.cl_id2freq(self._name, id_list) + return self.client.api.cl_id2freq(self.attrs['api_name'], id_list) def ids_by_cpos(self, cpos_list): - return self.client.api.cl_cpos2id(self._name, cpos_list) + return self.client.api.cl_cpos2id(self.attrs['api_name'], cpos_list) def ids_by_regex(self, regex): - return self.client.api.cl_regex2id(self._name, regex) + return self.client.api.cl_regex2id(self.attrs['api_name'], regex) def ids_by_values(self, value_list): - return self.client.api.cl_str2id(self._name, value_list) + return self.client.api.cl_str2id(self.attrs['api_name'], value_list) def values_by_cpos(self, cpos_list): - return self.client.api.cl_cpos2str(self._name, cpos_list) + return self.client.api.cl_cpos2str(self.attrs['api_name'], cpos_list) def values_by_ids(self, id_list): - return self.client.api.cl_id2str(self._name, id_list) + return self.client.api.cl_id2str(self.attrs['api_name'], id_list) -class StructuralAttributeCollection: - def __init__(self, client, corpus): - self.client = client - self.corpus = corpus +class PositionalAttributeCollection(AttributeCollection): + model = PositionalAttribute - def get(self, name): - return StructuralAttribute(self.client, self.corpus, name) - - def list(self, filters={}): - attrs = [StructuralAttribute(self.client, self.corpus, attr) for attr - in self.client.api.corpus_structural_attributes( - self.corpus.name)] - for k, v in filters.items(): - if k == 'part_of': - attrs = list(filter(lambda x: x.name.startswith(v.name + '_'), - attrs)) - return attrs + def list(self): + return [self.prepare_model(self.model._attrs(self.client, self.corpus, attr)) # noqa + for attr in self.client.api.corpus_positional_attributes(self.corpus.attrs['api_name'])] # noqa class StructuralAttribute(Attribute): - def __init__(self, client, corpus, name): - super(StructuralAttribute, self).__init__(client, corpus, name) - self.has_values = \ - client.api.corpus_structural_attribute_has_values(self._name) + @staticmethod + def _attrs(client, corpus, name): + attrs = super(StructuralAttribute, StructuralAttribute)._attrs(client, corpus, name) # noqa + attrs['has_values'] = client.api.corpus_structural_attribute_has_values(attrs['api_name']) # noqa + return attrs def cpos_by_id(self, id): - return self.client.api.cl_struc2cpos(self._name, id) + return self.client.api.cl_struc2cpos(self.attrs['api_name'], id) def ids_by_cpos(self, cpos_list): - return self.client.api.cl_cpos2struc(self._name, cpos_list) + return self.client.api.cl_cpos2struc(self.attrs['api_name'], cpos_list) def lbound_by_cpos(self, cpos_list): - return self.client.api.cl_cpos2lbound(self._name, cpos_list) + return self.client.api.cl_cpos2lbound(self.attrs['api_name'], + cpos_list) def rbound_by_cpos(self, cpos_list): - return self.client.api.cl_cpos2rbound(self._name, cpos_list) + return self.client.api.cl_cpos2rbound(self.attrs['api_name'], + cpos_list) def values_by_ids(self, id_list): - return self.client.api.cl_struc2str(self._name, id_list) + return self.client.api.cl_struc2str(self.attrs['api_name'], id_list) def export(self, first_cpos, last_cpos, context=0, expand_lists=False): first_id, last_id = self.ids_by_cpos([first_cpos, last_cpos]) @@ -150,9 +130,10 @@ class StructuralAttribute(Attribute): else: cpos_list_lbound = first_cpos rc_rbound = \ - self.cpos_by_id(min((last_id + context), (self.size - 1)))[1] + self.cpos_by_id(min((last_id + context), (self.attrs['size'] - 1)))[1] if rc_rbound != last_cpos: - rc_lbound = min((last_cpos + 1), (self.corpus.size - 1)) + rc_lbound = min((last_cpos + 1), + (self.collection.corpus.attrs['size'] - 1)) rc = (rc_lbound, rc_rbound) cpos_list_rbound = rc_rbound else: @@ -164,5 +145,17 @@ class StructuralAttribute(Attribute): 'rc': list(range(rc[0], (rc[1] + 1)))} else: match = {'lc': lc, 'c': c, 'rc': rc} - lookups = self.corpus.lookups_by_cpos(cpos_list) + lookups = self.collection.corpus.lookups_by_cpos(cpos_list) return {'match': match, **lookups} + + +class StructuralAttributeCollection(AttributeCollection): + model = StructuralAttribute + + def list(self, filters={}): + attrs = [self.prepare_model(self.model._attrs(self.client, self.corpus, attr)) # noqa + for attr in self.client.api.corpus_structural_attributes(self.corpus.attrs['api_name'])] # noqa + for k, v in filters.items(): + if k == 'part_of': + attrs = list(filter(lambda x: x.attrs['name'].startswith(v.attrs['name'] + '_'), attrs)) # noqa + return attrs diff --git a/app/corpora/cqi/models/corpora.py b/app/corpora/cqi/models/corpora.py index e81056ab..a12d9b4f 100644 --- a/app/corpora/cqi/models/corpora.py +++ b/app/corpora/cqi/models/corpora.py @@ -1,30 +1,24 @@ -from .attributes import AttributeCollection +from .attributes import (AlignmentAttributeCollection, + PositionalAttributeCollection, + StructuralAttributeCollection) +from .ressource import Collection, Model from .subcorpora import SubcorpusCollection -class CorpusCollection: - def __init__(self, client): - self.client = client +class Corpus(Model): + id_attribute = 'api_name' - def get(self, name): - return Corpus(self.client, name) - - def list(self): - return [Corpus(self.client, corpus) for corpus in - self.client.api.corpus_list_coprora()] - - -class Corpus: - def __init__(self, client, name): - self.client = client - self.name = name - self.size = client.api.cl_attribute_size('{}.word'.format(name)) - # self.info = client.api.corpus_info(name) - self.charset = client.api.corpus_charset(name) - # self.full_name = client.api.corpus_full_name(name) - self.properties = client.api.corpus_properties(name) - self.attributes = AttributeCollection(client, self) - self.subcorpora = SubcorpusCollection(client, self) + @staticmethod + def _attrs(client, name): + api_name = name + return {'api_name': api_name, + 'name': name, + 'size': client.api.cl_attribute_size( + '{}.word'.format(api_name)), + # 'info': client.api.corpus_info(name), + 'charset': client.api.corpus_charset(api_name), + # 'full_name' = client.api.corpus_full_name(name), + 'properties': client.api.corpus_properties(api_name)} def lookups_by_cpos(self, cpos_list): cpos_list = list(set(cpos_list)) @@ -33,35 +27,64 @@ class Corpus: lookups['cpos_lookup'] = {} for cpos in cpos_list: lookups['cpos_lookup'][cpos] = {} - for attr in self.attributes.positional.list(): + for attr in self.positional_attributes.list(): cpos_attr_values = attr.values_by_cpos(cpos_list) for i, cpos in enumerate(cpos_list): - lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_values[i] - for attr in self.attributes.structural.list(): - if attr.has_values: + lookups['cpos_lookup'][cpos][attr.attrs['name']] = cpos_attr_values[i] + for attr in self.structural_attributes.list(): + if attr.attrs['has_values']: continue cpos_attr_ids = attr.ids_by_cpos(cpos_list) for i, cpos in enumerate(cpos_list): if cpos_attr_ids[i] != -1: - lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_ids[i] + lookups['cpos_lookup'][cpos][attr.attrs['name']] = cpos_attr_ids[i] occured_attr_ids = list(filter(lambda x: x != -1, set(cpos_attr_ids))) if not occured_attr_ids: continue subattrs = \ - self.attributes.structural.list(filters={'part_of': attr}) + self.structural_attributes.list(filters={'part_of': attr}) if not subattrs: continue - lookup_name = '{}_lookup'.format(attr.name) + lookup_name = '{}_lookup'.format(attr.attrs['name']) lookups[lookup_name] = {} for attr_id in occured_attr_ids: lookups[lookup_name][attr_id] = {} for subattr in subattrs: subattr_values = subattr.values_by_ids(occured_attr_ids) for i, subattr_value in enumerate(subattr_values): - lookups[lookup_name][occured_attr_ids[i]][subattr.name] = \ + subattr_name = subattr.attrs['name'][(len(attr.attrs['name']) + 1):] + lookups[lookup_name][occured_attr_ids[i]][subattr_name] = \ subattr_value return lookups def query(self, query, subcorpus_name='Results'): - return self.client.api.cqp_query(self.name, subcorpus_name, query) + return self.client.api.cqp_query(self.attrs['api_name'], + subcorpus_name, query) + + @property + def alignment_attributes(self): + return AlignmentAttributeCollection(client=self.client, corpus=self) + + @property + def positional_attributes(self): + return PositionalAttributeCollection(client=self.client, corpus=self) + + @property + def structural_attributes(self): + return StructuralAttributeCollection(client=self.client, corpus=self) + + @property + def subcorpora(self): + return SubcorpusCollection(client=self.client, corpus=self) + + +class CorpusCollection(Collection): + model = Corpus + + def get(self, corpus_name): + return self.prepare_model(self.model._attrs(self.client, corpus_name)) + + def list(self): + return [self.prepare_model(self.model._attrs(self.client, corpus)) + for corpus in self.client.api.corpus_list_coprora()] diff --git a/app/corpora/cqi/models/ressource.py b/app/corpora/cqi/models/ressource.py new file mode 100644 index 00000000..449bc93b --- /dev/null +++ b/app/corpora/cqi/models/ressource.py @@ -0,0 +1,78 @@ +class Model: + """ + A base class for representing a single object on the server. + """ + id_attribute = 'Id' + + def __init__(self, attrs=None, client=None, collection=None): + #: A client pointing at the server that this object is on. + self.client = client + + #: The collection that this model is part of. + self.collection = collection + + #: The raw representation of this object from the API + self.attrs = attrs or {} + + @staticmethod + def _attrs(client, key): + raise NotImplementedError + + def __repr__(self): + return "<{}: {}>".format(self.__class__.__name__, self.id) + + def __eq__(self, other): + return isinstance(other, self.__class__) and self.id == other.id + + def __hash__(self): + return hash("{}:{}".format(self.__class__.__name__, self.id)) + + @property + def id(self): + """ + The ID of the object. + """ + return self.attrs.get(self.id_attribute) + + def reload(self): + """ + Load this object from the server again and update ``attrs`` with the + new data. + """ + new_model = self.collection.get(self.id) + self.attrs = new_model.attrs + + +class Collection: + """ + A base class for representing all objects of a particular type on the + server. + """ + + #: The type of object this collection represents, set by subclasses + model = None + + def __init__(self, client=None): + #: The client pointing at the server that this collection of objects + #: is on. + self.client = client + + def list(self): + raise NotImplementedError + + def get(self, key): + raise NotImplementedError + + def prepare_model(self, attrs): + """ + Create a model from a set of attributes. + """ + if isinstance(attrs, Model): + attrs.client = self.client + attrs.collection = self + return attrs + elif isinstance(attrs, dict): + return self.model(attrs=attrs, client=self.client, collection=self) + else: + raise Exception("Can't create {} from {}".format( + self.model.__name__, attrs)) diff --git a/app/corpora/cqi/models/subcorpora.py b/app/corpora/cqi/models/subcorpora.py index 0e774db0..38d25435 100644 --- a/app/corpora/cqi/models/subcorpora.py +++ b/app/corpora/cqi/models/subcorpora.py @@ -1,54 +1,45 @@ +from .ressource import Collection, Model from ..api.specification import (CONST_FIELD_KEYWORD, CONST_FIELD_MATCH, CONST_FIELD_MATCHEND, CONST_FIELD_TARGET) -class SubcorpusCollection: - def __init__(self, client, corpus): - self.client = client - self.corpus = corpus +class Subcorpus(Model): + id_attribute = 'api_name' - def get(self, name): - return Subcorpus(self.client, self.corpus, name) - - def list(self): - return [Subcorpus(self.client, self.corpus, subcorpus) for subcorpus in - self.client.api.cqp_list_subcorpora(self.corpus.name)] - - -class Subcorpus: - def __init__(self, client, corpus, name): - self.client = client - self.corpus = corpus - self.name = name - self._name = '{}:{}'.format(corpus.name, name) - self.fields = {} - if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_MATCH): - self.fields['match'] = CONST_FIELD_MATCH - if client.api.cqp_subcorpus_has_field(self._name, + @staticmethod + def _attrs(client, corpus, name): + api_name = '{}:{}'.format(corpus.attrs['api_name'], name) + fields = {} + if client.api.cqp_subcorpus_has_field(api_name, CONST_FIELD_MATCH): + fields['match'] = CONST_FIELD_MATCH + if client.api.cqp_subcorpus_has_field(api_name, CONST_FIELD_MATCHEND): - self.fields['matchend'] = CONST_FIELD_MATCHEND - if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_TARGET): - self.fields['target'] = CONST_FIELD_TARGET - if client.api.cqp_subcorpus_has_field(self._name, CONST_FIELD_KEYWORD): - self.fields['keyword'] = CONST_FIELD_KEYWORD - self.size = client.api.cqp_subcorpus_size(self._name) + fields['matchend'] = CONST_FIELD_MATCHEND + if client.api.cqp_subcorpus_has_field(api_name, CONST_FIELD_TARGET): + fields['target'] = CONST_FIELD_TARGET + if client.api.cqp_subcorpus_has_field(api_name, CONST_FIELD_KEYWORD): + fields['keyword'] = CONST_FIELD_KEYWORD + return {'api_name': api_name, + 'name': name, + 'fields': fields, + 'size': client.api.cqp_subcorpus_size(api_name)} def drop(self): - return self.client.api.cqp_drop_subcorpus(self._name) + return self.client.api.cqp_drop_subcorpus(self.attrs['api_name']) def dump(self, field, first, last): - return self.client.api.cqp_dump_subcorpus(self._name, field, first, - last) + return self.client.api.cqp_dump_subcorpus(self.attrs['api_name'], + field, first, last) def export(self, context=25, cutoff=float('inf'), expand_lists=False, offset=0): - if self.size == 0: + if self.attrs['size'] == 0: return {"matches": []} first_match = max(0, offset) - last_match = min((offset + cutoff - 1), (self.size - 1)) + last_match = min((offset + cutoff - 1), (self.attrs['size'] - 1)) match_boundaries = \ - zip(self.dump(self.fields['match'], first_match, last_match), - self.dump(self.fields['matchend'], first_match, last_match)) + zip(self.dump(self.attrs['fields']['match'], first_match, last_match), # noqa + self.dump(self.attrs['fields']['matchend'], first_match, last_match)) # noqa cpos_list = [] matches = [] for match_start, match_end in match_boundaries: @@ -64,10 +55,11 @@ class Subcorpus: cpos_list_lbound = lc_lbound else: cpos_list_lbound = match_start - rc_lbound = min((match_end + 1), (self.corpus.size - 1)) + rc_lbound = min((match_end + 1), + (self.collection.corpus.attrs['size'] - 1)) if rc_lbound != match_end: rc_rbound = min((match_end + 1 + context), - (self.corpus.size - 1)) + (self.collection.corpus.attrs['size'] - 1)) rc = (rc_lbound, rc_rbound) cpos_list_rbound = rc_rbound else: @@ -81,14 +73,30 @@ class Subcorpus: else: match = {'lc': lc, 'c': c, 'rc': rc} matches.append(match) - lookups = self.corpus.lookups_by_cpos(cpos_list) + lookups = self.collection.corpus.lookups_by_cpos(cpos_list) return {'matches': matches, **lookups} def fdist_1(self, cutoff, field, attribute): - return self.client.api.cqp_fdist_1(self._name, cutoff, + return self.client.api.cqp_fdist_1(self.attrs['api_name'], cutoff, field, attribute._name) def fdist_2(self, cutoff, field_1, attribute_1, field_2, attribute_2): - return self.client.api.cqp_fdist_2(self._name, cutoff, + return self.client.api.cqp_fdist_2(self.attrs['api_name'], cutoff, field_1, attribute_1._name, field_2, attribute_2._name) + + +class SubcorpusCollection(Collection): + model = Subcorpus + + def __init__(self, client=None, corpus=None): + super(SubcorpusCollection, self).__init__(client=client) + self.corpus = corpus + + def get(self, subcorpus_name): + return self.prepare_model(self.model._attrs(self.client, self.corpus, + subcorpus_name)) + + def list(self): + return [self.prepare_model(self.model._attrs(self.client, self.corpus, subcorpus)) # noqa + for subcorpus in self.client.api.cqp_list_subcorpora(self.corpus.attrs['api_name'])] # noqa diff --git a/app/corpora/events.py b/app/corpora/events.py index 73d189f9..cf9c791c 100644 --- a/app/corpora/events.py +++ b/app/corpora/events.py @@ -67,23 +67,23 @@ def corpus_analysis_query(query): socketio.emit('corpus_analysis_query', response, room=request.sid) return response = {'code': 200, 'desc': None, 'msg': 'OK', - 'payload': {**query_status, 'match_count': results.size}} + 'payload': {**query_status, 'match_count': results.attrs['size']}} socketio.emit('corpus_analysis_query', response, room=request.sid) chunk_size = 100 chunk_start = 0 context = 100 progress = 0 client.status = 'running' - while chunk_start <= results.size: + while chunk_start <= results.attrs['size']: if client.status == 'abort': break chunk = results.export(context=context, cutoff=chunk_size, expand_lists=False, offset=chunk_start) chunk['cpos_ranges'] = True - if (results.size == 0): + if (results.attrs['size'] == 0): progress = 100 else: - progress = ((chunk_start + chunk_size) / results.size) * 100 + progress = ((chunk_start + chunk_size) / results.attrs['size']) * 100 progress = min(100, int(math.ceil(progress))) response = {'code': 200, 'desc': None, 'msg': 'OK', 'payload': {'chunk': chunk, 'progress': progress}} @@ -104,12 +104,12 @@ def corpus_analysis_inspect_match(payload): return # Get more context for given match CPOS corpus = client.corpora.get('CORPUS') - s = corpus.attributes.structural.get('s') + s = corpus.structural_attributes.get('s') match_context = s.export(payload['first_cpos'], payload['last_cpos'], context=3, expand_lists=False) match_context['cpos_ranges'] = True - socketio.emit('match_context', - {'payload': match_context}, room=request.sid) + socketio.emit('match_context', {'payload': match_context}, + room=request.sid) def corpus_analysis_session_handler(app, corpus_id, user_id, session_id): diff --git a/app/static/js/nopaque.lists.js b/app/static/js/nopaque.lists.js index 49772278..b5fcb473 100644 --- a/app/static/js/nopaque.lists.js +++ b/app/static/js/nopaque.lists.js @@ -381,7 +381,7 @@ class ResultsList extends List { token = chunk["cpos_lookup"][cpos]; hitCellElement.insertAdjacentHTML("beforeend", `${token["word"]} `); // get text titles of every hit cpos token - textTitles.add(chunk["text_lookup"][token["text"]]["text_title"]); + textTitles.add(chunk["text_lookup"][token["text"]]["title"]); // add button to trigger more context to every match td var inspectBtn = document.createElement("a"); inspectBtn.setAttribute("class", "btn-floating btn-flat waves-effect waves-light grey right inspect disabled"); From a1cdfd498ad9fdafe815455e1bde83bf99260a97 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 7 Apr 2020 16:27:28 +0200 Subject: [PATCH 2/4] Remove wrapper --- app/corpora/cqi/wrapper.py | 321 ------------------------------------- 1 file changed, 321 deletions(-) delete mode 100644 app/corpora/cqi/wrapper.py diff --git a/app/corpora/cqi/wrapper.py b/app/corpora/cqi/wrapper.py deleted file mode 100644 index f6c58395..00000000 --- a/app/corpora/cqi/wrapper.py +++ /dev/null @@ -1,321 +0,0 @@ -from .api import APIClient -from .api.specification import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND -import time - - -class CQiWrapper(APIClient): - ''' - CQIiWrapper object - - High level wrapper that groups and renames some functions of CQiClient - for ease of use. Also structures recieved data into python dictionaries. - - Keyword arguments: - host -- host IP adress or hostname wher the cqp server is running - port -- port of the cqp server - username -- username used to connect to the cqp server - password -- password of the user to connect to the cqp server - ''' - - SUBCORPUS_NAMES = [] - - def __init__(self, host='127.0.0.1', port=4877, username='anonymous', - password=''): - super(CQiWrapper, self).__init__(host, port=port) - self.username = username - self.password = password - - def connect(self): - ''' - Connect with CQP server - - Connects via socket to the CQP server using the given username and - password from class initiation. - ''' - self.ctrl_connect(self.username, self.password) - - def __create_attribute_strings(self): - ''' - Creates all needed attribute strings to query for word, lemma etc. in - the given corpus. - For example: CORPUS_NAME.word to query words - Automaticalle creates strings for all pre defined tags. - ''' - p_attrs = self.corpus_positional_attributes(self.corpus_name) - struct_attrs = self.corpus_structural_attributes(self.corpus_name) - self.attr_strings = {} - self.attr_strings['positional_attrs'] = {} - self.attr_strings['struct_attrs'] = {} - for p_attr in p_attrs: - self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name - + '.' - + p_attr) - for struct_attr in struct_attrs: - self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name - + '.' - + struct_attr) - print(('All positional and ' - 'structural attributes: {}').format(self.attr_strings)) - - def select_corpus(self, corpus_name): - ''' - Checks if given copus name exists. If it exists set it as the main - corpus name used to create the needed query attribute strings like - CORPUS_NAME.word. - ''' - if corpus_name in self.corpus_list_coprora(): - self.corpus_name = corpus_name - self.__create_attribute_strings() - print('{} does exist.'.format(corpus_name)) - else: - print('{} does not exist.'.format(corpus_name)) - raise Exception('Given Corpus Name is not in corpora list.') - - def disconnect(self): - ''' - Disconnect from CQP server - - Disconnects from the CQP server. Closes used socket after disconnect. - ''' - self.ctrl_bye() - print('Disconnected from cqp server.') - - def query_subcorpus(self, query, result_subcorpus_name='Query-results'): - ''' - Create subcorpus - - Input query will be used to create a subcorpus holding all cpos match - positions for that query. - - Keyword arguments: - result_subcorpus_name -- set name of the subcorpus which holds all - cpos match positions, produced by the query - query -- query written in cqp query language - ''' - self.query = query - self.cqp_query(self.corpus_name, result_subcorpus_name, query) - self.result_subcorpus = (self.corpus_name - + ':' - + result_subcorpus_name) - self.SUBCORPUS_NAMES.append(self.result_subcorpus) - self.match_count = self.cqp_subcorpus_size(self.result_subcorpus) - print('Nr of all matches is: {}'.format(self.match_count)) - - def show_subcorpora(self): - ''' - Show all subcorpora currently saved by the cqp server. - ''' - return self.cqp_list_subcorpora(self.corpus_name) - - def show_query_results(self, - context_len=10, - result_len=1000, - result_offset=0): - ''' - Show query results - - Shows the actual matched strings produce by the query. Uses the cpos - match indexes to grab those strings. saves them into an orderd - dictionary. Also saves coresponding tags, lemmas and context. Gets those - informations using the corresponding cpos. - - Keyword arguments: - context_len -- defines how many words before and after a match will be - shown (default 10) - result_len -- defines for how many matches all informations like lemma - and POS are being grabbed - result_offset -- defines the offset of the matches being requested. If - the offset is 100 informations for matches 100 to result_len are being - grabbed - ''' - t0 = time.time() - self.context_len = context_len - self.corpus_max_len = self.cl_attribute_size( - self.attr_strings['positional_attrs']['word'] - ) - self.nr_matches = min(result_len, self.match_count) - if self.match_count == 0: - print('Query resulted in 0 matches.') - self.results = {'code': 0, - 'result': {'matches': [], - 'match_count': self.match_count, - 'cpos_lookup': {}, - 'text_lookup': {}} - } - return self.results - else: - # Get match cpos boundries - # match_boundries shows the start and end cpos of one match as a - # pair of cpositions - # [(1355, 1357), (1477, 1479)] Example for two boundry pairs - offset_start = 0 if result_offset == 0 else result_offset - print('Offset start is: {}'.format(offset_start)) - offset_end = min((self.nr_matches + result_offset - 1), self.match_count - 1) - print('Offset end is: {}'.format(offset_end)) - match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus, - CONST_FIELD_MATCH, - offset_start, - offset_end), - self.cqp_dump_subcorpus(self.result_subcorpus, - CONST_FIELD_MATCHEND, - offset_start, - offset_end)) - - # Generate all cpos between match boundries including start and end - # boundries. - # Also generate cpos for left and right context. - # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc' - # Also collect all cpos together in one list for the final request of - # all cpos informations - all_matches = [] - all_cpos = [] - for start, end in match_boundaries: - end += 1 - lc_cpos = list(range(max([0, start - self.context_len]), start)) - lc = {'lc': lc_cpos} - match_cpos = list(range(start, end)) - match = {'hit': match_cpos} - rc_cpos = list(range(end, min([self.corpus_max_len, - end + self.context_len]))) - rc = {'rc': rc_cpos} - lc.update(match) - lc.update(rc) - all_cpos.extend(lc_cpos + match_cpos + rc_cpos) - all_matches.append(lc) - - all_cpos = list(set(all_cpos)) # get rid of cpos duplicates - len_all_cpos = len(all_cpos) - t1 = time.time() - t_total = t1 - t0 - print('Time to create all CPOS for query: {}'.format(t_total)) - print('Requesting {} CPOS with one query.'.format(len_all_cpos)) - - # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for - # all cpos entries in all_cpos_list - # Also saves these informations into self.results dict - t2 = time.time() - all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos) - t3 = time.time() - t_final = t3 - t2 - print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos, - t_final)) - self.results = {'code': 0, - 'result': {'matches': all_matches, - 'match_count': self.match_count, - 'cpos_lookup': all_cpos_infos, - 'text_lookup': text_lookup} - } - return self.results - - def get_cpos_infos(self, all_cpos): - ''' - Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for - all cpos entries specified in the parameter all_cpos. - ''' - # Get all positional attribute informations - cpos_infos = {} - for p_attr_key in self.attr_strings['positional_attrs'].keys(): - match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos) - cpos_infos[p_attr_key] = match_strs - - # Get all strucutural attribute informations - tmp_info = {} - structs_to_check = [] - for struct_attr_key in self.attr_strings['struct_attrs'].keys(): - key = self.attr_strings['struct_attrs'][struct_attr_key] - has_value = self.corpus_structural_attribute_has_values(key) - struct_ids = self.cl_cpos2struc(key, all_cpos) - if has_value is False: # Get IDs of strucutural elements without values (this means get IDs of XML tags. Struct elements only have values if they are XML attributes) - tmp_info[struct_attr_key] = [] - for id in struct_ids: - tmp_info[struct_attr_key].append(id) - else: - structs_to_check.append({key: struct_attr_key}) - print('Structs to check: {}'.format(structs_to_check)) - struct_attr_values = list(tmp_info.values()) - # print('Struct attr value list: {}'.format(struct_attr_values)) - struct_attr_keys = list(tmp_info.keys()) - # print('Struct attr key list: {}'.format(struct_attr_keys)) - - # Build textlookup dictionary - text_lookup_ids = list(set(struct_attr_values[0])) # every CPOS is associated with one text id. A set is build to only gather text_lookup informations for every unique text id - text_lookup = {} # final dict containing all info of one text identified by its id - for d in structs_to_check: - s_key, s_value = zip(*d.items()) - print('dict entries: {}: {}'.format(s_key, s_value)) - s_value = s_value[0].split('_', 1)[-1] - print('S_VALUE: {}'.format(s_value)) - struct_values = self.cl_struc2str(s_key[0], text_lookup_ids) - print('Extracted Value with key {}: {}'.format(s_key[0], struct_values)) - zipped = dict(zip(text_lookup_ids, struct_values)) - for zip_key, zip_value in zipped.items(): - print('Text id as key is: {}'.format(zip_key)) - print('Value of this text is: {}'.format(zip_value)) - check = text_lookup.get(zip_key) - print('check: {}'.format(check)) - if check is None: - text_lookup[zip_key] = {s_value: zip_value} - else: - text_lookup[zip_key].update({s_value: zip_value}) - - # zip keys and values together - attr_values_list = [] - attr_keys_list = [] - for key in cpos_infos.keys(): - attr_values_list.append(cpos_infos[key]) - attr_keys_list.append(key) - attr_keys_list.extend(struct_attr_keys) - attr_values_list.extend(struct_attr_values) - joined_cpos_infos = zip(all_cpos, *attr_values_list) - dict_cpos_infos = {} - for info in joined_cpos_infos: - dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:])) - return dict_cpos_infos, text_lookup - - def get_sentences(self, - match_cpos_list, - get_surrounding_s=False, - l_r_s_context_additional_len=1): - ''' - Get sentence informations for one match also set if and how much left - right context sentences should be grabbed surrounding the given CPOS. - ''' - t0 = time.time() - key = self.corpus_name + '.s' - first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1] - context_sentences = {} - s_ids = self.cl_cpos2struc(key, [first_cpos, last_cpos]) - print('s id match: {}'.format(s_ids)) - for s_id in s_ids: - s_start, s_end = self.cl_struc2cpos(key, s_id) - s_cpos = list(range(s_start, s_end + 1)) - context_sentences[s_id] = s_cpos - if get_surrounding_s: - max_s_id = self.cl_attribute_size(key) - 1 - print('max sid: {}'.format(max_s_id)) - additional_s_ids = [] - additional_s = list(range(max(s_ids[0] - - l_r_s_context_additional_len, - 0), - min(s_ids[-1] - + l_r_s_context_additional_len, - max_s_id) + 1)) - additional_s_ids.extend(additional_s) - for s_id in additional_s_ids: - print('s id additional: {}'.format(s_id)) - s_start, s_end = self.cl_struc2cpos(key, s_id) - s_cpos = list(range(s_start, s_end + 1)) - context_sentences[s_id] = s_cpos - all_cpos = [] - for key in context_sentences.keys(): - all_cpos.extend(context_sentences[key]) - all_cpos = list(set(all_cpos)) - all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos) - t1 = time.time() - t_total = t1 - t0 - print('Got all sentences informations in {} seconds'. format(t_total)) - match_context = {'context_s_cpos': context_sentences, - 'cpos_lookup': all_cpos_infos, - 'text_lookup': text_lookup, - 'match_cpos_list': match_cpos_list} - return match_context From 08dba25de3451e4f2646fb986880aa875f401fdf Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 7 Apr 2020 16:30:27 +0200 Subject: [PATCH 3/4] fix --- app/corpora/cqi/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/app/corpora/cqi/__init__.py b/app/corpora/cqi/__init__.py index e543ac6e..4174f55d 100644 --- a/app/corpora/cqi/__init__.py +++ b/app/corpora/cqi/__init__.py @@ -1,7 +1,6 @@ # flake8: noqa from .api import APIClient from .client import CQiClient -from .wrapper import CQiWrapper from .version import version, version_info From ca833c966a1c1c2d8623abbd6d9811bcb7942de7 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 7 Apr 2020 16:36:27 +0200 Subject: [PATCH 4/4] fixes and doc --- app/corpora/cqi/api/client.py | 3 ++- app/corpora/cqi/client.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/app/corpora/cqi/api/client.py b/app/corpora/cqi/api/client.py index 9c257203..bc62e65f 100644 --- a/app/corpora/cqi/api/client.py +++ b/app/corpora/cqi/api/client.py @@ -20,10 +20,11 @@ class APIClient: >>> client.ctrl_bye() {'code': 259, 'msg': 'CQI_STATUS_BYE_OK'} - Args: + Attributes: host (str): URL to the CQP server. For example, ``cqpserver.localhost`` or ``127.0.0.1``. port (int): Port the CQP server listens on. Default: ``4877`` + socket (socket.socket): Socket for communicating with a CQP server. """ def __init__(self, host, port=4877): diff --git a/app/corpora/cqi/client.py b/app/corpora/cqi/client.py index 1f965e15..0fba29eb 100644 --- a/app/corpora/cqi/client.py +++ b/app/corpora/cqi/client.py @@ -15,8 +15,9 @@ class CQiClient: {'code': 260, 'msg': 'CQI_STATUS_PING_OK'} >>> client.disconnect() {'code': 259, 'msg': 'CQI_STATUS_BYE_OK'} + Attributes: - api (APIClient): A client pointing to the specified to the CQP server. + api (APIClient): An API client pointing to the specified CQP server. """ def __init__(self, host, port=4877):