mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-11-15 01:05:42 +00:00
Add new CQiWrapper
This commit is contained in:
parent
0392d25464
commit
ffed8592c8
@ -1,7 +1,6 @@
|
|||||||
from .CQiClient import CQiClient
|
from .CQiClient import CQiClient
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import collections
|
import collections
|
||||||
import socket
|
|
||||||
|
|
||||||
|
|
||||||
class CQiWrapper(CQiClient):
|
class CQiWrapper(CQiClient):
|
||||||
@ -33,21 +32,41 @@ class CQiWrapper(CQiClient):
|
|||||||
"""
|
"""
|
||||||
self.ctrl_connect(self.username, self.password)
|
self.ctrl_connect(self.username, self.password)
|
||||||
|
|
||||||
def create_attribute_strings(self, corpus_name=None):
    """
    Build the fully qualified attribute names of the current corpus.

    Requests the positional and structural attributes of self.corpus_name
    and stores them, prefixed with '<corpus_name>.', in self.attr_strings:

        {'positional_attrs': {<attr>: '<corpus_name>.<attr>', ...},
         'struct_attrs': {<attr>: '<corpus_name>.<attr>', ...}}

    The first structural attribute that is reported is kept in
    self.meta_struct_element (None if there is no structural attribute).

    Keyword arguments:
    corpus_name -- optional name of the corpus. If given it is stored in
    self.corpus_name before the attributes are requested, so the call with
    and the call without a corpus name both work. (default None)
    """
    if corpus_name is not None:
        self.corpus_name = corpus_name
    p_attrs = self.corpus_positional_attributes(self.corpus_name)
    struct_attrs = self.corpus_structural_attributes(self.corpus_name)
    # A corpus without structural attributes must not fail with IndexError.
    self.meta_struct_element = struct_attrs[0] if struct_attrs else None
    prefix = self.corpus_name + '.'
    self.attr_strings = {
        'positional_attrs': {attr: prefix + attr for attr in p_attrs},
        # NOTE(review): the [:-1] slice leaves out the last structural
        # attribute; the reason is not visible here -- confirm.
        'struct_attrs': {attr: prefix + attr for attr in struct_attrs[:-1]},
    }
|
||||||
|
|
||||||
|
def set_corpus_name(self, corpus_name):
    """
    Store the corpus name on the instance.

    Keyword arguments:
    corpus_name -- name of the corpus, kept as self.corpus_name
    """
    self.corpus_name = corpus_name
|
||||||
|
|
||||||
def disconnect(self):
|
def disconnect(self):
|
||||||
"""
|
"""
|
||||||
@ -58,7 +77,7 @@ class CQiWrapper(CQiClient):
|
|||||||
self.ctrl_bye()
|
self.ctrl_bye()
|
||||||
self.connection.close()
|
self.connection.close()
|
||||||
|
|
||||||
def query_subcorpus(self, corpus_name, result_subcorpus_name, query):
|
def query_subcorpus(self, result_subcorpus_name, query):
|
||||||
"""
|
"""
|
||||||
Create subcorpus
|
Create subcorpus
|
||||||
|
|
||||||
@ -66,13 +85,12 @@ class CQiWrapper(CQiClient):
|
|||||||
positions for that query.
|
positions for that query.
|
||||||
|
|
||||||
Keyword arguments:
|
Keyword arguments:
|
||||||
corpus_name -- name of the corpus the query will be used on
|
|
||||||
result_subcorpus_name -- user set name of the subcorpus which holds all
|
result_subcorpus_name -- user set name of the subcorpus which holds all
|
||||||
cpos match positions, produced by the query
|
cpos match positions, produced by the query
|
||||||
query -- query written in cqp query language
|
query -- query written in cqp query language
|
||||||
"""
|
"""
|
||||||
self.cqp_query(corpus_name, result_subcorpus_name, query)
|
self.cqp_query(self.corpus_name, result_subcorpus_name, query)
|
||||||
self.result_subcorpus_ns = (corpus_name
|
self.result_subcorpus_ns = (self.corpus_name
|
||||||
+ ':'
|
+ ':'
|
||||||
+ result_subcorpus_name)
|
+ result_subcorpus_name)
|
||||||
self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
|
self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
|
||||||
@ -80,11 +98,9 @@ class CQiWrapper(CQiClient):
|
|||||||
print('Nr of all matches is:', self.nr_matches)
|
print('Nr of all matches is:', self.nr_matches)
|
||||||
|
|
||||||
def show_subcorpora(self):
    """
    Return the subcorpora listed for the current corpus.

    The value is passed through unchanged from
    cqp_list_subcorpora(self.corpus_name).
    """
    subcorpora = self.cqp_list_subcorpora(self.corpus_name)
    return subcorpora
|
|
||||||
|
|
||||||
def show_results(self,
|
def show_results(self,
|
||||||
corpus_name,
|
|
||||||
result_start_count=0,
|
result_start_count=0,
|
||||||
result_max_count=50,
|
result_max_count=50,
|
||||||
context_len=10,):
|
context_len=10,):
|
||||||
@ -116,7 +132,6 @@ class CQiWrapper(CQiClient):
|
|||||||
])
|
])
|
||||||
|
|
||||||
Keyword arguments:
|
Keyword arguments:
|
||||||
corpus_name -- name of the parent corpus the subcorpus is part of
|
|
||||||
result_start_count -- start position of the dumped subcorpus.
|
result_start_count -- start position of the dumped subcorpus.
|
||||||
(default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
|
(default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
|
||||||
matches 50 to 100 will be shown.
|
matches 50 to 100 will be shown.
|
||||||
@ -126,8 +141,7 @@ class CQiWrapper(CQiClient):
|
|||||||
shown (default 10)
|
shown (default 10)
|
||||||
"""
|
"""
|
||||||
self.context_len = context_len
|
self.context_len = context_len
|
||||||
word_str = corpus_name + '.word'
|
self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word'])
|
||||||
self.corpus_max_len = self.cl_attribute_size(word_str)
|
|
||||||
if self.nr_matches == 0:
|
if self.nr_matches == 0:
|
||||||
print('Query resulted in 0 matches.')
|
print('Query resulted in 0 matches.')
|
||||||
else:
|
else:
|
||||||
@ -157,7 +171,7 @@ class CQiWrapper(CQiClient):
|
|||||||
match = multiprocessing.Process(target=self.__get_matches,
|
match = multiprocessing.Process(target=self.__get_matches,
|
||||||
args=(i,
|
args=(i,
|
||||||
index_pair,
|
index_pair,
|
||||||
corpus_name,
|
self.corpus_name,
|
||||||
return_dict))
|
return_dict))
|
||||||
matches.append(match)
|
matches.append(match)
|
||||||
match.start()
|
match.start()
|
||||||
@ -167,7 +181,25 @@ class CQiWrapper(CQiClient):
|
|||||||
ordered_results = collections.OrderedDict()
|
ordered_results = collections.OrderedDict()
|
||||||
for key in sorted(return_dict.keys()):
|
for key in sorted(return_dict.keys()):
|
||||||
ordered_results[key] = return_dict[key]
|
ordered_results[key] = return_dict[key]
|
||||||
print('ORDERED_RESULTS', ordered_results)
|
return ordered_results
|
||||||
|
|
||||||
|
def get_cpos_info(self, cpos, session):
    """
    Collect the values of all known attributes for a span of corpus positions.

    Every positional attribute in self.attr_strings is resolved with
    cl_cpos2str. Every structural attribute is resolved with cl_struc2str,
    using the structure ids that cl_cpos2struc returns for
    self.meta_struct_element.

    Keyword arguments:
    cpos -- pair (start, end); range(start, end) is looked up
    session -- client object all requests are sent through (instead of self)

    Returns a dict that maps each attribute name to the list returned by
    cl_cpos2str (positional attributes) or to the set of values returned by
    cl_struc2str (structural attributes).
    """
    positions = range(cpos[0], cpos[1])
    p_attrs = self.attr_strings.get('positional_attrs', {})
    struct_attrs = self.attr_strings.get('struct_attrs', {})
    match_dict = {}
    for name, qualified_name in p_attrs.items():
        match_dict[name] = session.cl_cpos2str(qualified_name, positions)
    if struct_attrs:
        # The structure ids only depend on the span, so request them once
        # instead of once per structural attribute.
        struct_ids = session.cl_cpos2struc(
            struct_attrs[self.meta_struct_element], positions)
        for name, qualified_name in struct_attrs.items():
            values = session.cl_struc2str(qualified_name, struct_ids)
            # Duplicates are collapsed: only the distinct values are kept.
            match_dict[name] = set(values)
    return match_dict
|
||||||
|
|
||||||
def __get_matches(self, i, index_pair, corpus_name, return_dict):
|
def __get_matches(self, i, index_pair, corpus_name, return_dict):
|
||||||
"""
|
"""
|
||||||
@ -183,58 +215,46 @@ class CQiWrapper(CQiClient):
|
|||||||
return_dict -- dictionary created with manager.dict() that holds the
|
return_dict -- dictionary created with manager.dict() that holds the
|
||||||
extracted strings tags etc.
|
extracted strings tags etc.
|
||||||
"""
|
"""
|
||||||
print('START:', index_pair[0])
|
# print('START:', index_pair[0])
|
||||||
print('END:', index_pair[1])
|
# print('END:', index_pair[1])
|
||||||
print('=============================')
|
# print('=============================')
|
||||||
|
index_pair = [index_pair[0], index_pair[1] + 1]
|
||||||
tmp_session = CQiWrapper(username=self.username, password=self.password,
|
tmp_session = CQiWrapper(username=self.username, password=self.password,
|
||||||
host=self.host, port=self.port)
|
host=self.host, port=self.port)
|
||||||
tmp_session.connect()
|
tmp_session.connect()
|
||||||
tokens = tmp_session.cl_cpos2str(self.word_str,
|
match = self.get_cpos_info(index_pair, tmp_session)
|
||||||
range(index_pair[0],
|
# tokens = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
|
||||||
index_pair[1] + 1))
|
# range(index_pair[0],
|
||||||
lemmas = tmp_session.cl_cpos2str(self.lemma_str,
|
# index_pair[1] + 1))
|
||||||
range(index_pair[0],
|
# lemmas = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['lemma'],
|
||||||
index_pair[1] + 1))
|
# range(index_pair[0],
|
||||||
pos_tags = tmp_session.cl_cpos2str(self.pos_str,
|
# index_pair[1] + 1))
|
||||||
range(index_pair[0],
|
# pos_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['pos'],
|
||||||
index_pair[1] + 1))
|
# range(index_pair[0],
|
||||||
sem_tags = tmp_session.cl_cpos2str(self.sem_str,
|
# index_pair[1] + 1))
|
||||||
range(index_pair[0],
|
# sem_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['sem'],
|
||||||
index_pair[1] + 1))
|
# range(index_pair[0],
|
||||||
struc_entry = tmp_session.cl_cpos2struc(self.entry_str,
|
# index_pair[1] + 1))
|
||||||
range(index_pair[0],
|
# struc_entry = tmp_session.cl_cpos2struc(self.attr_strings['struct_attrs']['entry'],
|
||||||
index_pair[1] + 1))
|
# range(index_pair[0],
|
||||||
|
# index_pair[1] + 1))
|
||||||
before_index = max([0, index_pair[0] - self.context_len])
|
before_index = max([0, index_pair[0] - self.context_len])
|
||||||
after_index = min([self.corpus_max_len,
|
after_index = min([self.corpus_max_len,
|
||||||
index_pair[1] + self.context_len])
|
index_pair[1] + self.context_len])
|
||||||
context_before = tmp_session.cl_cpos2str(self.word_str,
|
context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
|
||||||
range(before_index,
|
range(before_index,
|
||||||
index_pair[0]))
|
index_pair[0]))
|
||||||
context_after = tmp_session.cl_cpos2str(self.word_str,
|
context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
|
||||||
range(index_pair[1] + 1,
|
range(index_pair[1] + 1,
|
||||||
after_index + 1))
|
after_index + 1))
|
||||||
entry_titles = tmp_session.cl_struc2str(self.entry_title_str,
|
# entry_titles = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_title'],
|
||||||
struc_entry)
|
# struc_entry)
|
||||||
entry_authors = tmp_session.cl_struc2str(self.entry_author_str,
|
# entry_authors = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_author'],
|
||||||
struc_entry)
|
# struc_entry)
|
||||||
return_dict[i] = {'tokens': tokens,
|
tmp_dict = {'context_before': context_before,
|
||||||
'lemmas': lemmas,
|
'context_after': context_after,
|
||||||
'pos_tags': pos_tags,
|
'cpos_start': index_pair[0],
|
||||||
'sem_tags': sem_tags,
|
'cpos_end': index_pair[1]}
|
||||||
'context_before': context_before,
|
match.update(tmp_dict)
|
||||||
'context_after': context_after,
|
return_dict[i] = match
|
||||||
'entry_title': entry_titles[0],
|
|
||||||
'entry_author': entry_authors[0],
|
|
||||||
'cpos_start': index_pair[0],
|
|
||||||
'cpos_end': index_pair[1]}
|
|
||||||
tmp_session.disconnect()
|
tmp_session.disconnect()
|
||||||
|
|
||||||
def get_cpos_info(self, cpos):
    """
    Print the values of the non-entry attributes for a span of positions.

    Every name in self.attributes that does not contain '.entry' is resolved
    with cl_cpos2str for range(cpos[0], cpos[1]); the collected mapping is
    printed, nothing is returned.

    Keyword arguments:
    cpos -- pair (start, end) of corpus positions
    """
    match_dict = collections.OrderedDict(
        (attribute, self.cl_cpos2str(attribute, range(cpos[0], cpos[1])))
        for attribute in self.attributes
        if '.entry' not in attribute
    )
    print(match_dict)
|
|
||||||
|
@ -38,9 +38,11 @@ def recv_query(message):
|
|||||||
corpus_name = 'CORPUS'
|
corpus_name = 'CORPUS'
|
||||||
result_subcorpus_name = 'Query-results' # should be set by the user somehow
|
result_subcorpus_name = 'Query-results' # should be set by the user somehow
|
||||||
query = message['query']
|
query = message['query']
|
||||||
analysis_client.create_attribute_strings(corpus_name)
|
analysis_client.set_corpus_name(corpus_name)
|
||||||
analysis_client.query_subcorpus(corpus_name, result_subcorpus_name, query)
|
analysis_client.create_attribute_strings()
|
||||||
analysis_client.show_results(corpus_name)
|
analysis_client.query_subcorpus(result_subcorpus_name, query)
|
||||||
|
results = analysis_client.show_results()
|
||||||
|
logger.warning('Query results: {}'.format(str(results)))
|
||||||
|
|
||||||
|
|
||||||
def observe_corpus_analysis_connection(app, corpus_id, session_id):
|
def observe_corpus_analysis_connection(app, corpus_id, session_id):
|
||||||
|
Loading…
Reference in New Issue
Block a user