Merge branch 'development' of gitlab.ub.uni-bielefeld.de:sfb1288inf/opaque into development

This commit is contained in:
Patrick Jentsch 2019-11-11 15:58:00 +01:00
commit 5602ba950f
2 changed files with 97 additions and 75 deletions

View File

@ -1,7 +1,6 @@
from .CQiClient import CQiClient from .CQiClient import CQiClient
import multiprocessing import multiprocessing
import collections import collections
import socket
class CQiWrapper(CQiClient): class CQiWrapper(CQiClient):
@ -33,21 +32,41 @@ class CQiWrapper(CQiClient):
""" """
self.ctrl_connect(self.username, self.password) self.ctrl_connect(self.username, self.password)
def create_attribute_strings(self, corpus_name): def create_attribute_strings(self):
self.word_str = corpus_name + '.word' p_attrs = self.corpus_positional_attributes(self.corpus_name)
self.lemma_str = corpus_name + '.lemma' struct_attrs = self.corpus_structural_attributes(self.corpus_name)
self.pos_str = corpus_name + '.pos' self.meta_struct_element = struct_attrs[0]
self.sem_str = corpus_name + '.sem' print(p_attrs)
self.entry_str = corpus_name + '.entry' print(struct_attrs)
self.entry_author_str = self.entry_str + '_author' self.attr_strings = {}
self.entry_title_str = self.entry_str + '_title' self.attr_strings['positional_attrs'] = {}
self.attributes = [self.word_str, self.attr_strings['struct_attrs'] = {}
self.lemma_str, for p_attr in p_attrs:
self.pos_str, self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name
self.sem_str, + '.'
self.entry_str, + p_attr)
self.entry_author_str, for struct_attr in struct_attrs[:-1]:
self.entry_title_str] self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
+ '.'
+ struct_attr)
# self.word_str = corpus_name + '.word'
# self.lemma_str = corpus_name + '.lemma'
# self.pos_str = corpus_name + '.pos'
# self.sem_str = corpus_name + '.sem'
# self.entry_str = corpus_name + '.entry'
# self.entry_author_str = self.entry_str + '_author'
# self.entry_title_str = self.entry_str + '_title'
# self.attributes = [self.word_str,
# self.lemma_str,
# self.pos_str,
# self.sem_str,
# self.entry_str,
# self.entry_author_str,
# self.entry_title_str]
# print(self.attributes)
def set_corpus_name(self, corpus_name):
self.corpus_name = corpus_name
def disconnect(self): def disconnect(self):
""" """
@ -58,7 +77,7 @@ class CQiWrapper(CQiClient):
self.ctrl_bye() self.ctrl_bye()
self.connection.close() self.connection.close()
def query_subcorpus(self, corpus_name, result_subcorpus_name, query): def query_subcorpus(self, result_subcorpus_name, query):
""" """
Create subcorpus Create subcorpus
@ -66,13 +85,12 @@ class CQiWrapper(CQiClient):
positions for that query. positions for that query.
Keyword arguments: Keyword arguments:
corpus_name -- name of the corpus the query will be used on
result_subcorpus_name -- user set name of the subcorpus which holds all result_subcorpus_name -- user set name of the subcorpus which holds all
cpos match positions, produced by the query cpos match positions, produced by the query
query -- query written in cqp query language query -- query written in cqp query language
""" """
self.cqp_query(corpus_name, result_subcorpus_name, query) self.cqp_query(self.corpus_name, result_subcorpus_name, query)
self.result_subcorpus_ns = (corpus_name self.result_subcorpus_ns = (self.corpus_name
+ ':' + ':'
+ result_subcorpus_name) + result_subcorpus_name)
self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns) self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
@ -80,11 +98,9 @@ class CQiWrapper(CQiClient):
print('Nr of all matches is:', self.nr_matches) print('Nr of all matches is:', self.nr_matches)
def show_subcorpora(self): def show_subcorpora(self):
print('Known subcorpora:', self.SUBCORPUS_NAMES) return self.cqp_list_subcorpora(self.corpus_name)
return self.SUBCORPUS_NAMES
def show_results(self, def show_results(self,
corpus_name,
result_start_count=0, result_start_count=0,
result_max_count=50, result_max_count=50,
context_len=10,): context_len=10,):
@ -116,7 +132,6 @@ class CQiWrapper(CQiClient):
]) ])
Keyword arguments: Keyword arguments:
corpus_name -- name of the parent corpus the subcorpus is part of
result_start_count -- start position of the dumped subcorpus. result_start_count -- start position of the dumped subcorpus.
(default 0) If it is 0 matches 0 to 50 will be shown. If it is 50 (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
matches 50 to 100 will be shown. matches 50 to 100 will be shown.
@ -126,8 +141,7 @@ class CQiWrapper(CQiClient):
shown (default 10) shown (default 10)
""" """
self.context_len = context_len self.context_len = context_len
word_str = corpus_name + '.word' self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word'])
self.corpus_max_len = self.cl_attribute_size(word_str)
if self.nr_matches == 0: if self.nr_matches == 0:
print('Query resulted in 0 matches.') print('Query resulted in 0 matches.')
else: else:
@ -157,7 +171,7 @@ class CQiWrapper(CQiClient):
match = multiprocessing.Process(target=self.__get_matches, match = multiprocessing.Process(target=self.__get_matches,
args=(i, args=(i,
index_pair, index_pair,
corpus_name, self.corpus_name,
return_dict)) return_dict))
matches.append(match) matches.append(match)
match.start() match.start()
@ -167,7 +181,25 @@ class CQiWrapper(CQiClient):
ordered_results = collections.OrderedDict() ordered_results = collections.OrderedDict()
for key in sorted(return_dict.keys()): for key in sorted(return_dict.keys()):
ordered_results[key] = return_dict[key] ordered_results[key] = return_dict[key]
print('ORDERED_RESULTS', ordered_results) return ordered_results
def get_cpos_info(self, cpos, session):
match_dict = {}
for attr_dict in self.attr_strings:
# print(self.attr_strings[attr_dict])
if attr_dict == 'positional_attrs':
for p_attr_key in self.attr_strings[attr_dict].keys():
# print(p_attr_key)
match_str = session.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], range(cpos[0], cpos[1]))
match_dict[p_attr_key] = match_str
elif attr_dict == 'struct_attrs':
for struct_attr_key in self.attr_strings[attr_dict].keys():
# print(struct_attr_key)
struct_entry = session.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element],
range(cpos[0], cpos[1]))
match_str = session.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry)
match_dict[struct_attr_key] = set(match_str)
return match_dict
def __get_matches(self, i, index_pair, corpus_name, return_dict): def __get_matches(self, i, index_pair, corpus_name, return_dict):
""" """
@ -183,58 +215,46 @@ class CQiWrapper(CQiClient):
return_dict -- dictionary created with manager.dict() that holds the return_dict -- dictionary created with manager.dict() that holds the
extracted strings tags etc. extracted strings tags etc.
""" """
print('START:', index_pair[0]) # print('START:', index_pair[0])
print('END:', index_pair[1]) # print('END:', index_pair[1])
print('=============================') # print('=============================')
index_pair = [index_pair[0], index_pair[1] + 1]
tmp_session = CQiWrapper(username=self.username, password=self.password, tmp_session = CQiWrapper(username=self.username, password=self.password,
host=self.host, port=self.port) host=self.host, port=self.port)
tmp_session.connect() tmp_session.connect()
tokens = tmp_session.cl_cpos2str(self.word_str, match = self.get_cpos_info(index_pair, tmp_session)
range(index_pair[0], # tokens = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
index_pair[1] + 1)) # range(index_pair[0],
lemmas = tmp_session.cl_cpos2str(self.lemma_str, # index_pair[1] + 1))
range(index_pair[0], # lemmas = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['lemma'],
index_pair[1] + 1)) # range(index_pair[0],
pos_tags = tmp_session.cl_cpos2str(self.pos_str, # index_pair[1] + 1))
range(index_pair[0], # pos_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['pos'],
index_pair[1] + 1)) # range(index_pair[0],
sem_tags = tmp_session.cl_cpos2str(self.sem_str, # index_pair[1] + 1))
range(index_pair[0], # sem_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['sem'],
index_pair[1] + 1)) # range(index_pair[0],
struc_entry = tmp_session.cl_cpos2struc(self.entry_str, # index_pair[1] + 1))
range(index_pair[0], # struc_entry = tmp_session.cl_cpos2struc(self.attr_strings['struct_attrs']['entry'],
index_pair[1] + 1)) # range(index_pair[0],
# index_pair[1] + 1))
before_index = max([0, index_pair[0] - self.context_len]) before_index = max([0, index_pair[0] - self.context_len])
after_index = min([self.corpus_max_len, after_index = min([self.corpus_max_len,
index_pair[1] + self.context_len]) index_pair[1] + self.context_len])
context_before = tmp_session.cl_cpos2str(self.word_str, context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
range(before_index, range(before_index,
index_pair[0])) index_pair[0]))
context_after = tmp_session.cl_cpos2str(self.word_str, context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
range(index_pair[1] + 1, range(index_pair[1] + 1,
after_index + 1)) after_index + 1))
entry_titles = tmp_session.cl_struc2str(self.entry_title_str, # entry_titles = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_title'],
struc_entry) # struc_entry)
entry_authors = tmp_session.cl_struc2str(self.entry_author_str, # entry_authors = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_author'],
struc_entry) # struc_entry)
return_dict[i] = {'tokens': tokens, tmp_dict = {'context_before': context_before,
'lemmas': lemmas,
'pos_tags': pos_tags,
'sem_tags': sem_tags,
'context_before': context_before,
'context_after': context_after, 'context_after': context_after,
'entry_title': entry_titles[0],
'entry_author': entry_authors[0],
'cpos_start': index_pair[0], 'cpos_start': index_pair[0],
'cpos_end': index_pair[1]} 'cpos_end': index_pair[1]}
match.update(tmp_dict)
return_dict[i] = match
tmp_session.disconnect() tmp_session.disconnect()
def get_cpos_info(self, cpos):
match_dict = collections.OrderedDict()
for attribute in self.attributes:
if '.entry' not in attribute:
match_str = self.cl_cpos2str(attribute, range(cpos[0], cpos[1]))
match_dict[attribute] = match_str
else:
continue
print(match_dict)

View File

@ -38,9 +38,11 @@ def recv_query(message):
corpus_name = 'CORPUS' corpus_name = 'CORPUS'
result_subcorpus_name = 'Query-results' # should be set by the user somehow result_subcorpus_name = 'Query-results' # should be set by the user somehow
query = message['query'] query = message['query']
analysis_client.create_attribute_strings(corpus_name) analysis_client.set_corpus_name(corpus_name)
analysis_client.query_subcorpus(corpus_name, result_subcorpus_name, query) analysis_client.create_attribute_strings()
analysis_client.show_results(corpus_name) analysis_client.query_subcorpus(result_subcorpus_name, query)
results = analysis_client.show_results()
logger.warning('Query results: {}'.format(str(results)))
def observe_corpus_analysis_connection(app, corpus_id, session_id): def observe_corpus_analysis_connection(app, corpus_id, session_id):