Add new CQiWrapper

This commit is contained in:
Stephan Porada 2019-11-18 14:24:13 +01:00
parent baf06d3106
commit 5fdd67ebf2
2 changed files with 153 additions and 134 deletions

View File

@ -1,4 +1,4 @@
from . import CQi import CQi
import socket import socket
import struct import struct

View File

@ -1,6 +1,7 @@
from .CQiClient import CQiClient from CQiClient import CQiClient
import multiprocessing from CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
import collections import collections
from app import logger # only works if imported into opaque web app
class CQiWrapper(CQiClient): class CQiWrapper(CQiClient):
@ -11,6 +12,8 @@ class CQiWrapper(CQiClient):
for ease of use. Also structures received data into python dictionaries. for ease of use. Also structures received data into python dictionaries.
Keyword arguments: Keyword arguments:
host -- host IP address or hostname where the cqp server is running
port -- port of the cqp server
username -- username used to connect to the cqp server username -- username used to connect to the cqp server
password -- password of the user to connect to the cqp server password -- password of the user to connect to the cqp server
""" """
@ -32,12 +35,15 @@ class CQiWrapper(CQiClient):
""" """
self.ctrl_connect(self.username, self.password) self.ctrl_connect(self.username, self.password)
def create_attribute_strings(self): def __create_attribute_strings(self):
"""
Creates all needed attribute strings to query for word, lemma etc. in
the given corpus.
For example: CORPUS_NAME.word to query words
"""
p_attrs = self.corpus_positional_attributes(self.corpus_name) p_attrs = self.corpus_positional_attributes(self.corpus_name)
struct_attrs = self.corpus_structural_attributes(self.corpus_name) struct_attrs = self.corpus_structural_attributes(self.corpus_name)
self.meta_struct_element = struct_attrs[0] self.meta_struct_element = struct_attrs[0]
print(p_attrs)
print(struct_attrs)
self.attr_strings = {} self.attr_strings = {}
self.attr_strings['positional_attrs'] = {} self.attr_strings['positional_attrs'] = {}
self.attr_strings['struct_attrs'] = {} self.attr_strings['struct_attrs'] = {}
@ -49,8 +55,17 @@ class CQiWrapper(CQiClient):
self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
+ '.' + '.'
+ struct_attr) + struct_attr)
def set_corpus_name(self, corpus_name): logger.warning(('All positional and '
'structural attributes: {}').format(self.attr_strings))
def select_corpus(self, corpus_name):
if corpus_name in self.corpus_list_coprora():
self.corpus_name = corpus_name self.corpus_name = corpus_name
self.__create_attribute_strings()
logger.warning('{} does exist.'.format(corpus_name))
else:
self.disconnect()
logger.warning('{} does not exist.'.format(corpus_name))
def disconnect(self): def disconnect(self):
""" """
@ -60,8 +75,9 @@ class CQiWrapper(CQiClient):
""" """
self.ctrl_bye() self.ctrl_bye()
self.connection.close() self.connection.close()
logger.warning('Disconnected from cqp server.')
def query_subcorpus(self, result_subcorpus_name, query): def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
""" """
Create subcorpus Create subcorpus
@ -74,152 +90,155 @@ class CQiWrapper(CQiClient):
query -- query written in cqp query language query -- query written in cqp query language
""" """
self.cqp_query(self.corpus_name, result_subcorpus_name, query) self.cqp_query(self.corpus_name, result_subcorpus_name, query)
self.result_subcorpus_ns = (self.corpus_name self.result_subcorpus = (self.corpus_name
+ ':' + ':'
+ result_subcorpus_name) + result_subcorpus_name)
self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns) self.SUBCORPUS_NAMES.append(self.result_subcorpus)
self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns) self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
print('Nr of all matches is:', self.nr_matches) logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
def show_subcorpora(self): def show_subcorpora(self):
"""
Show all subcorpora currently saved by the cqp server.
"""
return self.cqp_list_subcorpora(self.corpus_name) return self.cqp_list_subcorpora(self.corpus_name)
def show_results(self, def show_query_results(self,
result_start_count=0, context_len=10,
result_max_count=50, result_len=1000):
context_len=10,):
""" """
Show query results Show query results
Shows the actual matched strings produced by the query. Uses the cpos Shows the actual matched strings produced by the query. Uses the cpos
match indexes to grab those strings. Saves them into an ordered match indexes to grab those strings. Saves them into an ordered
dictionary. Also saves corresponding tags, lemmas and context: dictionary. Also saves corresponding tags, lemmas and context. Gets this
OrderedDict([ information using the corresponding cpos.
(0,
{
'tokens': ['Big', 'Brother', 'himself'],
'lemmas': ['big', 'brother', 'himself'],
'pos_tags': ['JJ', 'NN1', 'PPX1'],
'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|',
'|Z8m|'],
'context_before': ['figures', 'of', 'the', 'Party', ',',
'almost', 'on', 'a', 'level', 'with'],
'context_after': [',', 'and', 'then', 'had', 'engaged',
'in', 'counter-revolu-', 'tionary',
'activities', ','],
'entry_title': '1984', 'entry_author':
'george_orwell',
'cpos_start': 110490,
'cpos_end': 110492
}
)
])
Keyword arguments: Keyword arguments:
result_start_count -- start position of the dumped subcorpus.
(default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
matches 50 to 100 will be shown.
result_max_count -- defines how many matches at once will be shown.
(default 50)
context_len -- defines how many words before and after a match will be context_len -- defines how many words before and after a match will be
shown (default 10) shown (default 10)
result_len -- defines how many results are actually grabbed (default 1000)
""" """
self.context_len = context_len self.context_len = context_len
self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word']) self.corpus_max_len = self.cl_attribute_size(
self.attr_strings['positional_attrs']['word']
)
self.nr_matches = min(result_len, self.nr_matches)
if self.nr_matches == 0: if self.nr_matches == 0:
print('Query resulted in 0 matches.') logger.warning('Query resulted in 0 matches.')
self.disconnect
return None
else: else:
if self.nr_matches <= 50: # Get match cpos boundaries
matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns, # match_boundaries shows the start and end cpos of one match as a
0x10, # pair of cpositions
# [(1355, 1357), (1477, 1479)] Example for two boundary pairs
match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus,
CONST_FIELD_MATCH,
0, 0,
self.nr_matches - 1) self.nr_matches - 1),
matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns, self.cqp_dump_subcorpus(self.result_subcorpus,
0x11, CONST_FIELD_MATCHEND,
0, self.nr_matches - 1) 0,
else: self.nr_matches - 1))
matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
0x10,
result_start_count,
result_max_count - 1)
matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
0x11,
result_start_count,
result_max_count - 1)
match_indexes = zip(matches_start, matches_end)
matches = [] # Generate all cpos between boundaries including start and end boundaries
manager = multiprocessing.Manager() # Save them as list into one match entry at serial number 'i'
return_dict = manager.dict() ordered_matches = collections.OrderedDict()
for i, index_pair in enumerate(match_indexes): for i, match_pair in enumerate(match_boundaries):
match = multiprocessing.Process(target=self.__get_matches, ordered_matches[i] = ({'match_cpos_list':
args=(i, list(range(match_pair[0],
index_pair, match_pair[1] + 1))})
self.corpus_name, # Saves cpos from all match entries into one list
return_dict)) all_cpos_list = []
matches.append(match) for key in ordered_matches.keys():
match.start() all_cpos_list += ordered_matches[key]['match_cpos_list']
for match in matches:
match.join()
# sort matches into ordered dict
ordered_results = collections.OrderedDict()
for key in sorted(return_dict.keys()):
ordered_results[key] = return_dict[key]
return ordered_results
def get_cpos_info(self, cpos, session): # Saves all cpos from before and after context into the list:
match_dict = {} # all_context_cpos_list
all_context_cpos_list = []
for key in ordered_matches.keys():
cpos_list = ordered_matches[key]['match_cpos_list']
before_index = max([0, cpos_list[0] - self.context_len])
after_index = min([self.corpus_max_len,
cpos_list[-1] + self.context_len])
ordered_matches[key]['context_before_cpos_list'] = list(range(before_index,
cpos_list[0]))
ordered_matches[key]['context_after_cpos_list'] = list(range(cpos_list[-1] + 1,
after_index + 1))
all_context_cpos_list += ordered_matches[key]['context_before_cpos_list']
all_context_cpos_list += ordered_matches[key]['context_after_cpos_list']
# Combines all_cpos_list with all_context_cpos_list as a sorted set
all_cpos_list += all_context_cpos_list
all_cpos_list = sorted(list(set(all_cpos_list)))
# Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
# all cpos entries in all_cpos_list
# Also saves this information into the ordered_matches dict
all_cpos_infos = self.get_cpos_infos(all_cpos_list)
for key in ordered_matches.keys():
# loops over cpos in cpos_list which holds all match cpos
# Replaces one cpos with the corresponding cpos information created
# by self.get_cpos_infos(all_cpos_list)
cpos_list = ordered_matches[key]['match_cpos_list']
infos = []
for cpos in cpos_list:
info = {cpos: all_cpos_infos.get(cpos)}
infos.append(info)
ordered_matches[key]['match_cpos_list'] = infos
try:
# loops over cpos in ordered_matches[key]['context_before_cpos_list']
# which holds all cpos of the before context
# Replaces one cpos with the corresponding cpos information created
# by self.get_cpos_infos(all_cpos_list)
before_context_infos = []
for context_before_cpos in ordered_matches[key]['context_before_cpos_list']:
before_context_info = {context_before_cpos:
all_cpos_infos.get(context_before_cpos)}
before_context_infos.append(before_context_info)
ordered_matches[key]['context_before_cpos_list'] = before_context_infos
except UnboundLocalError:
logger.warning('Context before cpos list is empty.')
try:
# loops over cpos in ordered_matches[key]['context_after_cpos_list']
# which holds all cpos of the after context
# Replaces one cpos with the corresponding cpos information created
# by self.get_cpos_infos(all_cpos_list)
after_context_infos = []
for context_after_cpos in ordered_matches[key]['context_after_cpos_list']:
after_context_info = {context_after_cpos:
all_cpos_infos.get(context_after_cpos)}
after_context_infos.append(after_context_info)
ordered_matches[key]['context_after_cpos_list'] = after_context_infos
except UnboundLocalError:
logger.warning('Context after cpos list is empty.')
return ordered_matches
def get_cpos_infos(self, all_cpos):
'''
Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
all cpos entries specified in the parameter all_cpos.
'''
cpos_infos = {}
for attr_dict in self.attr_strings: for attr_dict in self.attr_strings:
# print(self.attr_strings[attr_dict])
if attr_dict == 'positional_attrs': if attr_dict == 'positional_attrs':
for p_attr_key in self.attr_strings[attr_dict].keys(): for p_attr_key in self.attr_strings[attr_dict].keys():
# print(p_attr_key) match_str = self.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key],
match_str = session.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], range(cpos[0], cpos[1])) all_cpos)
match_dict[p_attr_key] = match_str cpos_infos[p_attr_key] = match_str
elif attr_dict == 'struct_attrs': elif attr_dict == 'struct_attrs':
for struct_attr_key in self.attr_strings[attr_dict].keys(): for struct_attr_key in self.attr_strings[attr_dict].keys():
# print(struct_attr_key) struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element],
struct_entry = session.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element], all_cpos)
range(cpos[0], cpos[1])) match_str = self.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry)
match_str = session.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry) cpos_infos[struct_attr_key] = match_str
match_dict[struct_attr_key] = set(match_str) tmp_list = []
return match_dict attr_key_list = []
for key in cpos_infos.keys():
def __get_matches(self, i, index_pair, corpus_name, return_dict): tmp_list.append(cpos_infos[key])
""" attr_key_list.append(key)
Get matches as readable output joined_cpos_infos = zip(all_cpos, *tmp_list)
dict_cpos_infos = {}
Gets the actual match strings of cpos match indexes. Private helper for info in joined_cpos_infos:
method used in show_results. dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:]))
return dict_cpos_infos
Keyword arguments:
i -- serial number for match at given cpos
index_pair -- match start and match end cpos
corpus_name -- name of the parent corpus
return_dict -- dictionary created with manager.dict() that holds the
extracted strings tags etc.
"""
# print('START:', index_pair[0])
# print('END:', index_pair[1])
# print('=============================')
index_pair = [index_pair[0], index_pair[1] + 1]
tmp_session = CQiWrapper(username=self.username, password=self.password,
host=self.host, port=self.port)
tmp_session.connect()
match = self.get_cpos_info(index_pair, tmp_session)
before_index = max([0, index_pair[0] - self.context_len])
after_index = min([self.corpus_max_len,
index_pair[1] + self.context_len])
context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
range(before_index,
index_pair[0]))
context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
range(index_pair[1] + 1,
after_index + 1))
tmp_dict = {'context_before': context_before,
'context_after': context_after,
'cpos_start': index_pair[0],
'cpos_end': index_pair[1]}
match.update(tmp_dict)
return_dict[i] = match
tmp_session.disconnect()