nopaque/app/corpora/CQiWrapper/CQiWrapper.py
2020-03-10 16:06:47 +01:00

324 lines
15 KiB
Python

from .CQiClient import CQiClient
from .CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
import time
from app import logger # only works if imported into opaque web app
class CQiWrapper(CQiClient):
    '''
    CQiWrapper object

    High level wrapper that groups and renames some functions of CQiClient
    for ease of use. Also structures received data into python dictionaries.

    Keyword arguments:
    host -- host IP address or hostname where the cqp server is running
    port -- port of the cqp server
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    '''

    # Kept so that code reading CQiWrapper.SUBCORPUS_NAMES keeps working.
    # Instances shadow it with their own list in __init__, otherwise every
    # wrapper would append to (and grow) this single shared list.
    SUBCORPUS_NAMES = []

    def __init__(self, host='127.0.0.1', port=4877, username='anonymous',
                 password=''):
        super(CQiWrapper, self).__init__(host=host, port=port)
        self.username = username
        self.password = password
        # Per-instance list of the subcorpora created by query_subcorpus().
        self.SUBCORPUS_NAMES = []

    def connect(self):
        '''
        Connect with CQP server

        Connects to the CQP server using the username and password given at
        class initiation.
        '''
        self.ctrl_connect(self.username, self.password)

    def __create_attribute_strings(self):
        '''
        Creates all needed attribute strings to query for word, lemma etc. in
        the selected corpus.

        For example: CORPUS_NAME.word to query words
        The strings are stored in self.attr_strings, grouped under the keys
        'positional_attrs' and 'struct_attrs' and keyed by attribute name.
        '''
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
        self.attr_strings = {
            'positional_attrs': {
                p_attr: self.corpus_name + '.' + p_attr
                for p_attr in p_attrs
            },
            'struct_attrs': {
                struct_attr: self.corpus_name + '.' + struct_attr
                for struct_attr in struct_attrs
            },
        }
        logger.warning(('All positional and '
                        'structural attributes: {}').format(self.attr_strings))

    def select_corpus(self, corpus_name):
        '''
        Checks if the given corpus name exists. If it exists it is set as the
        main corpus name used to create the needed query attribute strings
        like CORPUS_NAME.word.

        Raises:
        Exception -- if corpus_name is not in the list of corpora
        '''
        if corpus_name not in self.corpus_list_coprora():
            logger.warning('{} does not exist.'.format(corpus_name))
            raise Exception('Given Corpus Name is not in corpora list.')
        self.corpus_name = corpus_name
        self.__create_attribute_strings()
        logger.warning('{} does exist.'.format(corpus_name))

    def disconnect(self):
        '''
        Disconnect from CQP server

        Disconnects from the CQP server. The connection is closed even if
        saying goodbye to the server fails, so it is never leaked.
        '''
        try:
            self.ctrl_bye()
        finally:
            self.connection.close()
        logger.warning('Disconnected from cqp server.')

    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
        '''
        Create subcorpus

        Input query will be used to create a subcorpus holding all cpos match
        positions for that query. The number of matches is stored in
        self.match_count.

        Keyword arguments:
        query -- query written in cqp query language
        result_subcorpus_name -- set name of the subcorpus which holds all
        cpos match positions, produced by the query
        '''
        self.query = query
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
        self.result_subcorpus = (self.corpus_name
                                 + ':'
                                 + result_subcorpus_name)
        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
        self.match_count = self.cqp_subcorpus_size(self.result_subcorpus)
        logger.warning('Nr of all matches is: {}'.format(self.match_count))

    def show_subcorpora(self):
        '''
        Show all subcorpora of the selected corpus as reported by the cqp
        server.
        '''
        return self.cqp_list_subcorpora(self.corpus_name)

    def __build_results(self, matches, cpos_lookup, text_lookup):
        '''
        Builds the result dictionary returned by show_query_results.
        '''
        return {'code': 0,
                'result': {'matches': matches,
                           'match_count': self.match_count,
                           'cpos_lookup': cpos_lookup,
                           'text_lookup': text_lookup}
                }

    def show_query_results(self,
                           context_len=10,
                           result_len=1000,
                           result_offset=0):
        '''
        Show query results

        Shows the actual matched strings produced by the query. Uses the cpos
        match indexes to grab those strings and saves them into a dictionary.
        Also saves corresponding tags, lemmas and context. Gets those
        informations using the corresponding cpos.

        Keyword arguments:
        context_len -- defines how many words before and after a match will be
        shown (default 10)
        result_len -- defines for how many matches all informations like lemma
        and POS are being grabbed (default 1000)
        result_offset -- defines the index of the first match being requested.
        If the offset is 100, informations for the matches 100 to
        100 + result_len - 1 are being grabbed (default 0)

        Returns a dictionary (also stored in self.results) of the form
        {'code': 0, 'result': {'matches': [...], 'match_count': int,
        'cpos_lookup': {...}, 'text_lookup': {...}}}. Every entry of 'matches'
        holds the cpos lists of the left context ('lc'), the match itself
        ('hit') and the right context ('rc'). If the requested range contains
        no match, 'matches' and both lookups are empty.
        '''
        t0 = time.time()
        self.context_len = context_len
        self.corpus_max_len = self.cl_attribute_size(
            self.attr_strings['positional_attrs']['word']
        )
        self.nr_matches = min(result_len, self.match_count)
        if self.match_count == 0:
            logger.warning('Query resulted in 0 matches.')
            self.results = self.__build_results([], {}, {})
            return self.results
        # Get match cpos boundaries
        # match_boundaries holds the start and end cpos of one match as a
        # pair of cpositions
        # [(1355, 1357), (1477, 1479)] Example for two boundary pairs
        offset_start = result_offset
        logger.warning('Offset start is: {}'.format(offset_start))
        offset_end = min(self.nr_matches + result_offset - 1,
                         self.match_count - 1)
        logger.warning('Offset end is: {}'.format(offset_end))
        if offset_end < offset_start:
            # The offset lies behind the last match (or result_len is 0). Do
            # not send an inverted range to the server, return an empty page.
            logger.warning('Requested offset {} is out of range for {} '
                           'matches.'.format(result_offset, self.match_count))
            self.results = self.__build_results([], {}, {})
            return self.results
        match_starts = self.cqp_dump_subcorpus(self.result_subcorpus,
                                               CONST_FIELD_MATCH,
                                               offset_start,
                                               offset_end)
        match_ends = self.cqp_dump_subcorpus(self.result_subcorpus,
                                             CONST_FIELD_MATCHEND,
                                             offset_start,
                                             offset_end)
        match_boundaries = zip(match_starts, match_ends)
        # Generate all cpos between match boundaries including start and end
        # boundaries.
        # Also generate cpos for left and right context.
        # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
        # Also collect all cpos together in one list for the final request of
        # all cpos informations
        all_matches = []
        all_cpos = []
        for start, end in match_boundaries:
            end += 1  # make the end boundary exclusive for range()
            lc_cpos = list(range(max(0, start - self.context_len), start))
            match_cpos = list(range(start, end))
            rc_cpos = list(range(end, min(self.corpus_max_len,
                                          end + self.context_len)))
            all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
            all_matches.append({'lc': lc_cpos,
                                'hit': match_cpos,
                                'rc': rc_cpos})
        all_cpos = list(set(all_cpos))  # get rid of cpos duplicates
        len_all_cpos = len(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
        logger.warning('Time to create all CPOS for query: {}'.format(t_total))
        logger.warning('Requesting {} CPOS with one query.'.format(len_all_cpos))
        # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
        # all cpos entries in all_cpos
        # Also saves these informations into self.results dict
        t2 = time.time()
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t3 = time.time()
        t_final = t3 - t2
        logger.warning('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
                                                                     t_final))
        self.results = self.__build_results(all_matches,
                                            all_cpos_infos,
                                            text_lookup)
        return self.results

    def get_cpos_infos(self, all_cpos):
        '''
        Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
        all cpos entries specified in the parameter all_cpos.

        Returns a tuple (dict_cpos_infos, text_lookup):
        dict_cpos_infos -- maps every cpos to a dictionary holding its
        positional attribute strings and the ids of the structural attributes
        without values
        text_lookup -- maps an id to a dictionary holding the strings of the
        structural attributes with values. Empty if there is no structural
        attribute without values to take the ids from.
        '''
        # Get all positional attribute informations
        cpos_infos = {}
        for p_attr_key, p_attr in self.attr_strings['positional_attrs'].items():
            cpos_infos[p_attr_key] = self.cl_cpos2str(p_attr, all_cpos)
        # Get all structural attribute informations
        tmp_info = {}
        structs_to_check = []
        for struct_attr_key, key in self.attr_strings['struct_attrs'].items():
            has_value = self.corpus_structural_attribute_has_values(key)
            if has_value is False:
                # Get IDs of structural elements without values (this means
                # get IDs of XML tags. Struct elements only have values if
                # they are XML attributes)
                tmp_info[struct_attr_key] = list(self.cl_cpos2struc(key,
                                                                    all_cpos))
            else:
                structs_to_check.append({key: struct_attr_key})
        logger.warning('Structs to check: {}'.format(structs_to_check))
        struct_attr_values = list(tmp_info.values())
        struct_attr_keys = list(tmp_info.keys())
        # Build textlookup dictionary
        # Every CPOS is associated with one text id. A set is built to only
        # gather text_lookup informations for every unique text id.
        # NOTE(review): this takes the ids of the first structural attribute
        # without values and presumes it is the text element -- confirm
        # against the attribute order of the corpus.
        if struct_attr_values:
            text_lookup_ids = list(set(struct_attr_values[0]))
        else:
            text_lookup_ids = []
        # final dict containing all info of one text identified by its id
        text_lookup = {}
        # Without ids there is nothing to look up, so skip the requests.
        for d in (structs_to_check if text_lookup_ids else []):
            s_key, s_value = zip(*d.items())
            logger.warning('dict entries: {}: {}'.format(s_key, s_value))
            # Drop everything up to the first underscore of the attribute
            # name, e.g. 'text_title' becomes 'title'
            s_value = s_value[0].split('_', 1)[-1]
            logger.warning('S_VALUE: {}'.format(s_value))
            struct_values = self.cl_struc2str(s_key[0], text_lookup_ids)
            logger.warning('Extracted Value with key {}: {}'.format(s_key[0], struct_values))
            zipped = dict(zip(text_lookup_ids, struct_values))
            for zip_key, zip_value in zipped.items():
                logger.warning('Text id as key is: {}'.format(zip_key))
                logger.warning('Value of this text is: {}'.format(zip_value))
                check = text_lookup.get(zip_key)
                logger.warning('check: {}'.format(check))
                if check is None:
                    text_lookup[zip_key] = {s_value: zip_value}
                else:
                    check[s_value] = zip_value
        # zip keys and values together
        attr_keys_list = list(cpos_infos.keys()) + struct_attr_keys
        attr_values_list = list(cpos_infos.values()) + struct_attr_values
        dict_cpos_infos = {
            info[0]: dict(zip(attr_keys_list, info[1:]))
            for info in zip(all_cpos, *attr_values_list)
        }
        return dict_cpos_infos, text_lookup

    def get_sentences(self,
                      match_cpos_list,
                      get_surrounding_s=False,
                      l_r_s_context_additional_len=1):
        '''
        Get sentence informations for one match. Also set if and how many
        left and right context sentences should be grabbed surrounding the
        given CPOS.

        Keyword arguments:
        match_cpos_list -- non-empty list of the cpos of one match. Only the
        first and the last entry are used to find the sentences.
        get_surrounding_s -- if True also the sentences around the match are
        grabbed (default False)
        l_r_s_context_additional_len -- number of sentences grabbed before
        the first and after the last sentence of the match (default 1)

        Returns a dictionary with the keys 'context_s_cpos' (sentence id to
        list of cpos), 'cpos_lookup', 'text_lookup' and 'match_cpos_list'.

        Raises:
        IndexError -- if match_cpos_list is empty
        '''
        t0 = time.time()
        key = self.corpus_name + '.s'
        first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
        context_sentences = {}
        s_ids = self.cl_cpos2struc(key, [first_cpos, last_cpos])
        logger.warning('s id match: {}'.format(s_ids))
        for s_id in s_ids:
            if s_id in context_sentences:
                # first and last cpos are part of the same sentence
                continue
            s_start, s_end = self.cl_struc2cpos(key, s_id)
            context_sentences[s_id] = list(range(s_start, s_end + 1))
        if get_surrounding_s:
            max_s_id = self.cl_attribute_size(key) - 1
            logger.warning('max sid: {}'.format(max_s_id))
            # Clamp the requested sentence ids to the ids that exist.
            first_s_id = max(s_ids[0] - l_r_s_context_additional_len, 0)
            last_s_id = min(s_ids[-1] + l_r_s_context_additional_len,
                            max_s_id)
            for s_id in range(first_s_id, last_s_id + 1):
                logger.warning('s id additional: {}'.format(s_id))
                if s_id in context_sentences:
                    # Boundaries of the match sentences are already known.
                    continue
                s_start, s_end = self.cl_struc2cpos(key, s_id)
                context_sentences[s_id] = list(range(s_start, s_end + 1))
        all_cpos = []
        for s_cpos in context_sentences.values():
            all_cpos.extend(s_cpos)
        all_cpos = list(set(all_cpos))  # get rid of cpos duplicates
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
        logger.warning('Got all sentences informations in {} seconds'. format(t_total))
        match_context = {'context_s_cpos': context_sentences,
                         'cpos_lookup': all_cpos_infos,
                         'text_lookup': text_lookup,
                         'match_cpos_list': match_cpos_list}
        return match_context