nopaque/app/corpora/CQiWrapper/CQiWrapper.py

321 lines
14 KiB
Python
Raw Normal View History

2019-11-27 10:18:15 +01:00
from .CQiClient import CQiClient
from .CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
2019-11-28 14:14:56 +01:00
import time
2019-11-27 10:18:15 +01:00
from app import logger # only works if imported into opaque web app
2019-11-07 15:48:47 +01:00
class CQiWrapper(CQiClient):
    '''
    CQiWrapper object

    High level wrapper that groups and renames some functions of CQiClient
    for ease of use. Also structures received data into python dictionaries.

    Keyword arguments:
    host -- host IP address or hostname where the cqp server is running
    port -- port of the cqp server
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    '''

    # Class level default only. Every instance replaces it with its own list
    # in __init__, otherwise all wrappers would append to one shared list.
    SUBCORPUS_NAMES = []

    def __init__(self, host='127.0.0.1', port=4877, username='opaque',
                 password='opaque'):
        super(CQiWrapper, self).__init__(host=host, port=port)
        self.username = username
        self.password = password
        # Per instance list of the subcorpora created by query_subcorpus().
        self.SUBCORPUS_NAMES = []

    def connect(self):
        '''
        Connect with CQP server

        Logs in via ctrl_connect using the username and password given at
        class initiation.
        '''
        self.ctrl_connect(self.username, self.password)

    def __create_attribute_strings(self):
        '''
        Creates all needed attribute strings to query for word, lemma etc. in
        the selected corpus.
        For example: CORPUS_NAME.word to query words

        The strings are stored in self.attr_strings, grouped under the keys
        'positional_attrs' and 'struct_attrs' and keyed by the attribute
        names reported for the corpus.
        '''
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
        prefix = self.corpus_name + '.'
        self.attr_strings = {
            'positional_attrs': {attr: prefix + attr for attr in p_attrs},
            'struct_attrs': {attr: prefix + attr for attr in struct_attrs},
        }
        logger.warning(('All positional and '
                        'structural attributes: {}').format(self.attr_strings))

    def select_corpus(self, corpus_name):
        '''
        Checks if the given corpus name exists. If it exists it is set as the
        main corpus name used to create the needed query attribute strings
        like CORPUS_NAME.word.

        Raises an Exception if the corpus name is not in the corpora list.
        '''
        if corpus_name not in self.corpus_list_coprora():
            logger.warning('{} does not exist.'.format(corpus_name))
            raise Exception('Given Corpus Name is not in corpora list.')
        self.corpus_name = corpus_name
        self.__create_attribute_strings()
        logger.warning('{} does exist.'.format(corpus_name))

    def disconnect(self):
        '''
        Disconnect from CQP server

        Says bye to the CQP server and closes the used connection. The
        connection is closed even if saying bye fails, so it never leaks.
        '''
        try:
            self.ctrl_bye()
        finally:
            self.connection.close()
        logger.warning('Disconnected from cqp server.')

    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
        '''
        Create subcorpus

        Input query will be used to create a subcorpus holding all cpos match
        positions for that query. The full subcorpus name
        (CORPUS_NAME:result_subcorpus_name) is stored in
        self.result_subcorpus and the number of matches in self.match_count.

        Keyword arguments:
        query -- query written in cqp query language
        result_subcorpus_name -- set name of the subcorpus which holds all
        cpos match positions, produced by the query
        '''
        self.query = query
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
        self.result_subcorpus = (self.corpus_name
                                 + ':'
                                 + result_subcorpus_name)
        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
        self.match_count = self.cqp_subcorpus_size(self.result_subcorpus)
        logger.warning('Nr of all matches is: {}'.format(self.match_count))

    def show_subcorpora(self):
        '''
        Show all subcorpora of the selected corpus as listed by
        cqp_list_subcorpora.
        '''
        return self.cqp_list_subcorpora(self.corpus_name)

    def _store_results(self, matches, cpos_lookup, text_lookup):
        '''
        Builds the result dictionary, saves it in self.results and returns
        it.
        '''
        self.results = {'code': 0,
                        'result': {'matches': matches,
                                   'match_count': self.match_count,
                                   'cpos_lookup': cpos_lookup,
                                   'text_lookup': text_lookup}
                        }
        return self.results

    def show_query_results(self,
                           context_len=10,
                           result_len=1000,
                           result_offset=0):
        '''
        Show query results

        Uses the cpos match boundaries of the last query (see
        query_subcorpus) to collect the cpos of every match and of its left
        and right context. All attribute informations for those cpos are
        requested once and returned together with the matches.

        Keyword arguments:
        context_len -- defines how many words before and after a match will be
        shown (default 10)
        result_len -- defines for how many matches all informations like lemma
        and POS are being grabbed
        result_offset -- defines the offset of the matches being requested. If
        the offset is 100, informations for the matches 100 to
        100 + result_len - 1 are being grabbed (limited by the match count)

        Returns the dictionary that is also saved in self.results:
        {'code': 0, 'result': {'matches': [...], 'match_count': ...,
        'cpos_lookup': {...}, 'text_lookup': {...}}}
        'matches' is empty if the query had no matches or if the requested
        window lies completely behind the last match.

        Raises ValueError if there are matches and result_offset is negative.
        '''
        t0 = time.time()
        self.context_len = context_len
        self.corpus_max_len = self.cl_attribute_size(
            self.attr_strings['positional_attrs']['word']
        )
        self.nr_matches = min(result_len, self.match_count)
        if self.match_count == 0:
            logger.warning('Query resulted in 0 matches.')
            return self._store_results([], {}, {})
        if result_offset < 0:
            raise ValueError('result_offset must not be negative.')
        # Get match cpos boundaries
        # The boundaries are the start and end cpos of one match as a pair of
        # cpositions
        # [(1355, 1357), (1477, 1479)] Example for two boundary pairs
        offset_start = result_offset
        logger.warning('Offset start is: {}'.format(offset_start))
        offset_end = min(self.nr_matches + result_offset - 1,
                         self.match_count - 1)
        logger.warning('Offset end is: {}'.format(offset_end))
        if offset_start > offset_end:
            # Empty window (offset behind the last match or result_len <= 0).
            # Do not ask the server to dump an inverted range.
            return self._store_results([], {}, {})
        match_starts = self.cqp_dump_subcorpus(self.result_subcorpus,
                                               CONST_FIELD_MATCH,
                                               offset_start,
                                               offset_end)
        match_ends = self.cqp_dump_subcorpus(self.result_subcorpus,
                                             CONST_FIELD_MATCHEND,
                                             offset_start,
                                             offset_end)

        # Generate all cpos between match boundaries including start and end
        # boundaries.
        # Also generate cpos for left and right context.
        # Save those cpos as lists for the keys 'lc', 'hit' and 'rc'.
        # Also collect all cpos together for the final request of all cpos
        # informations. A set is used to get rid of cpos duplicates.
        all_matches = []
        unique_cpos = set()
        for start, end in zip(match_starts, match_ends):
            end += 1  # the dumped end cpos is inclusive
            lc_cpos = list(range(max(0, start - self.context_len), start))
            match_cpos = list(range(start, end))
            rc_cpos = list(range(end, min(self.corpus_max_len,
                                          end + self.context_len)))
            unique_cpos.update(lc_cpos, match_cpos, rc_cpos)
            all_matches.append({'lc': lc_cpos,
                                'hit': match_cpos,
                                'rc': rc_cpos})
        all_cpos = sorted(unique_cpos)
        len_all_cpos = len(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
        logger.warning('Time to create all CPOS for query: {}'.format(t_total))
        logger.warning('Requesting {} CPOS with one query.'.format(len_all_cpos))
        # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
        # all cpos entries in all_cpos
        t2 = time.time()
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t3 = time.time()
        t_final = t3 - t2
        logger.warning('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
                                                                      t_final))
        return self._store_results(all_matches, all_cpos_infos, text_lookup)

    def _build_text_lookup(self, structs_to_check, text_lookup_ids):
        '''
        Requests the values of all structural attributes in structs_to_check
        for the given ids and groups them by id.

        structs_to_check -- list of one entry dictionaries of the form
        {CORPUS_NAME.attribute: attribute}
        text_lookup_ids -- list of unique structure ids

        Returns a dictionary {id: {name: value}} where name is the attribute
        name without its prefix up to the first underscore.
        '''
        text_lookup = {}
        if not text_lookup_ids:
            return text_lookup
        for d in structs_to_check:
            s_key, s_value = zip(*d.items())
            logger.warning('dict entries: {}: {}'.format(s_key, s_value))
            s_value = s_value[0].split('_', 1)[-1]
            logger.warning('S_VALUE: {}'.format(s_value))
            struct_values = self.cl_struc2str(s_key[0], text_lookup_ids)
            logger.warning('Extracted Value with key {}: {}'.format(s_key[0],struct_values))
            for zip_key, zip_value in zip(text_lookup_ids, struct_values):
                logger.warning('Text id as key is: {}'.format(zip_key))
                logger.warning('Value of this text is: {}'.format(zip_value))
                check = text_lookup.get(zip_key)
                logger.warning('check: {}'.format(check))
                text_lookup.setdefault(zip_key, {})[s_value] = zip_value
        return text_lookup

    def get_cpos_infos(self, all_cpos):
        '''
        Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
        all cpos entries specified in the parameter all_cpos.

        Returns a tuple (cpos_lookup, text_lookup):
        cpos_lookup -- {cpos: {attribute name: value}} holding the string of
        every positional attribute and the structure id of every structural
        attribute that has no values
        text_lookup -- {id: {name: value}} holding the values of the
        structural attributes that have values
        '''
        # Get all positional attribute informations
        cpos_infos = {}
        for p_attr_key, p_attr in self.attr_strings['positional_attrs'].items():
            cpos_infos[p_attr_key] = self.cl_cpos2str(p_attr, all_cpos)
        # Get all structural attribute informations
        tmp_info = {}
        structs_to_check = []
        for struct_attr_key, key in self.attr_strings['struct_attrs'].items():
            has_value = self.corpus_structural_attribute_has_values(key)
            if has_value is False:
                # Structural elements without values are the XML tags
                # themselves (elements only have values if they are XML
                # attributes). Only their ids are needed per cpos.
                tmp_info[struct_attr_key] = list(
                    self.cl_cpos2struc(key, all_cpos)
                )
            else:
                structs_to_check.append({key: struct_attr_key})
        logger.warning('Structs to check: {}'.format(structs_to_check))
        struct_attr_values = list(tmp_info.values())
        struct_attr_keys = list(tmp_info.keys())
        # Build textlookup dictionary
        # Every CPOS is associated with one text id. A set is built to only
        # gather text_lookup informations for every unique text id.
        # NOTE(review): the ids of the first structural attribute without
        # values are used as text ids. This presumably relies on 'text' being
        # listed first by the server. Confirm against the corpus setup.
        first_struct_ids = struct_attr_values[0] if struct_attr_values else []
        text_lookup_ids = list(set(first_struct_ids))
        text_lookup = self._build_text_lookup(structs_to_check,
                                              text_lookup_ids)
        # zip keys and values together
        attr_keys_list = list(cpos_infos.keys()) + struct_attr_keys
        attr_values_list = list(cpos_infos.values()) + struct_attr_values
        dict_cpos_infos = {
            info[0]: dict(zip(attr_keys_list, info[1:]))
            for info in zip(all_cpos, *attr_values_list)
        }
        return dict_cpos_infos, text_lookup

    def get_sentences(self,
                      match_cpos_list,
                      get_surrounding_s=False,
                      l_r_s_context_additional_len=1):
        '''
        Get sentence informations for one match. Also set if and how many
        left and right context sentences should be grabbed surrounding the
        given CPOS.

        Keyword arguments:
        match_cpos_list -- list of all cpos of one match. Only the first and
        the last cpos are used to find the sentences.
        get_surrounding_s -- also grab the sentences around the match
        sentences (default False)
        l_r_s_context_additional_len -- number of additional sentences to
        grab on each side if get_surrounding_s is set (default 1)

        Returns a dictionary with the keys 'context_s_cpos' ({sentence id:
        list of cpos}), 'cpos_lookup', 'text_lookup' and 'match_cpos_list'.
        '''
        t0 = time.time()
        s_attr = self.corpus_name + '.s'
        first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
        # According to the CQi specification -1 is reported for a cpos that
        # is not inside any region. Such ids can not be resolved to cpos.
        match_s_ids = [s_id
                       for s_id in self.cl_cpos2struc(s_attr,
                                                      [first_cpos, last_cpos])
                       if s_id >= 0]
        wanted_s_ids = list(match_s_ids)
        if get_surrounding_s and match_s_ids:
            # The attribute size is the number of sentences (CQi
            # specification), so the last valid sentence id is size - 1.
            last_s_id = self.cl_attribute_size(s_attr) - 1
            lowest_s_id = max(match_s_ids[0] - l_r_s_context_additional_len,
                              0)
            highest_s_id = min(match_s_ids[-1] + l_r_s_context_additional_len,
                               last_s_id)
            wanted_s_ids.extend(range(lowest_s_id, highest_s_id + 1))
        context_sentences = {}
        for s_id in wanted_s_ids:
            if s_id in context_sentences:
                continue  # every sentence is requested only once
            s_start, s_end = self.cl_struc2cpos(s_attr, s_id)
            context_sentences[s_id] = list(range(s_start, s_end + 1))
        unique_cpos = set()
        for s_cpos in context_sentences.values():
            unique_cpos.update(s_cpos)
        all_cpos = sorted(unique_cpos)
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
        logger.warning('Got all sentences informations in {} seconds'. format(t_total))
        match_context = {'context_s_cpos': context_sentences,
                         'cpos_lookup': all_cpos_infos,
                         'text_lookup': text_lookup,
                         'match_cpos_list': match_cpos_list}
        return match_context