mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-12-24 10:34:17 +00:00
CQiWrapper new data structure
This commit is contained in:
parent
30f60b60c2
commit
b3d5c15df3
@ -1,8 +1,7 @@
|
|||||||
from .CQiClient import CQiClient
|
from CQiClient import CQiClient
|
||||||
from .CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
|
from CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
|
||||||
import collections
|
|
||||||
import re
|
import re
|
||||||
from app import logger # only works if imported into opaque web app
|
# from app import logger # only works if imported into opaque web app
|
||||||
|
|
||||||
|
|
||||||
class CQiWrapper(CQiClient):
|
class CQiWrapper(CQiClient):
|
||||||
@ -55,16 +54,16 @@ class CQiWrapper(CQiClient):
|
|||||||
self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
|
self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
|
||||||
+ '.'
|
+ '.'
|
||||||
+ struct_attr)
|
+ struct_attr)
|
||||||
logger.warning(('All positional and '
|
# logger.warning(('All positional and '
|
||||||
'structural attributes: {}').format(self.attr_strings))
|
# 'structural attributes: {}').format(self.attr_strings))
|
||||||
|
|
||||||
def select_corpus(self, corpus_name):
|
def select_corpus(self, corpus_name):
|
||||||
if corpus_name in self.corpus_list_coprora():
|
if corpus_name in self.corpus_list_coprora():
|
||||||
self.corpus_name = corpus_name
|
self.corpus_name = corpus_name
|
||||||
self.__create_attribute_strings()
|
self.__create_attribute_strings()
|
||||||
logger.warning('{} does exist.'.format(corpus_name))
|
# logger.warning('{} does exist.'.format(corpus_name))
|
||||||
else:
|
else:
|
||||||
logger.warning('{} does not exist.'.format(corpus_name))
|
# logger.warning('{} does not exist.'.format(corpus_name))
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def disconnect(self):
|
def disconnect(self):
|
||||||
@ -75,7 +74,7 @@ class CQiWrapper(CQiClient):
|
|||||||
"""
|
"""
|
||||||
self.ctrl_bye()
|
self.ctrl_bye()
|
||||||
self.connection.close()
|
self.connection.close()
|
||||||
logger.warning('Disconnected from cqp server.')
|
# logger.warning('Disconnected from cqp server.')
|
||||||
|
|
||||||
def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
|
def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
|
||||||
"""
|
"""
|
||||||
@ -95,7 +94,7 @@ class CQiWrapper(CQiClient):
|
|||||||
+ result_subcorpus_name)
|
+ result_subcorpus_name)
|
||||||
self.SUBCORPUS_NAMES.append(self.result_subcorpus)
|
self.SUBCORPUS_NAMES.append(self.result_subcorpus)
|
||||||
self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
|
self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
|
||||||
logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
|
# logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
|
||||||
|
|
||||||
def show_subcorpora(self):
|
def show_subcorpora(self):
|
||||||
"""
|
"""
|
||||||
@ -125,7 +124,7 @@ class CQiWrapper(CQiClient):
|
|||||||
)
|
)
|
||||||
self.nr_matches = min(result_len, self.nr_matches)
|
self.nr_matches = min(result_len, self.nr_matches)
|
||||||
if self.nr_matches == 0:
|
if self.nr_matches == 0:
|
||||||
logger.warning('Query resulted in 0 matches.')
|
# logger.warning('Query resulted in 0 matches.')
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
# Get match cpos boundries
|
# Get match cpos boundries
|
||||||
@ -141,86 +140,49 @@ class CQiWrapper(CQiClient):
|
|||||||
0,
|
0,
|
||||||
self.nr_matches - 1))
|
self.nr_matches - 1))
|
||||||
|
|
||||||
# Generate all cpos between boundries including start and end boundries
|
# Generate all cpos between match boundaries, including the start and end boundaries.
|
||||||
# Save them as list into on match entry at serial number 'i'
|
# Also generate cpos for left and right context.
|
||||||
ordered_matches = collections.OrderedDict()
|
# Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
|
||||||
for i, match_pair in enumerate(match_boundaries):
|
# Also collect all cpos together in one list for the final request of
|
||||||
ordered_matches[i] = ({'match_cpos':
|
# all cpos information
|
||||||
list(range(match_pair[0],
|
all_matches = []
|
||||||
match_pair[1] + 1))})
|
all_cpos = []
|
||||||
# Saves cpos form all match entries into one list
|
for start, end in match_boundaries:
|
||||||
all_cpos_list = []
|
lc_cpos = list(range(max([0, start - self.context_len]), start))
|
||||||
for key in ordered_matches.keys():
|
lc = {'lc': lc_cpos}
|
||||||
all_cpos_list += ordered_matches[key]['match_cpos']
|
match_cpos = list(range(start, end + 1))
|
||||||
|
match = {'hit': match_cpos}
|
||||||
|
rc_cpos = list(range(end + 1, min([self.corpus_max_len, end + self.context_len + 1])))
|
||||||
|
rc = {'rc': rc_cpos}
|
||||||
|
lc.update(match)
|
||||||
|
lc.update(rc)
|
||||||
|
all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
|
||||||
|
all_matches.append(lc)
|
||||||
|
# print(all_matches)
|
||||||
|
# print(all_cpos)
|
||||||
|
|
||||||
# Saves all cpos from before and after context into the list:
|
# Get all sentence IDs for all cpos collected above in all_cpos
|
||||||
# all_context_cpos_list
|
s_ids = self.cl_cpos2struc('UTOPIEN.s', all_cpos) # CHANGE to CORPUS.s will always be like this in nopaque
|
||||||
all_context_cpos_list = []
|
# Get all cpos for all sentence boundaries
|
||||||
for key in ordered_matches.keys():
|
s_lookup = {}
|
||||||
cpos_list = ordered_matches[key]['match_cpos']
|
for s_id in set(s_ids):
|
||||||
before_index = max([0, cpos_list[0] - self.context_len])
|
s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id) # CHANGE to CORPUS.s will always be like this in nopaque
|
||||||
after_index = min([self.corpus_max_len,
|
# print(s_start, s_end)
|
||||||
cpos_list[-1] + self.context_len])
|
s_cpos = range(s_start, s_end)
|
||||||
ordered_matches[key]['left_context_cpos'] = list(range(before_index,
|
s_lookup.update({s_id: list(s_cpos)})
|
||||||
cpos_list[0]))
|
# print(list(s_cpos))
|
||||||
ordered_matches[key]['right_context_cpos'] = list(range(cpos_list[-1] + 1,
|
all_cpos.extend(s_cpos)
|
||||||
after_index + 1))
|
all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
|
||||||
all_context_cpos_list += ordered_matches[key]['left_context_cpos']
|
|
||||||
all_context_cpos_list += ordered_matches[key]['right_context_cpos']
|
|
||||||
# Combines all_cpos_list with all_context_cpos_list as a sorted set
|
|
||||||
all_cpos_list += all_context_cpos_list
|
|
||||||
all_cpos_list = sorted(list(set(all_cpos_list)))
|
|
||||||
|
|
||||||
# Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
|
# Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
|
||||||
# all cpos entries in all_cpos_list
|
# all cpos entries in all_cpos_list
|
||||||
# Also saves these informations into the ordered_matches dict
|
# Also saves this information into the self.results dict
|
||||||
all_cpos_infos, s_list = self.get_cpos_infos(all_cpos_list)
|
all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
|
||||||
for key in ordered_matches.keys():
|
|
||||||
# loops over cpos in cpos_list which holds all match cpos
|
self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos,
|
||||||
# Replaces one cpos with the corresponding cpos information created
|
's_lookup': s_lookup, 'text_lookup': text_lookup}
|
||||||
# by self.get_cpos_infos(all_cpos_list)
|
return self.results
|
||||||
cpos_list = ordered_matches[key]['match_cpos']
|
# print(self.results)
|
||||||
infos = []
|
|
||||||
for cpos in cpos_list:
|
|
||||||
info = {cpos: all_cpos_infos.get(cpos)}
|
|
||||||
infos.append(info)
|
|
||||||
ordered_matches[key]['match_cpos'] = infos
|
|
||||||
try:
|
|
||||||
# loops over cpos in ordered_matches[key]['left_context_cpos']
|
|
||||||
# which holds all cpos of the before context
|
|
||||||
# Replaces one cpos with the corresponding cpos information created
|
|
||||||
# by self.get_cpos_infos(all_cpos_list)
|
|
||||||
before_context_infos = []
|
|
||||||
for context_before_cpos in ordered_matches[key]['left_context_cpos']:
|
|
||||||
before_context_info = {context_before_cpos:
|
|
||||||
all_cpos_infos.get(context_before_cpos)}
|
|
||||||
before_context_infos.append(before_context_info)
|
|
||||||
ordered_matches[key]['left_context_cpos'] = before_context_infos
|
|
||||||
except UnboundLocalError:
|
|
||||||
logger.warning('Context before cpos list is empty.')
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
# loops over cpos in ordered_matches[key]['right_context_cpos']
|
|
||||||
# which holds all cpos of the before context
|
|
||||||
# Replaces one cpos with the corresponding cpos information created
|
|
||||||
# by self.get_cpos_infos(all_cpos_list)
|
|
||||||
after_context_infos = []
|
|
||||||
for context_after_cpos in ordered_matches[key]['right_context_cpos']:
|
|
||||||
after_context_info = {context_after_cpos:
|
|
||||||
all_cpos_infos.get(context_after_cpos)}
|
|
||||||
after_context_infos.append(after_context_info)
|
|
||||||
ordered_matches[key]['right_context_cpos'] = after_context_infos
|
|
||||||
except UnboundLocalError:
|
|
||||||
logger.warning('Context after cpos list is empty.')
|
|
||||||
pass
|
|
||||||
sentences = {}
|
|
||||||
s_list = set(s_list)
|
|
||||||
for s_id in s_list:
|
|
||||||
s_start, s_end = self.cl_struc2cpos('CORPUS.s', s_id)
|
|
||||||
sentence = self.cl_cpos2str('CORPUS.word', range(s_start, s_end + 1))
|
|
||||||
sentences.update({s_id: re.sub(r' (?=\W)', '', ' '.join(sentence))})
|
|
||||||
ordered_matches['sentences'] = sentences
|
|
||||||
return ordered_matches
|
|
||||||
|
|
||||||
def get_cpos_infos(self, all_cpos):
|
def get_cpos_infos(self, all_cpos):
|
||||||
'''
|
'''
|
||||||
@ -228,25 +190,42 @@ class CQiWrapper(CQiClient):
|
|||||||
all cpos entries specified in the parameter all_cpos.
|
all cpos entries specified in the parameter all_cpos.
|
||||||
'''
|
'''
|
||||||
cpos_infos = {}
|
cpos_infos = {}
|
||||||
s_list = []
|
for p_attr_key in self.attr_strings['positional_attrs'].keys():
|
||||||
for key in self.attr_strings.keys():
|
match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos)
|
||||||
if key == 'positional_attrs':
|
cpos_infos[p_attr_key] = match_strs
|
||||||
for p_attr_key in self.attr_strings[key].keys():
|
|
||||||
match_strs = self.cl_cpos2str(self.attr_strings[key][p_attr_key],
|
tmp_s_info = []
|
||||||
all_cpos)
|
tmp_text_info = []
|
||||||
cpos_infos[p_attr_key] = match_strs
|
text_lookup = {}
|
||||||
elif key == 'struct_attrs':
|
tmp_dict = {}
|
||||||
for struct_attr_key in self.attr_strings[key].keys():
|
for struct_attr_key in self.attr_strings['struct_attrs'].keys():
|
||||||
struct_entry = self.cl_cpos2struc(self.attr_strings[key][struct_attr_key],
|
check = self.attr_strings['struct_attrs'][struct_attr_key]
|
||||||
all_cpos)
|
if check == 'UTOPIEN.s':
|
||||||
has_value = self.corpus_structural_attribute_has_values(self.attr_strings[key][struct_attr_key])
|
struct_ids = self.cl_cpos2struc(check, all_cpos)
|
||||||
if has_value:
|
for id in struct_ids:
|
||||||
match_strs = self.cl_struc2str(self.attr_strings[key][struct_attr_key], struct_entry)
|
tmp_s_info.append({struct_attr_key: id})
|
||||||
elif self.attr_strings[key][struct_attr_key] == 'CORPUS.s':
|
elif check == 'UTOPIEN.entry':
|
||||||
s_list.extend(struct_entry)
|
struct_ids = self.cl_cpos2struc(check, all_cpos)
|
||||||
else:
|
for id in struct_ids:
|
||||||
match_strs = [None for i in struct_entry]
|
tmp_text_info.append({struct_attr_key: id})
|
||||||
cpos_infos[struct_attr_key] = zip(struct_entry, match_strs)
|
else:
|
||||||
|
struct_ids = struct_ids = self.cl_cpos2struc(check, all_cpos)
|
||||||
|
struct_values = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_ids)
|
||||||
|
for value in struct_values:
|
||||||
|
for id in struct_ids:
|
||||||
|
tmp_dict.update({id: {struct_attr_key: value}})
|
||||||
|
print(tmp_dict)
|
||||||
|
print(text_lookup)
|
||||||
|
|
||||||
|
# struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][struct_attr_key], all_cpos)
|
||||||
|
# has_value = self.corpus_structural_attribute_has_values(self.attr_strings['struct_attrs'][struct_attr_key])
|
||||||
|
# if has_value:
|
||||||
|
# match_strs = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_entry)
|
||||||
|
# elif self.attr_strings['struct_attrs'][struct_attr_key] == 'CORPUS.s':
|
||||||
|
# pass
|
||||||
|
# else:
|
||||||
|
# match_strs = [None for i in struct_entry]
|
||||||
|
# cpos_infos[struct_attr_key] = zip(struct_entry, match_strs)
|
||||||
tmp_list = []
|
tmp_list = []
|
||||||
attr_key_list = []
|
attr_key_list = []
|
||||||
for key in cpos_infos.keys():
|
for key in cpos_infos.keys():
|
||||||
@ -256,4 +235,7 @@ class CQiWrapper(CQiClient):
|
|||||||
dict_cpos_infos = {}
|
dict_cpos_infos = {}
|
||||||
for info in joined_cpos_infos:
|
for info in joined_cpos_infos:
|
||||||
dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:]))
|
dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:]))
|
||||||
return dict_cpos_infos, s_list
|
for key, s_id, text_id in zip(dict_cpos_infos.keys(), tmp_s_info, tmp_text_info):
|
||||||
|
dict_cpos_infos[key].update(s_id)
|
||||||
|
dict_cpos_infos[key].update(text_id)
|
||||||
|
return dict_cpos_infos, text_lookup
|
||||||
|
Loading…
Reference in New Issue
Block a user