Merge branch 'development' of gitlab.ub.uni-bielefeld.de:sfb1288inf/opaque into development

This commit is contained in:
Patrick Jentsch 2019-12-02 14:24:37 +01:00
commit f9f6857e4e

View File

@ -5,7 +5,7 @@ from app import logger # only works if imported into opaque web app
class CQiWrapper(CQiClient): class CQiWrapper(CQiClient):
""" '''
CQIiWrapper object CQIiWrapper object
High level wrapper that groups and renames some functions of CQiClient High level wrapper that groups and renames some functions of CQiClient
@ -16,7 +16,7 @@ class CQiWrapper(CQiClient):
port -- port of the cqp server port -- port of the cqp server
username -- username used to connect to the cqp server username -- username used to connect to the cqp server
password -- password of the user to connect to the cqp server password -- password of the user to connect to the cqp server
""" '''
SUBCORPUS_NAMES = [] SUBCORPUS_NAMES = []
@ -27,20 +27,21 @@ class CQiWrapper(CQiClient):
self.password = password self.password = password
def connect(self): def connect(self):
""" '''
Connect with CQP server Connect with CQP server
Connects via socket to the CQP server using the given username and Connects via socket to the CQP server using the given username and
password from class initiation. password from class initiation.
""" '''
self.ctrl_connect(self.username, self.password) self.ctrl_connect(self.username, self.password)
def __create_attribute_strings(self): def __create_attribute_strings(self):
""" '''
Creates all needed attribute strings to query for word, lemma etc. in Creates all needed attribute strings to query for word, lemma etc. in
the given corpus. the given corpus.
For example: CORPUS_NAME.word to query words For example: CORPUS_NAME.word to query words
""" Automaticalle creates strings for all pre defined tags.
'''
p_attrs = self.corpus_positional_attributes(self.corpus_name) p_attrs = self.corpus_positional_attributes(self.corpus_name)
struct_attrs = self.corpus_structural_attributes(self.corpus_name) struct_attrs = self.corpus_structural_attributes(self.corpus_name)
self.attr_strings = {} self.attr_strings = {}
@ -54,40 +55,45 @@ class CQiWrapper(CQiClient):
self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
+ '.' + '.'
+ struct_attr) + struct_attr)
# logger.warning(('All positional and ' logger.warning(('All positional and '
# 'structural attributes: {}').format(self.attr_strings)) 'structural attributes: {}').format(self.attr_strings))
def select_corpus(self, corpus_name): def select_corpus(self, corpus_name):
'''
Checks if the given corpus name exists. If it exists, set it as the main
corpus name used to create the needed query attribute strings like
CORPUS_NAME.word.
'''
if corpus_name in self.corpus_list_coprora(): if corpus_name in self.corpus_list_coprora():
self.corpus_name = corpus_name self.corpus_name = corpus_name
self.__create_attribute_strings() self.__create_attribute_strings()
# logger.warning('{} does exist.'.format(corpus_name)) logger.warning('{} does exist.'.format(corpus_name))
else: else:
# logger.warning('{} does not exist.'.format(corpus_name)) logger.warning('{} does not exist.'.format(corpus_name))
pass raise Exception('Given Corpus Name is not in corpora list.')
def disconnect(self): def disconnect(self):
""" '''
Disconnect from CQP server Disconnect from CQP server
Disconnects from the CQP server. Closes used socket after disconnect. Disconnects from the CQP server. Closes used socket after disconnect.
""" '''
self.ctrl_bye() self.ctrl_bye()
self.connection.close() self.connection.close()
# logger.warning('Disconnected from cqp server.') logger.warning('Disconnected from cqp server.')
def query_subcorpus(self, query, result_subcorpus_name='Query-results'): def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
""" '''
Create subcorpus Create subcorpus
Input query will be used to create a subcorpus holding all cpos match Input query will be used to create a subcorpus holding all cpos match
positions for that query. positions for that query.
Keyword arguments: Keyword arguments:
result_subcorpus_name -- user set name of the subcorpus which holds all result_subcorpus_name -- set name of the subcorpus which holds all
cpos match positions, produced by the query cpos match positions, produced by the query
query -- query written in cqp query language query -- query written in cqp query language
""" '''
self.cqp_query(self.corpus_name, result_subcorpus_name, query) self.cqp_query(self.corpus_name, result_subcorpus_name, query)
self.result_subcorpus = (self.corpus_name self.result_subcorpus = (self.corpus_name
+ ':' + ':'
@ -95,19 +101,19 @@ class CQiWrapper(CQiClient):
self.SUBCORPUS_NAMES.append(self.result_subcorpus) self.SUBCORPUS_NAMES.append(self.result_subcorpus)
self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus) self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
print('Nr of all matches is:', self.nr_matches) print('Nr of all matches is:', self.nr_matches)
# logger.warning('Nr of all matches is: {}'.format(self.nr_matches)) logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
def show_subcorpora(self): def show_subcorpora(self):
""" '''
Show all subcorpora currently saved by the cqp server. Show all subcorpora currently saved by the cqp server.
""" '''
return self.cqp_list_subcorpora(self.corpus_name) return self.cqp_list_subcorpora(self.corpus_name)
def show_query_results(self, def show_query_results(self,
context_len=10, context_len=10,
result_len=1000, result_len=1000,
result_offset=0): result_offset=0):
""" '''
Show query results Show query results
Shows the actual matched strings produce by the query. Uses the cpos Shows the actual matched strings produce by the query. Uses the cpos
@ -118,15 +124,20 @@ class CQiWrapper(CQiClient):
Keyword arguments: Keyword arguments:
context_len -- defines how many words before and after a match will be context_len -- defines how many words before and after a match will be
shown (default 10) shown (default 10)
result_len -- defines how many results are actually grabbed result_len -- defines for how many matches all informations like lemma
""" and POS are being grabbed
result_offset -- defines the offset of the matches being requested. If
the offset is 100, information for matches 100 to result_len is being
grabbed
'''
t0 = time.time()
self.context_len = context_len self.context_len = context_len
self.corpus_max_len = self.cl_attribute_size( self.corpus_max_len = self.cl_attribute_size(
self.attr_strings['positional_attrs']['word'] self.attr_strings['positional_attrs']['word']
) )
self.nr_matches = min(result_len, self.nr_matches) self.nr_matches = min(result_len, self.nr_matches)
if self.nr_matches == 0: if self.nr_matches == 0:
# logger.warning('Query resulted in 0 matches.') logger.warning('Query resulted in 0 matches.')
return None return None
else: else:
# Get match cpos boundries # Get match cpos boundries
@ -144,7 +155,8 @@ class CQiWrapper(CQiClient):
offset_start, offset_start,
offset_end)) offset_end))
# Generate all cpos between match boundries including start and end boundries. # Generate all cpos between match boundries including start and end
# boundries.
# Also generate cpos for left and right context. # Also generate cpos for left and right context.
# Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc' # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
# Also collect all cpos together in one list for the final request of # Also collect all cpos together in one list for the final request of
@ -157,42 +169,34 @@ class CQiWrapper(CQiClient):
lc = {'lc': lc_cpos} lc = {'lc': lc_cpos}
match_cpos = list(range(start, end)) match_cpos = list(range(start, end))
match = {'hit': match_cpos} match = {'hit': match_cpos}
rc_cpos = list(range(end, min([self.corpus_max_len, end + self.context_len]))) rc_cpos = list(range(end, min([self.corpus_max_len,
end + self.context_len])))
rc = {'rc': rc_cpos} rc = {'rc': rc_cpos}
lc.update(match) lc.update(match)
lc.update(rc) lc.update(rc)
all_cpos.extend(lc_cpos + match_cpos + rc_cpos) all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
all_matches.append(lc) all_matches.append(lc)
# print(all_matches)
# print(all_cpos)
# Get all cpos for all sneteces boundries all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
# s_lookup = {} len_all_cpos = len(all_cpos)
# for s_id in set(s_ids):
# s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id)
# # CHANGE to UTOPIEN.s will always be like this in nopaque
# s_cpos = range(s_start, s_end)
# s_lookup.update({s_id: list(s_cpos)})
# # print(list(s_cpos))
# all_cpos.extend(s_cpos)
t0 = time.time()
all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
t1 = time.time() t1 = time.time()
t_total = t1 - t0 t_total = t1 - t0
print('TIME FOR ALL CPOS:', t_total) logger.warning('Time to create all CPOS for query: {}'.format(t_total))
print('CPOS SUM:', len(all_cpos)) print('Requesting {} CPOS with one query.'.format(len_all_cpos))
# Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
# all cpos entries in all_cpos_list # all cpos entries in all_cpos_list
# Also saves these informations into self.results dict # Also saves these informations into self.results dict
t6 = time.time() t2 = time.time()
all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos) all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
t7 = time.time() t3 = time.time()
t_final = t7 - t6 t_final = t3 - t2
print('GOT ALL RESULTS IN:', t_final) print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
t_final))
self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos, self.results = {'matches': all_matches,
'text_lookup': text_lookup} 'cpos_lookup': all_cpos_infos,
'text_lookup': text_lookup,
'nr_matches': self.nr_matches}
return self.results return self.results
def get_cpos_infos(self, all_cpos): def get_cpos_infos(self, all_cpos):
@ -250,3 +254,44 @@ class CQiWrapper(CQiClient):
for info in joined_cpos_infos: for info in joined_cpos_infos:
dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:])) dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
return dict_cpos_infos, text_lookup return dict_cpos_infos, text_lookup
def get_sentences(self,
                  match_cpos_list,
                  get_surrounding_s=False,
                  l_r_s_context_additional_len=1):
    '''
    Get sentence information for one match

    Looks up the sentences (structural attribute CORPUS_NAME.s) that
    contain the first and the last CPOS of the given match. Optionally the
    sentences to the left and right of those sentences are grabbed as well.
    For every CPOS of all collected sentences the CPOS informations are
    requested with get_cpos_infos.

    Keyword arguments:
    match_cpos_list -- list of CPOS of one match. Only the first and the
    last entry are used. An empty list raises an IndexError.
    get_surrounding_s -- if True, sentences surrounding the match sentences
    are grabbed too (default False)
    l_r_s_context_additional_len -- how many sentences are grabbed to the
    left and to the right of the match sentences (default 1)

    Returns a tuple (context_sentences, all_cpos_infos, text_lookup).
    context_sentences maps every sentence id to the list of its CPOS. The
    other two values are the result of get_cpos_infos for all those CPOS.
    '''
    t0 = time.time()
    s_attr = self.corpus_name + '.s'
    first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
    s_ids = self.cl_cpos2struc(s_attr, [first_cpos, last_cpos])

    # Sentence ids to fetch, match sentences first so their position in the
    # returned dict stays the same as before.
    wanted_s_ids = list(s_ids)
    if get_surrounding_s:
        # cl_attribute_size is a count (it is used as an exclusive upper
        # bound for CPOS elsewhere in this class), so the highest valid
        # sentence id is size - 1. Using the size itself as an id would
        # request a sentence that does not exist.
        last_valid_s_id = self.cl_attribute_size(s_attr) - 1
        lowest_s_id = max(s_ids[0] - l_r_s_context_additional_len, 0)
        highest_s_id = min(s_ids[-1] + l_r_s_context_additional_len,
                           last_valid_s_id)
        wanted_s_ids.extend(range(lowest_s_id, highest_s_id + 1))

    context_sentences = {}
    for s_id in wanted_s_ids:
        if s_id in context_sentences:
            # Already fetched (first and last CPOS share a sentence or the
            # surrounding range overlaps the match sentences). Skip the
            # redundant server round trip.
            continue
        s_start, s_end = self.cl_struc2cpos(s_attr, s_id)
        # s_end is inclusive, hence the + 1
        context_sentences[s_id] = list(range(s_start, s_end + 1))

    # Collect every CPOS once, sentences should not overlap but duplicates
    # are removed anyway before the final request.
    all_cpos = list({cpos
                     for s_cpos in context_sentences.values()
                     for cpos in s_cpos})
    all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
    t1 = time.time()
    t_total = t1 - t0
    logger.warning(
        'Got all sentences informations in {} seconds'.format(t_total)
    )
    return context_sentences, all_cpos_infos, text_lookup