mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-11-15 01:05:42 +00:00
Merge branch 'development' of gitlab.ub.uni-bielefeld.de:sfb1288inf/opaque into development
This commit is contained in:
commit
f9f6857e4e
@ -5,7 +5,7 @@ from app import logger # only works if imported into opaque web app
|
|||||||
|
|
||||||
|
|
||||||
class CQiWrapper(CQiClient):
|
class CQiWrapper(CQiClient):
|
||||||
"""
|
'''
|
||||||
CQiWrapper object
|
CQiWrapper object
|
||||||
|
|
||||||
High level wrapper that groups and renames some functions of CQiClient
|
High level wrapper that groups and renames some functions of CQiClient
|
||||||
@ -16,7 +16,7 @@ class CQiWrapper(CQiClient):
|
|||||||
port -- port of the cqp server
|
port -- port of the cqp server
|
||||||
username -- username used to connect to the cqp server
|
username -- username used to connect to the cqp server
|
||||||
password -- password of the user to connect to the cqp server
|
password -- password of the user to connect to the cqp server
|
||||||
"""
|
'''
|
||||||
|
|
||||||
SUBCORPUS_NAMES = []
|
SUBCORPUS_NAMES = []
|
||||||
|
|
||||||
@ -27,20 +27,21 @@ class CQiWrapper(CQiClient):
|
|||||||
self.password = password
|
self.password = password
|
||||||
|
|
||||||
def connect(self):
|
def connect(self):
|
||||||
"""
|
'''
|
||||||
Connect with CQP server
|
Connect with CQP server
|
||||||
|
|
||||||
Connects via socket to the CQP server using the given username and
|
Connects via socket to the CQP server using the given username and
|
||||||
password from class initiation.
|
password from class initiation.
|
||||||
"""
|
'''
|
||||||
self.ctrl_connect(self.username, self.password)
|
self.ctrl_connect(self.username, self.password)
|
||||||
|
|
||||||
def __create_attribute_strings(self):
|
def __create_attribute_strings(self):
|
||||||
"""
|
'''
|
||||||
Creates all needed attribute strings to query for word, lemma etc. in
|
Creates all needed attribute strings to query for word, lemma etc. in
|
||||||
the given corpus.
|
the given corpus.
|
||||||
For example: CORPUS_NAME.word to query words
|
For example: CORPUS_NAME.word to query words
|
||||||
"""
|
Automatically creates strings for all predefined tags.
|
||||||
|
'''
|
||||||
p_attrs = self.corpus_positional_attributes(self.corpus_name)
|
p_attrs = self.corpus_positional_attributes(self.corpus_name)
|
||||||
struct_attrs = self.corpus_structural_attributes(self.corpus_name)
|
struct_attrs = self.corpus_structural_attributes(self.corpus_name)
|
||||||
self.attr_strings = {}
|
self.attr_strings = {}
|
||||||
@ -54,40 +55,45 @@ class CQiWrapper(CQiClient):
|
|||||||
self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
|
self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
|
||||||
+ '.'
|
+ '.'
|
||||||
+ struct_attr)
|
+ struct_attr)
|
||||||
# logger.warning(('All positional and '
|
logger.warning(('All positional and '
|
||||||
# 'structural attributes: {}').format(self.attr_strings))
|
'structural attributes: {}').format(self.attr_strings))
|
||||||
|
|
||||||
def select_corpus(self, corpus_name):
|
def select_corpus(self, corpus_name):
|
||||||
|
'''
|
||||||
|
Checks if the given corpus name exists. If it exists, set it as the main
|
||||||
|
corpus name used to create the needed query attribute strings like
|
||||||
|
CORPUS_NAME.word.
|
||||||
|
'''
|
||||||
if corpus_name in self.corpus_list_coprora():
|
if corpus_name in self.corpus_list_coprora():
|
||||||
self.corpus_name = corpus_name
|
self.corpus_name = corpus_name
|
||||||
self.__create_attribute_strings()
|
self.__create_attribute_strings()
|
||||||
# logger.warning('{} does exist.'.format(corpus_name))
|
logger.warning('{} does exist.'.format(corpus_name))
|
||||||
else:
|
else:
|
||||||
# logger.warning('{} does not exist.'.format(corpus_name))
|
logger.warning('{} does not exist.'.format(corpus_name))
|
||||||
pass
|
raise Exception('Given Corpus Name is not in corpora list.')
|
||||||
|
|
||||||
def disconnect(self):
|
def disconnect(self):
|
||||||
"""
|
'''
|
||||||
Disconnect from CQP server
|
Disconnect from CQP server
|
||||||
|
|
||||||
Disconnects from the CQP server. Closes used socket after disconnect.
|
Disconnects from the CQP server. Closes used socket after disconnect.
|
||||||
"""
|
'''
|
||||||
self.ctrl_bye()
|
self.ctrl_bye()
|
||||||
self.connection.close()
|
self.connection.close()
|
||||||
# logger.warning('Disconnected from cqp server.')
|
logger.warning('Disconnected from cqp server.')
|
||||||
|
|
||||||
def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
|
def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
|
||||||
"""
|
'''
|
||||||
Create subcorpus
|
Create subcorpus
|
||||||
|
|
||||||
Input query will be used to create a subcorpus holding all cpos match
|
Input query will be used to create a subcorpus holding all cpos match
|
||||||
positions for that query.
|
positions for that query.
|
||||||
|
|
||||||
Keyword arguments:
|
Keyword arguments:
|
||||||
result_subcorpus_name -- user set name of the subcorpus which holds all
|
result_subcorpus_name -- set name of the subcorpus which holds all
|
||||||
cpos match positions, produced by the query
|
cpos match positions, produced by the query
|
||||||
query -- query written in cqp query language
|
query -- query written in cqp query language
|
||||||
"""
|
'''
|
||||||
self.cqp_query(self.corpus_name, result_subcorpus_name, query)
|
self.cqp_query(self.corpus_name, result_subcorpus_name, query)
|
||||||
self.result_subcorpus = (self.corpus_name
|
self.result_subcorpus = (self.corpus_name
|
||||||
+ ':'
|
+ ':'
|
||||||
@ -95,19 +101,19 @@ class CQiWrapper(CQiClient):
|
|||||||
self.SUBCORPUS_NAMES.append(self.result_subcorpus)
|
self.SUBCORPUS_NAMES.append(self.result_subcorpus)
|
||||||
self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
|
self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
|
||||||
print('Nr of all matches is:', self.nr_matches)
|
print('Nr of all matches is:', self.nr_matches)
|
||||||
# logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
|
logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
|
||||||
|
|
||||||
def show_subcorpora(self):
|
def show_subcorpora(self):
|
||||||
"""
|
'''
|
||||||
Show all subcorpora currently saved by the cqp server.
|
Show all subcorpora currently saved by the cqp server.
|
||||||
"""
|
'''
|
||||||
return self.cqp_list_subcorpora(self.corpus_name)
|
return self.cqp_list_subcorpora(self.corpus_name)
|
||||||
|
|
||||||
def show_query_results(self,
|
def show_query_results(self,
|
||||||
context_len=10,
|
context_len=10,
|
||||||
result_len=1000,
|
result_len=1000,
|
||||||
result_offset=0):
|
result_offset=0):
|
||||||
"""
|
'''
|
||||||
Show query results
|
Show query results
|
||||||
|
|
||||||
Shows the actual matched strings produced by the query. Uses the cpos
|
Shows the actual matched strings produced by the query. Uses the cpos
|
||||||
@ -118,15 +124,20 @@ class CQiWrapper(CQiClient):
|
|||||||
Keyword arguments:
|
Keyword arguments:
|
||||||
context_len -- defines how many words before and after a match will be
|
context_len -- defines how many words before and after a match will be
|
||||||
shown (default 10)
|
shown (default 10)
|
||||||
result_len -- defines how many results are actually grabbed
|
result_len -- defines for how many matches all information like lemma
|
||||||
"""
|
and POS is being grabbed
|
||||||
|
result_offset -- defines the offset of the matches being requested. If
|
||||||
|
the offset is 100, information for matches 100 to result_len is being
|
||||||
|
grabbed
|
||||||
|
'''
|
||||||
|
t0 = time.time()
|
||||||
self.context_len = context_len
|
self.context_len = context_len
|
||||||
self.corpus_max_len = self.cl_attribute_size(
|
self.corpus_max_len = self.cl_attribute_size(
|
||||||
self.attr_strings['positional_attrs']['word']
|
self.attr_strings['positional_attrs']['word']
|
||||||
)
|
)
|
||||||
self.nr_matches = min(result_len, self.nr_matches)
|
self.nr_matches = min(result_len, self.nr_matches)
|
||||||
if self.nr_matches == 0:
|
if self.nr_matches == 0:
|
||||||
# logger.warning('Query resulted in 0 matches.')
|
logger.warning('Query resulted in 0 matches.')
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
# Get match cpos boundaries
|
# Get match cpos boundaries
|
||||||
@ -144,7 +155,8 @@ class CQiWrapper(CQiClient):
|
|||||||
offset_start,
|
offset_start,
|
||||||
offset_end))
|
offset_end))
|
||||||
|
|
||||||
# Generate all cpos between match boundaries including start and end boundaries.
|
# Generate all cpos between match boundaries including start and end
|
||||||
|
# boundaries.
|
||||||
# Also generate cpos for left and right context.
|
# Also generate cpos for left and right context.
|
||||||
# Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
|
# Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
|
||||||
# Also collect all cpos together in one list for the final request of
|
# Also collect all cpos together in one list for the final request of
|
||||||
@ -157,42 +169,34 @@ class CQiWrapper(CQiClient):
|
|||||||
lc = {'lc': lc_cpos}
|
lc = {'lc': lc_cpos}
|
||||||
match_cpos = list(range(start, end))
|
match_cpos = list(range(start, end))
|
||||||
match = {'hit': match_cpos}
|
match = {'hit': match_cpos}
|
||||||
rc_cpos = list(range(end, min([self.corpus_max_len, end + self.context_len])))
|
rc_cpos = list(range(end, min([self.corpus_max_len,
|
||||||
|
end + self.context_len])))
|
||||||
rc = {'rc': rc_cpos}
|
rc = {'rc': rc_cpos}
|
||||||
lc.update(match)
|
lc.update(match)
|
||||||
lc.update(rc)
|
lc.update(rc)
|
||||||
all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
|
all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
|
||||||
all_matches.append(lc)
|
all_matches.append(lc)
|
||||||
# print(all_matches)
|
|
||||||
# print(all_cpos)
|
|
||||||
|
|
||||||
# Get all cpos for all sneteces boundries
|
|
||||||
# s_lookup = {}
|
|
||||||
# for s_id in set(s_ids):
|
|
||||||
# s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id)
|
|
||||||
# # CHANGE to UTOPIEN.s will always be like this in nopaque
|
|
||||||
# s_cpos = range(s_start, s_end)
|
|
||||||
# s_lookup.update({s_id: list(s_cpos)})
|
|
||||||
# # print(list(s_cpos))
|
|
||||||
# all_cpos.extend(s_cpos)
|
|
||||||
t0 = time.time()
|
|
||||||
all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
|
all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
|
||||||
|
len_all_cpos = len(all_cpos)
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
t_total = t1 - t0
|
t_total = t1 - t0
|
||||||
print('TIME FOR ALL CPOS:', t_total)
|
logger.warning('Time to create all CPOS for query: {}'.format(t_total))
|
||||||
print('CPOS SUM:', len(all_cpos))
|
print('Requesting {} CPOS with one query.'.format(len_all_cpos))
|
||||||
|
|
||||||
# Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
|
# Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
|
||||||
# all cpos entries in all_cpos_list
|
# all cpos entries in all_cpos_list
|
||||||
# Also saves these informations into self.results dict
|
# Also saves these informations into self.results dict
|
||||||
t6 = time.time()
|
t2 = time.time()
|
||||||
all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
|
all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
|
||||||
t7 = time.time()
|
t3 = time.time()
|
||||||
t_final = t7 - t6
|
t_final = t3 - t2
|
||||||
print('GOT ALL RESULTS IN:', t_final)
|
print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
|
||||||
|
t_final))
|
||||||
self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos,
|
self.results = {'matches': all_matches,
|
||||||
'text_lookup': text_lookup}
|
'cpos_lookup': all_cpos_infos,
|
||||||
|
'text_lookup': text_lookup,
|
||||||
|
'nr_matches': self.nr_matches}
|
||||||
return self.results
|
return self.results
|
||||||
|
|
||||||
def get_cpos_infos(self, all_cpos):
|
def get_cpos_infos(self, all_cpos):
|
||||||
@ -250,3 +254,44 @@ class CQiWrapper(CQiClient):
|
|||||||
for info in joined_cpos_infos:
|
for info in joined_cpos_infos:
|
||||||
dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
|
dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
|
||||||
return dict_cpos_infos, text_lookup
|
return dict_cpos_infos, text_lookup
|
||||||
|
|
||||||
|
def get_sentences(self,
                  match_cpos_list,
                  get_surrounding_s=False,
                  l_r_s_context_additional_len=1):
    '''
    Get sentence information for one match

    Looks up the sentences (structural attribute CORPUS_NAME.s) that
    contain the first and the last corpus position of the match and
    optionally the sentences surrounding them. Afterwards the cpos
    information for every corpus position in those sentences is requested
    with get_cpos_infos.

    Keyword arguments:
    match_cpos_list -- corpus positions of one match. Only the first and the
    last entry are used, so the list must not be empty.
    get_surrounding_s -- if True, additional sentences to the left and right
    of the match sentences are included (default False)
    l_r_s_context_additional_len -- number of additional sentences requested
    on each side when get_surrounding_s is True (default 1)

    Returns a tuple (context_sentences, all_cpos_infos, text_lookup) where
    context_sentences maps a sentence id to the list of all cpos in that
    sentence and the other two values are the result of get_cpos_infos.
    '''
    t0 = time.time()
    s_attr = self.corpus_name + '.s'
    first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
    s_ids = self.cl_cpos2struc(s_attr, [first_cpos, last_cpos])
    wanted_s_ids = list(s_ids)
    if get_surrounding_s:
        # cl_attribute_size is used as a count elsewhere in this class (as
        # an exclusive upper bound for cpos ranges), so the last valid
        # sentence id is assumed to be size - 1. Using the size itself as an
        # inclusive bound requested one sentence past the end of the corpus.
        # NOTE(review): confirm against the CQi spec for structural attrs.
        last_s_id = self.cl_attribute_size(s_attr) - 1
        lowest_s_id = max(s_ids[0] - l_r_s_context_additional_len, 0)
        highest_s_id = min(s_ids[-1] + l_r_s_context_additional_len,
                           last_s_id)
        wanted_s_ids.extend(range(lowest_s_id, highest_s_id + 1))
    context_sentences = {}
    for s_id in wanted_s_ids:
        # First and last cpos usually share a sentence and the surrounding
        # range covers the match sentences again; ask the server only once.
        if s_id in context_sentences:
            continue
        s_start, s_end = self.cl_struc2cpos(s_attr, s_id)
        # s_end is treated as inclusive, hence the + 1.
        context_sentences[s_id] = list(range(s_start, s_end + 1))
    # Collect every cpos once for a single get_cpos_infos request.
    all_cpos = list({cpos
                     for s_cpos in context_sentences.values()
                     for cpos in s_cpos})
    all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
    t_total = time.time() - t0
    logger.warning('Got all sentences informations in {} seconds'. format(t_total))
    return context_sentences, all_cpos_infos, text_lookup
|
||||||
|
Loading…
Reference in New Issue
Block a user