mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-12-24 10:34:17 +00:00
Add new CQiWrapper
This commit is contained in:
parent
baf06d3106
commit
5fdd67ebf2
@ -1,4 +1,4 @@
|
|||||||
from . import CQi
|
import CQi
|
||||||
import socket
|
import socket
|
||||||
import struct
|
import struct
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from .CQiClient import CQiClient
|
from CQiClient import CQiClient
|
||||||
import multiprocessing
|
from CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
|
||||||
import collections
|
import collections
|
||||||
|
from app import logger # only works if imported into the nopaque web app
|
||||||
|
|
||||||
|
|
||||||
class CQiWrapper(CQiClient):
|
class CQiWrapper(CQiClient):
|
||||||
@ -11,6 +12,8 @@ class CQiWrapper(CQiClient):
|
|||||||
for ease of use. Also structures received data into python dictionaries.
|
for ease of use. Also structures received data into python dictionaries.
|
||||||
|
|
||||||
Keyword arguments:
|
Keyword arguments:
|
||||||
|
host -- host IP address or hostname where the cqp server is running
|
||||||
|
port -- port of the cqp server
|
||||||
username -- username used to connect to the cqp server
|
username -- username used to connect to the cqp server
|
||||||
password -- password of the user to connect to the cqp server
|
password -- password of the user to connect to the cqp server
|
||||||
"""
|
"""
|
||||||
@ -32,12 +35,15 @@ class CQiWrapper(CQiClient):
|
|||||||
"""
|
"""
|
||||||
self.ctrl_connect(self.username, self.password)
|
self.ctrl_connect(self.username, self.password)
|
||||||
|
|
||||||
def create_attribute_strings(self):
|
def __create_attribute_strings(self):
|
||||||
|
"""
|
||||||
|
Creates all needed attribute strings to query for word, lemma etc. in
|
||||||
|
the given corpus.
|
||||||
|
For example: CORPUS_NAME.word to query words
|
||||||
|
"""
|
||||||
p_attrs = self.corpus_positional_attributes(self.corpus_name)
|
p_attrs = self.corpus_positional_attributes(self.corpus_name)
|
||||||
struct_attrs = self.corpus_structural_attributes(self.corpus_name)
|
struct_attrs = self.corpus_structural_attributes(self.corpus_name)
|
||||||
self.meta_struct_element = struct_attrs[0]
|
self.meta_struct_element = struct_attrs[0]
|
||||||
print(p_attrs)
|
|
||||||
print(struct_attrs)
|
|
||||||
self.attr_strings = {}
|
self.attr_strings = {}
|
||||||
self.attr_strings['positional_attrs'] = {}
|
self.attr_strings['positional_attrs'] = {}
|
||||||
self.attr_strings['struct_attrs'] = {}
|
self.attr_strings['struct_attrs'] = {}
|
||||||
@ -49,8 +55,17 @@ class CQiWrapper(CQiClient):
|
|||||||
self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
|
self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
|
||||||
+ '.'
|
+ '.'
|
||||||
+ struct_attr)
|
+ struct_attr)
|
||||||
def set_corpus_name(self, corpus_name):
|
logger.warning(('All positional and '
|
||||||
self.corpus_name = corpus_name
|
'structural attributes: {}').format(self.attr_strings))
|
||||||
|
|
||||||
|
def select_corpus(self, corpus_name):
|
||||||
|
if corpus_name in self.corpus_list_coprora():
|
||||||
|
self.corpus_name = corpus_name
|
||||||
|
self.__create_attribute_strings()
|
||||||
|
logger.warning('{} does exist.'.format(corpus_name))
|
||||||
|
else:
|
||||||
|
self.disconnect()
|
||||||
|
logger.warning('{} does not exist.'.format(corpus_name))
|
||||||
|
|
||||||
def disconnect(self):
|
def disconnect(self):
|
||||||
"""
|
"""
|
||||||
@ -60,8 +75,9 @@ class CQiWrapper(CQiClient):
|
|||||||
"""
|
"""
|
||||||
self.ctrl_bye()
|
self.ctrl_bye()
|
||||||
self.connection.close()
|
self.connection.close()
|
||||||
|
logger.warning('Disconnected from cqp server.')
|
||||||
|
|
||||||
def query_subcorpus(self, result_subcorpus_name, query):
|
def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
|
||||||
"""
|
"""
|
||||||
Create subcorpus
|
Create subcorpus
|
||||||
|
|
||||||
@ -74,152 +90,155 @@ class CQiWrapper(CQiClient):
|
|||||||
query -- query written in cqp query language
|
query -- query written in cqp query language
|
||||||
"""
|
"""
|
||||||
self.cqp_query(self.corpus_name, result_subcorpus_name, query)
|
self.cqp_query(self.corpus_name, result_subcorpus_name, query)
|
||||||
self.result_subcorpus_ns = (self.corpus_name
|
self.result_subcorpus = (self.corpus_name
|
||||||
+ ':'
|
+ ':'
|
||||||
+ result_subcorpus_name)
|
+ result_subcorpus_name)
|
||||||
self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
|
self.SUBCORPUS_NAMES.append(self.result_subcorpus)
|
||||||
self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns)
|
self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
|
||||||
print('Nr of all matches is:', self.nr_matches)
|
logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
|
||||||
|
|
||||||
def show_subcorpora(self):
|
def show_subcorpora(self):
|
||||||
|
"""
|
||||||
|
Show all subcorpora currently saved by the cqp server.
|
||||||
|
"""
|
||||||
return self.cqp_list_subcorpora(self.corpus_name)
|
return self.cqp_list_subcorpora(self.corpus_name)
|
||||||
|
|
||||||
def show_results(self,
|
def show_query_results(self,
|
||||||
result_start_count=0,
|
context_len=10,
|
||||||
result_max_count=50,
|
result_len=1000):
|
||||||
context_len=10,):
|
|
||||||
"""
|
"""
|
||||||
Show query results
|
Show query results
|
||||||
|
|
||||||
Shows the actual matched strings produced by the query. Uses the cpos
|
Shows the actual matched strings produced by the query. Uses the cpos
|
||||||
match indexes to grab those strings. Saves them into an ordered
|
match indexes to grab those strings. Saves them into an ordered
|
||||||
dictionary. Also saves corresponding tags, lemmas and context:
|
dictionary. Also saves corresponding tags, lemmas and context. Gets this
|
||||||
OrderedDict([
|
information using the corresponding cpos.
|
||||||
(0,
|
|
||||||
{
|
|
||||||
'tokens': ['Big', 'Brother', 'himself'],
|
|
||||||
'lemmas': ['big', 'brother', 'himself'],
|
|
||||||
'pos_tags': ['JJ', 'NN1', 'PPX1'],
|
|
||||||
'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|',
|
|
||||||
'|Z8m|'],
|
|
||||||
'context_before': ['figures', 'of', 'the', 'Party', ',',
|
|
||||||
'almost', 'on', 'a', 'level', 'with'],
|
|
||||||
'context_after': [',', 'and', 'then', 'had', 'engaged',
|
|
||||||
'in', 'counter-revolu-', 'tionary',
|
|
||||||
'activities', ','],
|
|
||||||
'entry_title': '1984', 'entry_author':
|
|
||||||
'george_orwell',
|
|
||||||
'cpos_start': 110490,
|
|
||||||
'cpos_end': 110492
|
|
||||||
}
|
|
||||||
)
|
|
||||||
])
|
|
||||||
|
|
||||||
Keyword arguments:
|
Keyword arguments:
|
||||||
result_start_count -- start position of the dumped subcorpus.
|
|
||||||
(default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
|
|
||||||
matches 50 to 100 will be shown.
|
|
||||||
result_max_count -- defines how many matches at once will be shown.
|
|
||||||
(default 50)
|
|
||||||
context_len -- defines how many words before and after a match will be
|
context_len -- defines how many words before and after a match will be
|
||||||
shown (default 10)
|
shown (default 10)
|
||||||
|
result_len -- defines how many results are actually grabbed
|
||||||
"""
|
"""
|
||||||
self.context_len = context_len
|
self.context_len = context_len
|
||||||
self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word'])
|
self.corpus_max_len = self.cl_attribute_size(
|
||||||
|
self.attr_strings['positional_attrs']['word']
|
||||||
|
)
|
||||||
|
self.nr_matches = min(result_len, self.nr_matches)
|
||||||
if self.nr_matches == 0:
|
if self.nr_matches == 0:
|
||||||
print('Query resulted in 0 matches.')
|
logger.warning('Query resulted in 0 matches.')
|
||||||
|
self.disconnect
|
||||||
|
return None
|
||||||
else:
|
else:
|
||||||
if self.nr_matches <= 50:
|
# Get match cpos boundaries
|
||||||
matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
|
# match_boundaries shows the start and end cpos of one match as a
|
||||||
0x10,
|
# pair of cpositions
|
||||||
0,
|
# [(1355, 1357), (1477, 1479)] Example for two boundary pairs
|
||||||
self.nr_matches - 1)
|
match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus,
|
||||||
matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
|
CONST_FIELD_MATCH,
|
||||||
0x11,
|
0,
|
||||||
0, self.nr_matches - 1)
|
self.nr_matches - 1),
|
||||||
else:
|
self.cqp_dump_subcorpus(self.result_subcorpus,
|
||||||
matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
|
CONST_FIELD_MATCHEND,
|
||||||
0x10,
|
0,
|
||||||
result_start_count,
|
self.nr_matches - 1))
|
||||||
result_max_count - 1)
|
|
||||||
matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
|
|
||||||
0x11,
|
|
||||||
result_start_count,
|
|
||||||
result_max_count - 1)
|
|
||||||
match_indexes = zip(matches_start, matches_end)
|
|
||||||
|
|
||||||
matches = []
|
# Generate all cpos between boundaries including start and end boundaries
|
||||||
manager = multiprocessing.Manager()
|
# Save them as a list into one match entry at serial number 'i'
|
||||||
return_dict = manager.dict()
|
ordered_matches = collections.OrderedDict()
|
||||||
for i, index_pair in enumerate(match_indexes):
|
for i, match_pair in enumerate(match_boundaries):
|
||||||
match = multiprocessing.Process(target=self.__get_matches,
|
ordered_matches[i] = ({'match_cpos_list':
|
||||||
args=(i,
|
list(range(match_pair[0],
|
||||||
index_pair,
|
match_pair[1] + 1))})
|
||||||
self.corpus_name,
|
# Saves cpos from all match entries into one list
|
||||||
return_dict))
|
all_cpos_list = []
|
||||||
matches.append(match)
|
for key in ordered_matches.keys():
|
||||||
match.start()
|
all_cpos_list += ordered_matches[key]['match_cpos_list']
|
||||||
for match in matches:
|
|
||||||
match.join()
|
|
||||||
# sort matches into ordered dict
|
|
||||||
ordered_results = collections.OrderedDict()
|
|
||||||
for key in sorted(return_dict.keys()):
|
|
||||||
ordered_results[key] = return_dict[key]
|
|
||||||
return ordered_results
|
|
||||||
|
|
||||||
def get_cpos_info(self, cpos, session):
|
# Saves all cpos from before and after context into the list:
|
||||||
match_dict = {}
|
# all_context_cpos_list
|
||||||
|
all_context_cpos_list = []
|
||||||
|
for key in ordered_matches.keys():
|
||||||
|
cpos_list = ordered_matches[key]['match_cpos_list']
|
||||||
|
before_index = max([0, cpos_list[0] - self.context_len])
|
||||||
|
after_index = min([self.corpus_max_len,
|
||||||
|
cpos_list[-1] + self.context_len])
|
||||||
|
ordered_matches[key]['context_before_cpos_list'] = list(range(before_index,
|
||||||
|
cpos_list[0]))
|
||||||
|
ordered_matches[key]['context_after_cpos_list'] = list(range(cpos_list[-1] + 1,
|
||||||
|
after_index + 1))
|
||||||
|
all_context_cpos_list += ordered_matches[key]['context_before_cpos_list']
|
||||||
|
all_context_cpos_list += ordered_matches[key]['context_after_cpos_list']
|
||||||
|
# Combines all_cpos_list with all_context_cpos_list as a sorted set
|
||||||
|
all_cpos_list += all_context_cpos_list
|
||||||
|
all_cpos_list = sorted(list(set(all_cpos_list)))
|
||||||
|
|
||||||
|
# Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
|
||||||
|
# all cpos entries in all_cpos_list
|
||||||
|
# Also saves this information into the ordered_matches dict
|
||||||
|
all_cpos_infos = self.get_cpos_infos(all_cpos_list)
|
||||||
|
for key in ordered_matches.keys():
|
||||||
|
# loops over cpos in cpos_list which holds all match cpos
|
||||||
|
# Replaces one cpos with the corresponding cpos information created
|
||||||
|
# by self.get_cpos_infos(all_cpos_list)
|
||||||
|
cpos_list = ordered_matches[key]['match_cpos_list']
|
||||||
|
infos = []
|
||||||
|
for cpos in cpos_list:
|
||||||
|
info = {cpos: all_cpos_infos.get(cpos)}
|
||||||
|
infos.append(info)
|
||||||
|
ordered_matches[key]['match_cpos_list'] = infos
|
||||||
|
try:
|
||||||
|
# loops over cpos in ordered_matches[key]['context_before_cpos_list']
|
||||||
|
# which holds all cpos of the before context
|
||||||
|
# Replaces one cpos with the corresponding cpos information created
|
||||||
|
# by self.get_cpos_infos(all_cpos_list)
|
||||||
|
before_context_infos = []
|
||||||
|
for context_before_cpos in ordered_matches[key]['context_before_cpos_list']:
|
||||||
|
before_context_info = {context_before_cpos:
|
||||||
|
all_cpos_infos.get(context_before_cpos)}
|
||||||
|
before_context_infos.append(before_context_info)
|
||||||
|
ordered_matches[key]['context_before_cpos_list'] = before_context_infos
|
||||||
|
except UnboundLocalError:
|
||||||
|
logger.warning('Context before cpos list is empty.')
|
||||||
|
try:
|
||||||
|
# loops over cpos in ordered_matches[key]['context_after_cpos_list']
|
||||||
|
# which holds all cpos of the after context
|
||||||
|
# Replaces one cpos with the corresponding cpos information created
|
||||||
|
# by self.get_cpos_infos(all_cpos_list)
|
||||||
|
after_context_infos = []
|
||||||
|
for context_after_cpos in ordered_matches[key]['context_after_cpos_list']:
|
||||||
|
after_context_info = {context_after_cpos:
|
||||||
|
all_cpos_infos.get(context_after_cpos)}
|
||||||
|
after_context_infos.append(after_context_info)
|
||||||
|
ordered_matches[key]['context_after_cpos_list'] = after_context_infos
|
||||||
|
except UnboundLocalError:
|
||||||
|
logger.warning('Context after cpos list is empty.')
|
||||||
|
return ordered_matches
|
||||||
|
|
||||||
|
def get_cpos_infos(self, all_cpos):
|
||||||
|
'''
|
||||||
|
Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
|
||||||
|
all cpos entries specified in the parameter all_cpos.
|
||||||
|
'''
|
||||||
|
cpos_infos = {}
|
||||||
for attr_dict in self.attr_strings:
|
for attr_dict in self.attr_strings:
|
||||||
# print(self.attr_strings[attr_dict])
|
|
||||||
if attr_dict == 'positional_attrs':
|
if attr_dict == 'positional_attrs':
|
||||||
for p_attr_key in self.attr_strings[attr_dict].keys():
|
for p_attr_key in self.attr_strings[attr_dict].keys():
|
||||||
# print(p_attr_key)
|
match_str = self.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key],
|
||||||
match_str = session.cl_cpos2str(self.attr_strings[attr_dict][p_attr_key], range(cpos[0], cpos[1]))
|
all_cpos)
|
||||||
match_dict[p_attr_key] = match_str
|
cpos_infos[p_attr_key] = match_str
|
||||||
elif attr_dict == 'struct_attrs':
|
elif attr_dict == 'struct_attrs':
|
||||||
for struct_attr_key in self.attr_strings[attr_dict].keys():
|
for struct_attr_key in self.attr_strings[attr_dict].keys():
|
||||||
# print(struct_attr_key)
|
struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element],
|
||||||
struct_entry = session.cl_cpos2struc(self.attr_strings['struct_attrs'][self.meta_struct_element],
|
all_cpos)
|
||||||
range(cpos[0], cpos[1]))
|
match_str = self.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry)
|
||||||
match_str = session.cl_struc2str(self.attr_strings[attr_dict][struct_attr_key], struct_entry)
|
cpos_infos[struct_attr_key] = match_str
|
||||||
match_dict[struct_attr_key] = set(match_str)
|
tmp_list = []
|
||||||
return match_dict
|
attr_key_list = []
|
||||||
|
for key in cpos_infos.keys():
|
||||||
def __get_matches(self, i, index_pair, corpus_name, return_dict):
|
tmp_list.append(cpos_infos[key])
|
||||||
"""
|
attr_key_list.append(key)
|
||||||
Get matches as readable output
|
joined_cpos_infos = zip(all_cpos, *tmp_list)
|
||||||
|
dict_cpos_infos = {}
|
||||||
Gets the actual match strings of cpos match indexes. Private helper
|
for info in joined_cpos_infos:
|
||||||
method used in show_results.
|
dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:]))
|
||||||
|
return dict_cpos_infos
|
||||||
Keyword arguments:
|
|
||||||
i -- serial number for match at given cpos
|
|
||||||
index_pair -- match start and match end cpos
|
|
||||||
corpus_name -- name of the parent corpus
|
|
||||||
return_dict -- dictionary created with manager.dict() that holds the
|
|
||||||
extracted strings tags etc.
|
|
||||||
"""
|
|
||||||
# print('START:', index_pair[0])
|
|
||||||
# print('END:', index_pair[1])
|
|
||||||
# print('=============================')
|
|
||||||
index_pair = [index_pair[0], index_pair[1] + 1]
|
|
||||||
tmp_session = CQiWrapper(username=self.username, password=self.password,
|
|
||||||
host=self.host, port=self.port)
|
|
||||||
tmp_session.connect()
|
|
||||||
match = self.get_cpos_info(index_pair, tmp_session)
|
|
||||||
before_index = max([0, index_pair[0] - self.context_len])
|
|
||||||
after_index = min([self.corpus_max_len,
|
|
||||||
index_pair[1] + self.context_len])
|
|
||||||
context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
|
|
||||||
range(before_index,
|
|
||||||
index_pair[0]))
|
|
||||||
context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
|
|
||||||
range(index_pair[1] + 1,
|
|
||||||
after_index + 1))
|
|
||||||
tmp_dict = {'context_before': context_before,
|
|
||||||
'context_after': context_after,
|
|
||||||
'cpos_start': index_pair[0],
|
|
||||||
'cpos_end': index_pair[1]}
|
|
||||||
match.update(tmp_dict)
|
|
||||||
return_dict[i] = match
|
|
||||||
tmp_session.disconnect()
|
|
||||||
|
Loading…
Reference in New Issue
Block a user