nopaque/app/corpora/CQiWrapper/CQiWrapper.py
2019-11-11 15:35:37 +01:00

261 lines
12 KiB
Python

from .CQiClient import CQiClient
import multiprocessing
import collections
class CQiWrapper(CQiClient):
    """
    High-level wrapper around CQiClient.

    Groups and renames some functions of CQiClient for ease of use and
    structures received data into Python dictionaries.

    Typical call order, as implied by the attributes each method reads:
    connect() -> set_corpus_name() -> create_attribute_strings() ->
    query_subcorpus() -> show_results() -> disconnect().

    Keyword arguments:
    host -- host handed to CQiClient (default '127.0.0.1')
    port -- port handed to CQiClient (default 4877)
    username -- username used to connect to the CQP server
    password -- password of the user to connect to the CQP server
    """

    # Declared at class level for backward compatibility. __init__ gives
    # every instance its own list so that separate sessions do not append
    # their subcorpus names to one shared list.
    SUBCORPUS_NAMES = []

    # Field selectors handed to cqp_dump_subcorpus. In the CQi protocol
    # these are CQI_CONST_FIELD_MATCH and CQI_CONST_FIELD_MATCHEND.
    _FIELD_MATCH = 0x10
    _FIELD_MATCHEND = 0x11

    def __init__(self, host='127.0.0.1', port=4877, username='opaque',
                 password='opaque'):
        super().__init__(host=host, port=port)
        self.username = username
        self.password = password
        self.SUBCORPUS_NAMES = []

    def connect(self):
        """
        Connect with CQP server

        Connects to the CQP server using the username and password given
        at class initiation.
        """
        self.ctrl_connect(self.username, self.password)

    def create_attribute_strings(self):
        """
        Build the fully qualified attribute names of the current corpus

        Fills self.attr_strings with two dictionaries, 'positional_attrs'
        and 'struct_attrs', each mapping an attribute name to
        '<corpus_name>.<attribute name>'. The first structural attribute is
        remembered in self.meta_struct_element (None if the corpus has no
        structural attributes). Requires set_corpus_name() to have been
        called.
        """
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
        self.meta_struct_element = struct_attrs[0] if struct_attrs else None
        print(p_attrs)
        print(struct_attrs)
        prefix = self.corpus_name + '.'
        # NOTE(review): the last structural attribute is deliberately left
        # out, as in the original code -- presumably because it carries no
        # values that cl_struc2str could return. Confirm against the corpus
        # layout before changing the slice.
        self.attr_strings = {
            'positional_attrs': {name: prefix + name for name in p_attrs},
            'struct_attrs': {name: prefix + name
                             for name in struct_attrs[:-1]},
        }

    def set_corpus_name(self, corpus_name):
        """Remember the name of the corpus all later calls operate on."""
        self.corpus_name = corpus_name

    def disconnect(self):
        """
        Disconnect from CQP server

        Disconnects from the CQP server. Closes the used connection
        afterwards, even if saying goodbye to the server fails.
        """
        try:
            self.ctrl_bye()
        finally:
            self.connection.close()

    def query_subcorpus(self, result_subcorpus_name, query):
        """
        Create subcorpus

        Input query will be used to create a subcorpus holding all cpos
        match positions for that query. The qualified subcorpus name is
        stored in self.result_subcorpus_ns and appended to
        self.SUBCORPUS_NAMES, the number of matches in self.nr_matches.

        Keyword arguments:
        result_subcorpus_name -- user set name of the subcorpus which holds
        all cpos match positions, produced by the query
        query -- query written in cqp query language
        """
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
        self.result_subcorpus_ns = (self.corpus_name
                                    + ':'
                                    + result_subcorpus_name)
        self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns)
        print('Nr of all matches is:', self.nr_matches)

    def show_subcorpora(self):
        """Return what cqp_list_subcorpora reports for the current corpus."""
        return self.cqp_list_subcorpora(self.corpus_name)

    def show_results(self,
                     result_start_count=0,
                     result_max_count=50,
                     context_len=10,):
        """
        Show query results

        Looks up the matches produced by the last query_subcorpus() call.
        Uses the cpos match indexes to grab the matched data and saves it
        into an ordered dictionary keyed by the running number of the match
        within the requested window (starting at 0). Every value is a
        dictionary holding one entry per positional attribute, one set per
        structural attribute and the context of the match, e.g. for a
        corpus with the positional attributes word/lemma and the structural
        attribute entry_title:

        OrderedDict([
            (0,
             {
                'word': ['Big', 'Brother', 'himself'],
                'lemma': ['big', 'brother', 'himself'],
                'entry_title': {'1984'},
                'context_before': ['figures', 'of', 'the', 'Party', ',',
                                   'almost', 'on', 'a', 'level', 'with'],
                'context_after': [',', 'and', 'then', 'had', 'engaged',
                                  'in', 'counter-revolu-', 'tionary',
                                  'activities', ','],
                'cpos_start': 110490,
                'cpos_end': 110492
             }
            )
        ])

        Returns None (after printing a notice) if the query had no matches
        and an empty OrderedDict if the requested window lies behind the
        last match.

        Keyword arguments:
        result_start_count -- number of the first match to show (default 0).
        If it is 0 matches 0 to 49 will be shown. If it is 50 matches 50 to
        99 will be shown.
        result_max_count -- defines how many matches at once will be shown.
        (default 50)
        context_len -- defines how many words before and after a match will
        be shown (default 10)
        """
        self.context_len = context_len
        self.corpus_max_len = self.cl_attribute_size(
            self.attr_strings['positional_attrs']['word'])
        if self.nr_matches == 0:
            print('Query resulted in 0 matches.')
            return None

        # cqp_dump_subcorpus takes an inclusive first..last window of match
        # numbers, so the window has to be derived from the start position
        # and clamped to the number of available matches.
        first = max(0, result_start_count)
        last = min(first + result_max_count, self.nr_matches) - 1
        ordered_results = collections.OrderedDict()
        if last < first:
            return ordered_results

        matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                                self._FIELD_MATCH,
                                                first,
                                                last)
        matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                              self._FIELD_MATCHEND,
                                              first,
                                              last)

        # One worker process per match; each one writes its result into the
        # shared manager dictionary under its running number. The manager
        # is used as a context manager so its server process is shut down
        # once the results have been copied out.
        with multiprocessing.Manager() as manager:
            return_dict = manager.dict()
            workers = []
            for i, index_pair in enumerate(zip(matches_start, matches_end)):
                worker = multiprocessing.Process(target=self.__get_matches,
                                                 args=(i,
                                                       index_pair,
                                                       self.corpus_name,
                                                       return_dict))
                workers.append(worker)
                worker.start()
            for worker in workers:
                worker.join()
            # sort matches into ordered dict
            for key in sorted(return_dict.keys()):
                ordered_results[key] = return_dict[key]
        return ordered_results

    def get_cpos_info(self, cpos, session):
        """
        Collect all attribute values for a range of corpus positions

        Keyword arguments:
        cpos -- pair of start cpos and end cpos, the end being exclusive
        session -- connected client used for the lookups

        Returns a dictionary with one entry per positional attribute (the
        result of cl_cpos2str) and one entry per structural attribute (the
        result of cl_struc2str turned into a set).
        """
        cpos_range = range(cpos[0], cpos[1])
        match_dict = {}
        p_attr_strings = self.attr_strings.get('positional_attrs', {})
        for p_attr_key, p_attr_str in p_attr_strings.items():
            match_dict[p_attr_key] = session.cl_cpos2str(p_attr_str,
                                                         cpos_range)
        struct_attr_strings = self.attr_strings.get('struct_attrs', {})
        if struct_attr_strings:
            # The structure numbers only depend on the cpos range, so they
            # are looked up once instead of once per structural attribute.
            struct_entry = session.cl_cpos2struc(
                struct_attr_strings[self.meta_struct_element], cpos_range)
            for struct_attr_key, struct_attr_str in (
                    struct_attr_strings.items()):
                match_str = session.cl_struc2str(struct_attr_str,
                                                 struct_entry)
                match_dict[struct_attr_key] = set(match_str)
        return match_dict

    def __get_matches(self, i, index_pair, corpus_name, return_dict):
        """
        Get matches as readable output

        Gets the actual match strings of cpos match indexes. Private helper
        method used in show_results. Opens its own connection to the server
        and closes it again when done, also if a lookup fails.

        Keyword arguments:
        i -- serial number for match at given cpos
        index_pair -- match start and match end cpos (both inclusive)
        corpus_name -- name of the parent corpus (currently unused)
        return_dict -- dictionary created with manager.dict() that holds the
        extracted strings tags etc.
        """
        match_start, match_end = index_pair[0], index_pair[1]
        # NOTE(review): self.host and self.port are expected to be set by
        # CQiClient.__init__ -- confirm.
        tmp_session = CQiWrapper(username=self.username,
                                 password=self.password,
                                 host=self.host,
                                 port=self.port)
        tmp_session.connect()
        try:
            # get_cpos_info expects an exclusive end position.
            match = self.get_cpos_info([match_start, match_end + 1],
                                       tmp_session)
            word_attr = self.attr_strings['positional_attrs']['word']
            before_index = max(0, match_start - self.context_len)
            # Exclusive upper bound of the context after the match. Valid
            # corpus positions end at corpus_max_len - 1, so the bound is
            # clamped to corpus_max_len itself.
            after_stop = min(self.corpus_max_len,
                             match_end + 1 + self.context_len)
            context_before = tmp_session.cl_cpos2str(
                word_attr, range(before_index, match_start))
            # Starts with the token directly following the match.
            context_after = tmp_session.cl_cpos2str(
                word_attr, range(match_end + 1, after_stop))
            match.update({'context_before': context_before,
                          'context_after': context_after,
                          'cpos_start': match_start,
                          'cpos_end': match_end})
            return_dict[i] = match
        finally:
            tmp_session.disconnect()