mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-01-27 01:50:34 +00:00
261 lines
12 KiB
Python
261 lines
12 KiB
Python
from .CQiClient import CQiClient
|
|
import multiprocessing
|
|
import collections
|
|
|
|
|
|
class CQiWrapper(CQiClient):
    """
    CQiWrapper object

    High level wrapper that groups and renames some functions of CQiClient
    for ease of use. Also structures received data into python dictionaries.

    Typical call order: connect(), set_corpus_name(),
    create_attribute_strings(), query_subcorpus(), show_results(),
    disconnect(). Several methods rely on attributes set by earlier calls.

    Keyword arguments:
    host -- address of the cqp server (default '127.0.0.1')
    port -- port of the cqp server (default 4877)
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    """

    # Kept so that code reading the class attribute keeps working. Every
    # instance gets its own list in __init__, so subcorpus names recorded by
    # one session no longer leak into all other sessions.
    SUBCORPUS_NAMES = []

    def __init__(self, host='127.0.0.1', port=4877, username='opaque',
                 password='opaque'):
        super(CQiWrapper, self).__init__(host=host, port=port)
        self.username = username
        self.password = password
        self.SUBCORPUS_NAMES = []

    def connect(self):
        """
        Connect with CQP server

        Connects via socket to the CQP server using the given username and
        password from class initiation.
        """
        self.ctrl_connect(self.username, self.password)

    def create_attribute_strings(self):
        """
        Build the fully qualified attribute names of the current corpus

        Asks the server for the positional and structural attributes of
        self.corpus_name (see set_corpus_name) and stores them as
        '<corpus_name>.<attribute>' strings in self.attr_strings:
            {'positional_attrs': {attribute: qualified_name, ...},
             'struct_attrs': {attribute: qualified_name, ...}}
        The first structural attribute is remembered as
        self.meta_struct_element (None if the corpus has none).
        """
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
        # A corpus without structural attributes must not raise IndexError.
        self.meta_struct_element = struct_attrs[0] if struct_attrs else None
        print(p_attrs)
        print(struct_attrs)
        prefix = self.corpus_name + '.'
        # NOTE(review): the last structural attribute is deliberately left
        # out, exactly as before; the reason is not visible in this file --
        # confirm that this is intended.
        self.attr_strings = {
            'positional_attrs': {attr: prefix + attr for attr in p_attrs},
            'struct_attrs': {attr: prefix + attr
                             for attr in struct_attrs[:-1]},
        }

    def set_corpus_name(self, corpus_name):
        """
        Set the name of the corpus all following calls operate on

        Keyword arguments:
        corpus_name -- name of the corpus on the cqp server
        """
        self.corpus_name = corpus_name

    def disconnect(self):
        """
        Disconnect from CQP server

        Disconnects from the CQP server. Closes used socket after disconnect,
        even if saying goodbye to the server fails.
        """
        try:
            self.ctrl_bye()
        finally:
            self.connection.close()

    def query_subcorpus(self, result_subcorpus_name, query):
        """
        Create subcorpus

        Input query will be used to create a subcorpus holding all cpos match
        positions for that query. Stores the qualified subcorpus name in
        self.result_subcorpus_ns and the number of matches in
        self.nr_matches.

        Keyword arguments:
        result_subcorpus_name -- user set name of the subcorpus which holds all
        cpos match positions, produced by the query
        query -- query written in cqp query language
        """
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
        self.result_subcorpus_ns = (self.corpus_name
                                    + ':'
                                    + result_subcorpus_name)
        self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns)
        print('Nr of all matches is:', self.nr_matches)

    def show_subcorpora(self):
        """
        Return what the server lists as subcorpora of the current corpus
        """
        return self.cqp_list_subcorpora(self.corpus_name)

    def show_results(self,
                     result_start_count=0,
                     result_max_count=50,
                     context_len=10,):
        """
        Show query results

        Shows the actual matched strings produced by the query. Uses the cpos
        match indexes to grab those strings and saves them into an ordered
        dictionary, keyed by the position of the match inside the requested
        window. Every entry holds one key per positional attribute and one
        key per structural attribute stored by create_attribute_strings (the
        names depend on the corpus), plus the context and the cpos range:
        OrderedDict([
            (0,
             {
              'word': ['Big', 'Brother', 'himself'],
              'lemma': ['big', 'brother', 'himself'],
              'pos': ['JJ', 'NN1', 'PPX1'],
              'entry_title': {'1984'},
              'entry_author': {'george_orwell'},
              'context_before': ['figures', 'of', 'the', 'Party', ',',
                                 'almost', 'on', 'a', 'level', 'with'],
              'context_after': [',', 'and', 'then', 'had', 'engaged',
                                'in', 'counter-revolu-', 'tionary',
                                'activities', ','],
              'cpos_start': 110490,
              'cpos_end': 110492
             }
            )
        ])

        Returns None (after printing a message) if the query had no matches
        and an empty OrderedDict if the requested window lies behind the last
        match.

        Requires query_subcorpus and create_attribute_strings to have been
        called before. One process with its own server connection is started
        per shown match.

        Keyword arguments:
        result_start_count -- start position of the dumped subcorpus.
        (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
        matches 50 to 100 will be shown.
        result_max_count -- defines how many matches at once will be shown.
        (default 50)
        context_len -- defines how many words before and after a match will be
        shown (default 10)
        """
        self.context_len = context_len
        self.corpus_max_len = self.cl_attribute_size(
            self.attr_strings['positional_attrs']['word'])
        if self.nr_matches == 0:
            print('Query resulted in 0 matches.')
            return None

        # The dump call takes the first and the last match number
        # (inclusive), so the window is start .. start + count - 1, clamped
        # to the matches that actually exist.
        first = max(result_start_count, 0)
        last = min(first + result_max_count, self.nr_matches) - 1
        ordered_results = collections.OrderedDict()
        if last < first:
            return ordered_results

        # 0x10 and 0x11 select the match start and match end field
        # (CQI_CONST_FIELD_MATCH / CQI_CONST_FIELD_MATCHEND in the CQi spec).
        matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                                0x10, first, last)
        matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                              0x11, first, last)

        workers = []
        # The manager runs a helper process; the with block shuts it down
        # again once the results have been copied out of the shared dict.
        with multiprocessing.Manager() as manager:
            return_dict = manager.dict()
            for i, index_pair in enumerate(zip(matches_start, matches_end)):
                worker = multiprocessing.Process(target=self.__get_matches,
                                                 args=(i,
                                                       index_pair,
                                                       self.corpus_name,
                                                       return_dict))
                workers.append(worker)
                worker.start()
            for worker in workers:
                worker.join()
            # sort matches into ordered dict
            for key in sorted(return_dict.keys()):
                ordered_results[key] = return_dict[key]
        return ordered_results

    def get_cpos_info(self, cpos, session):
        """
        Collect all attribute values for a range of corpus positions

        Returns a dictionary with one entry per positional attribute (the
        value returned by cl_cpos2str for the range) and one entry per
        structural attribute (the set of values returned by cl_struc2str).

        Keyword arguments:
        cpos -- pair of start cpos (inclusive) and end cpos (exclusive)
        session -- connected client used to talk to the cqp server
        """
        cpos_range = range(cpos[0], cpos[1])
        match_dict = {}
        positional = self.attr_strings['positional_attrs']
        for p_attr_key, attr_string in positional.items():
            match_dict[p_attr_key] = session.cl_cpos2str(attr_string,
                                                         cpos_range)
        structural = self.attr_strings['struct_attrs']
        if structural:
            # The structure ids do not depend on the attribute that is looked
            # up, so they are requested once instead of once per attribute.
            # NOTE(review): assumes cl_cpos2struc returns a reusable sequence
            # (not a one-shot iterator) -- confirm against CQiClient.
            struct_entry = session.cl_cpos2struc(
                structural[self.meta_struct_element], cpos_range)
            for struct_attr_key, attr_string in structural.items():
                match_str = session.cl_struc2str(attr_string, struct_entry)
                match_dict[struct_attr_key] = set(match_str)
        return match_dict

    def __get_matches(self, i, index_pair, corpus_name, return_dict):
        """
        Get matches as readable output

        Gets the actual match strings of cpos match indexes. Private helper
        method used in show_results. Opens its own connection to the cqp
        server and closes it again, also when a request fails.

        Keyword arguments:
        i -- serial number for match at given cpos
        index_pair -- match start and match end cpos (both inclusive)
        corpus_name -- name of the parent corpus (currently unused)
        return_dict -- dictionary created with manager.dict() that holds the
        extracted strings tags etc.
        """
        match_start, match_end = index_pair[0], index_pair[1]
        # range() needs an exclusive end, the server reports an inclusive one
        match_stop = match_end + 1
        word_attr = self.attr_strings['positional_attrs']['word']

        tmp_session = CQiWrapper(username=self.username,
                                 password=self.password,
                                 host=self.host, port=self.port)
        tmp_session.connect()
        try:
            match = self.get_cpos_info([match_start, match_stop],
                                       tmp_session)
            before_index = max(0, match_start - self.context_len)
            # corpus_max_len is the number of tokens, so it is the first
            # invalid cpos and serves as exclusive upper bound.
            after_stop = min(self.corpus_max_len,
                             match_stop + self.context_len)
            context_before = tmp_session.cl_cpos2str(
                word_attr, range(before_index, match_start))
            # Starts directly behind the match; the first following token
            # must not be skipped.
            context_after = tmp_session.cl_cpos2str(
                word_attr, range(match_stop, after_stop))
        finally:
            tmp_session.disconnect()

        match.update({'context_before': context_before,
                      'context_after': context_after,
                      'cpos_start': match_start,
                      'cpos_end': match_end})
        return_dict[i] = match
|