# mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
# synced 2025-01-13 11:40:35 +00:00
# 241 lines, 10 KiB, Python
from .CQiClient import CQiClient
|
|
import multiprocessing
|
|
import collections
|
|
import socket
|
|
|
|
|
|
class CQiWrapper(CQiClient):
    """
    CQiWrapper object

    High level wrapper that groups and renames some functions of CQiClient
    for ease of use. Also structures received data into python dictionaries.

    Keyword arguments:
    host -- forwarded unchanged to CQiClient.__init__ (default '127.0.0.1')
    port -- forwarded unchanged to CQiClient.__init__ (default 4877)
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    """

    # Kept so that code reading CQiWrapper.SUBCORPUS_NAMES keeps working.
    # Every instance shadows it with its own list in __init__, so sessions
    # no longer append into one list shared by the whole class.
    SUBCORPUS_NAMES = []

    def __init__(self, host='127.0.0.1', port=4877, username='opaque',
                 password='opaque'):
        super(CQiWrapper, self).__init__(host=host, port=port)
        self.username = username
        self.password = password
        # Per-instance list of '<corpus>:<subcorpus>' names created by
        # query_subcorpus.
        self.SUBCORPUS_NAMES = []

    def connect(self):
        """
        Connect with CQP server

        Connects via socket to the CQP server using the given username and
        password from class initiation.
        """
        self.ctrl_connect(self.username, self.password)

    def create_attribute_strings(self, corpus_name):
        """
        Build the attribute names for a corpus

        Stores the names '<corpus_name>.word', '.lemma', '.pos', '.sem',
        '.entry', '.entry_author' and '.entry_title' on the instance and
        collects them, in that order, in self.attributes.

        Keyword arguments:
        corpus_name -- name of the corpus the attribute names are built for
        """
        self.word_str = corpus_name + '.word'
        self.lemma_str = corpus_name + '.lemma'
        self.pos_str = corpus_name + '.pos'
        self.sem_str = corpus_name + '.sem'
        self.entry_str = corpus_name + '.entry'
        self.entry_author_str = self.entry_str + '_author'
        self.entry_title_str = self.entry_str + '_title'
        self.attributes = [self.word_str,
                           self.lemma_str,
                           self.pos_str,
                           self.sem_str,
                           self.entry_str,
                           self.entry_author_str,
                           self.entry_title_str]

    def disconnect(self):
        """
        Disconnect from CQP server

        Disconnects from the CQP server. Closes used socket after disconnect.
        """
        try:
            self.ctrl_bye()
        finally:
            # Close the socket even if the goodbye message fails.
            self.connection.close()

    def query_subcorpus(self, corpus_name, result_subcorpus_name, query):
        """
        Create subcorpus

        Input query will be used to create a subcorpus holding all cpos match
        positions for that query. The qualified subcorpus name is stored in
        self.result_subcorpus_ns and the number of matches in
        self.nr_matches.

        Keyword arguments:
        corpus_name -- name of the corpus the query will be used on
        result_subcorpus_name -- user set name of the subcorpus which holds all
        cpos match positions, produced by the query
        query -- query written in cqp query language
        """
        self.cqp_query(corpus_name, result_subcorpus_name, query)
        self.result_subcorpus_ns = corpus_name + ':' + result_subcorpus_name
        self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns)
        print('Nr of all matches is:', self.nr_matches)

    def show_subcorpora(self):
        """
        Show known subcorpora

        Prints and returns the qualified names of all subcorpora created by
        this instance through query_subcorpus.
        """
        print('Known subcorpora:', self.SUBCORPUS_NAMES)
        return self.SUBCORPUS_NAMES

    def show_results(self,
                     corpus_name,
                     result_start_count=0,
                     result_max_count=50,
                     context_len=10,):
        """
        Show query results

        Shows the actual matched strings produced by the query. Uses the cpos
        match indexes to grab those strings. Saves them into an ordered
        dictionary. Also saves corresponding tags, lemmas and context:
        OrderedDict([
            (0,
             {
              'tokens': ['Big', 'Brother', 'himself'],
              'lemmas': ['big', 'brother', 'himself'],
              'pos_tags': ['JJ', 'NN1', 'PPX1'],
              'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|',
                           '|Z8m|'],
              'context_before': ['figures', 'of', 'the', 'Party', ',',
                                 'almost', 'on', 'a', 'level', 'with'],
              'context_after': [',', 'and', 'then', 'had', 'engaged',
                                'in', 'counter-revolu-', 'tionary',
                                'activities', ','],
              'entry_title': '1984', 'entry_author':
              'george_orwell',
              'cpos_start': 110490,
              'cpos_end': 110492
             }
            )
        ])

        The dictionary is printed and also returned. Its keys count the
        matches of the requested page starting at 0. query_subcorpus has to
        be called first because this method reads self.nr_matches and
        self.result_subcorpus_ns.

        Keyword arguments:
        corpus_name -- name of the parent corpus the subcorpus is part of
        result_start_count -- start position of the dumped subcorpus.
        (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
        matches 50 to 100 will be shown.
        result_max_count -- defines how many matches at once will be shown.
        (default 50)
        context_len -- defines how many words before and after a match will be
        shown (default 10)
        """
        self.context_len = context_len
        # __get_matches reads the attribute names from the instance, so make
        # sure they exist and belong to the corpus that is shown.
        self.create_attribute_strings(corpus_name)
        self.corpus_max_len = self.cl_attribute_size(self.word_str)

        ordered_results = collections.OrderedDict()
        if self.nr_matches == 0:
            print('Query resulted in 0 matches.')
            return ordered_results

        # cqp_dump_subcorpus takes an inclusive first/last match number, so
        # the page ends at start + count - 1, capped at the last match.
        first = max(0, result_start_count)
        last = min(first + result_max_count, self.nr_matches) - 1
        if last < first:
            print('No matches in the requested range.')
            return ordered_results

        # 0x10 dumps the match start positions, 0x11 the match end positions.
        matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                                0x10,
                                                first,
                                                last)
        matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
                                              0x11,
                                              first,
                                              last)
        match_indexes = zip(matches_start, matches_end)

        # One worker process per match; each worker stores its result in the
        # shared manager dict under its serial number.
        with multiprocessing.Manager() as manager:
            return_dict = manager.dict()
            workers = []
            for i, index_pair in enumerate(match_indexes):
                worker = multiprocessing.Process(target=self.__get_matches,
                                                 args=(i,
                                                       index_pair,
                                                       corpus_name,
                                                       return_dict))
                workers.append(worker)
                worker.start()
            for worker in workers:
                worker.join()
            # Sort matches into the ordered dict before the manager (and
            # with it the shared dict) is shut down.
            for key in sorted(return_dict.keys()):
                ordered_results[key] = return_dict[key]
        print('ORDERED_RESULTS', ordered_results)
        return ordered_results

    def __get_matches(self, i, index_pair, corpus_name, return_dict):
        """
        Get matches as readable output

        Gets the actual match strings of cpos match indexes. Private helper
        method used in show_results. Opens its own temporary connection to
        the server and closes it again, even if a lookup fails.

        Keyword arguments:
        i -- serial number for match at given cpos
        index_pair -- match start and match end cpos
        corpus_name -- name of the parent corpus
        return_dict -- dictionary created with manager.dict() that holds the
        extracted strings tags etc.
        """
        match_start, match_end = index_pair[0], index_pair[1]
        print('START:', match_start)
        print('END:', match_end)
        print('=============================')
        tmp_session = CQiWrapper(username=self.username,
                                 password=self.password,
                                 host=self.host,
                                 port=self.port)
        tmp_session.connect()
        try:
            # The match end position is inclusive.
            match_range = range(match_start, match_end + 1)
            tokens = tmp_session.cl_cpos2str(self.word_str, match_range)
            lemmas = tmp_session.cl_cpos2str(self.lemma_str, match_range)
            pos_tags = tmp_session.cl_cpos2str(self.pos_str, match_range)
            sem_tags = tmp_session.cl_cpos2str(self.sem_str, match_range)
            struc_entry = tmp_session.cl_cpos2struc(self.entry_str,
                                                    match_range)
            before_index = max(0, match_start - self.context_len)
            # corpus_max_len is the attribute size, so the last valid cpos
            # is corpus_max_len - 1.
            after_index = min(self.corpus_max_len - 1,
                              match_end + self.context_len)
            context_before = tmp_session.cl_cpos2str(
                self.word_str, range(before_index, match_start))
            context_after = tmp_session.cl_cpos2str(
                self.word_str, range(match_end + 1, after_index + 1))
            entry_titles = tmp_session.cl_struc2str(self.entry_title_str,
                                                    struc_entry)
            entry_authors = tmp_session.cl_struc2str(self.entry_author_str,
                                                     struc_entry)
            return_dict[i] = {'tokens': tokens,
                              'lemmas': lemmas,
                              'pos_tags': pos_tags,
                              'sem_tags': sem_tags,
                              'context_before': context_before,
                              'context_after': context_after,
                              'entry_title': entry_titles[0],
                              'entry_author': entry_authors[0],
                              'cpos_start': match_start,
                              'cpos_end': match_end}
        finally:
            tmp_session.disconnect()

    def get_cpos_info(self, cpos):
        """
        Get attribute values for a cpos range

        Looks up every attribute from self.attributes whose name does not
        contain '.entry' for the positions cpos[0] up to, but not including,
        cpos[1]. Prints and returns the values as an ordered dictionary
        keyed by attribute name. create_attribute_strings has to be called
        first because this method reads self.attributes.

        Keyword arguments:
        cpos -- pair of corpus positions (start, end)
        """
        # NOTE(review): the end position is exclusive here, while the match
        # end in __get_matches is inclusive -- confirm which one callers
        # expect.
        cpos_range = range(cpos[0], cpos[1])
        match_dict = collections.OrderedDict()
        for attribute in self.attributes:
            if '.entry' in attribute:
                continue
            match_dict[attribute] = self.cl_cpos2str(attribute, cpos_range)
        print(match_dict)
        return match_dict
|