2019-11-27 09:18:15 +00:00
from . CQiClient import CQiClient
from . CQi import CONST_FIELD_MATCH , CONST_FIELD_MATCHEND
2019-11-28 13:14:56 +00:00
import time
2019-11-27 09:18:15 +00:00
from app import logger # only works if imported into opaque web app
2019-11-07 14:48:47 +00:00
class CQiWrapper ( CQiClient ) :
"""
CQiWrapper object
High level wrapper that groups and renames some functions of CQiClient
for ease of use. Also structures received data into Python dictionaries.
Keyword arguments :
2019-11-18 13:24:13 +00:00
host -- host IP address or hostname where the CQP server is running
port -- port of the CQP server
2019-11-07 14:48:47 +00:00
username -- username used to connect to the CQP server
password -- password of the user to connect to the CQP server
"""
SUBCORPUS_NAMES = [ ]
def __init__(self, host=' 127.0.0.1 ', port=4877, username=' opaque ',
             password=' opaque '):
    """
    Initialise the parent client and remember the login credentials.

    Keyword arguments:
    host -- forwarded unchanged to the parent class initialiser
    port -- forwarded unchanged to the parent class initialiser
    username -- stored on the instance as self.username
    password -- stored on the instance as self.password
    """
    # Only the endpoint goes to the parent class; the credentials are kept
    # on the wrapper itself.
    endpoint = {'host': host, 'port': port}
    super(CQiWrapper, self).__init__(**endpoint)
    self.username, self.password = username, password
def connect(self):
    """
    Log in to the CQP server.

    Passes the username and password stored on the instance to
    ctrl_connect.
    """
    credentials = (self.username, self.password)
    self.ctrl_connect(*credentials)
2019-11-18 13:24:13 +00:00
def __create_attribute_strings(self):
    """
    Build the table of qualified attribute names for the selected corpus.

    Requests the positional and the structural attributes of
    self.corpus_name and stores, for every attribute, the corpus name joined
    with the attribute name. The result is kept in self.attr_strings, split
    into one sub-dictionary for positional and one for structural
    attributes.
    """
    positional = self.corpus_positional_attributes(self.corpus_name)
    structural = self.corpus_structural_attributes(self.corpus_name)
    self.attr_strings = {
        ' positional_attrs ': {
            name: self.corpus_name + ' . ' + name for name in positional
        },
        ' struct_attrs ': {
            name: self.corpus_name + ' . ' + name for name in structural
        },
    }
    # logger.warning(('All positional and '
    #                 'structural attributes: {}').format(self.attr_strings))
2019-11-18 13:24:13 +00:00
def select_corpus(self, corpus_name):
    """
    Make corpus_name the active corpus if the server lists it.

    On success the name is stored in self.corpus_name and the attribute
    strings are rebuilt. A name the server does not list is ignored: nothing
    is stored and no error is raised.
    """
    if corpus_name not in self.corpus_list_coprora():
        # logger.warning('{} does not exist.'.format(corpus_name))
        return
    self.corpus_name = corpus_name
    self.__create_attribute_strings()
    # logger.warning('{} does exist.'.format(corpus_name))
2019-11-07 14:48:47 +00:00
def disconnect(self):
    """
    Disconnect from the CQP server and close the connection.

    ctrl_bye is called first. self.connection is closed afterwards in every
    case, including when ctrl_bye raises, so the connection object is never
    left open. An exception raised by ctrl_bye still propagates to the
    caller after the connection has been closed.
    """
    try:
        self.ctrl_bye()
    finally:
        # Close even if the goodbye failed; otherwise the connection leaks.
        self.connection.close()
    # logger.warning('Disconnected from cqp server.')
2019-11-07 14:48:47 +00:00
2019-11-18 13:24:13 +00:00
def query_subcorpus(self, query, result_subcorpus_name=' Query-results '):
    """
    Run a query and keep its matches as a named subcorpus.

    The query is sent via cqp_query for the selected corpus. The qualified
    subcorpus name (corpus name joined with result_subcorpus_name) is stored
    in self.result_subcorpus and appended to SUBCORPUS_NAMES. The size of
    the subcorpus is stored in self.nr_matches and printed.

    Keyword arguments:
    query -- query string handed to cqp_query
    result_subcorpus_name -- name under which the matches are stored
    """
    self.cqp_query(self.corpus_name, result_subcorpus_name, query)
    qualified_name = self.corpus_name + ' : ' + result_subcorpus_name
    self.result_subcorpus = qualified_name
    self.SUBCORPUS_NAMES.append(qualified_name)
    match_count = self.cqp_subcorpus_size(qualified_name)
    self.nr_matches = match_count
    print(' Nr of all matches is: ', self.nr_matches)
    # logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
2019-11-07 14:48:47 +00:00
def show_subcorpora(self):
    """
    Return what cqp_list_subcorpora reports for the selected corpus.
    """
    active_corpus = self.corpus_name
    return self.cqp_list_subcorpora(active_corpus)
2019-11-07 14:48:47 +00:00
2019-11-18 13:24:13 +00:00
def show_query_results(self,
                       context_len=10,
                       result_len=1000,
                       result_offset=0):
    """
    Collect the matches of the last query together with their context.

    For every match the corpus positions (cpos) of the left context, the hit
    itself and the right context are generated. All distinct cpos are then
    resolved in one go through get_cpos_infos.

    Keyword arguments:
    context_len -- number of corpus positions collected before and after
                   each match (default 10); clipped at the corpus start and
                   at the size of the word attribute
    result_len -- upper bound for the number of matches (default 1000)
    result_offset -- shifts the window of matches that is dumped (default 0)

    Returns None when there are no matches. Otherwise returns (and stores in
    self.results) a dictionary holding the per-match cpos lists, the per-cpos
    attribute lookup and the text lookup.

    Side effects: self.context_len, self.corpus_max_len and self.results are
    set; self.nr_matches is clamped to result_len and stays clamped for
    later calls.
    """
    self.context_len = context_len
    self.corpus_max_len = self.cl_attribute_size(
        self.attr_strings[' positional_attrs '][' word ']
    )
    self.nr_matches = min(result_len, self.nr_matches)
    if self.nr_matches == 0:
        # logger.warning('Query resulted in 0 matches.')
        return None

    # NOTE(review): with result_offset == 0 the dumped window is
    # [0, nr_matches], otherwise [result_offset + 1, nr_matches +
    # result_offset]. The two cases differ in width by one; confirm against
    # the bounds convention of cqp_dump_subcorpus before changing it.
    offset_start = result_offset + 1 if result_offset != 0 else result_offset
    offset_end = self.nr_matches + result_offset

    # Start and end cpos of every match, e.g. [(1355, 1357), (1477, 1479)].
    match_boundaries = zip(
        self.cqp_dump_subcorpus(self.result_subcorpus,
                                CONST_FIELD_MATCH,
                                offset_start,
                                offset_end),
        self.cqp_dump_subcorpus(self.result_subcorpus,
                                CONST_FIELD_MATCHEND,
                                offset_start,
                                offset_end),
    )

    all_matches = []
    all_cpos = []
    for start, end in match_boundaries:
        lc_cpos = list(range(max(0, start - self.context_len), start))
        # The hit includes both boundaries.
        match_cpos = list(range(start, end + 1))
        # The right context begins after the last hit position; starting at
        # `end` would repeat the final hit token and lose one context token.
        rc_cpos = list(range(end + 1,
                             min(self.corpus_max_len,
                                 end + 1 + self.context_len)))
        all_matches.append({' lc ': lc_cpos,
                            ' hit ': match_cpos,
                            ' rc ': rc_cpos})
        all_cpos.extend(lc_cpos + match_cpos + rc_cpos)

    t0 = time.time()
    all_cpos = list(set(all_cpos))  # get rid of cpos duplicates
    t1 = time.time()
    t_total = t1 - t0
    print(' TIME FOR ALL CPOS: ', t_total)
    print(' CPOS SUM: ', len(all_cpos))

    # Resolve every distinct cpos once and keep the lookup tables next to
    # the matches.
    t6 = time.time()
    all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
    t7 = time.time()
    t_final = t7 - t6
    print(' GOT ALL RESULTS IN: ', t_final)
    self.results = {' matches ': all_matches,
                    ' cpos_lookup ': all_cpos_infos,
                    ' text_lookup ': text_lookup}
    return self.results
2019-11-18 13:24:13 +00:00
def get_cpos_infos(self, all_cpos):
    """
    Resolve attribute information for the given corpus positions.

    Parameters:
    all_cpos -- corpus positions to resolve

    Returns a tuple (cpos_lookup, text_lookup):
    cpos_lookup -- maps every cpos to a dictionary with one entry per
                   positional attribute (string from cl_cpos2str) and one
                   entry per structural attribute without values (id from
                   cl_cpos2struc)
    text_lookup -- maps ids of the first structural attribute without values
                   to a dictionary of the values of the value-carrying
                   structural attributes; empty when there is no structural
                   attribute without values or no id to look up
    """
    positional = self.attr_strings[' positional_attrs ']
    structural = self.attr_strings[' struct_attrs ']

    # Positional attributes: one list of strings per attribute.
    cpos_infos = {
        name: self.cl_cpos2str(attr, all_cpos)
        for name, attr in positional.items()
    }

    # Structural attributes: ids are only needed for attributes without
    # values, so value-carrying attributes are not queried for ids at all.
    struct_ids = {}
    valued_attrs = []
    for name, attr in structural.items():
        if self.corpus_structural_attribute_has_values(attr) is False:
            struct_ids[name] = list(self.cl_cpos2struc(attr, all_cpos))
        else:
            valued_attrs.append((attr, name))
    struct_attr_keys = list(struct_ids)
    struct_attr_values = list(struct_ids.values())

    # Build the text lookup from the ids of the first value-less attribute.
    # NOTE(review): the original comment says the first one is always the
    # text element; confirm against the corpus encoding.
    if struct_attr_values:
        text_lookup_ids = list(set(struct_attr_values[0]))
    else:
        # Previously an IndexError; without ids there is nothing to look up.
        text_lookup_ids = []
    text_lookup = {}
    if text_lookup_ids:
        for attr, name in valued_attrs:
            label = name.split(' _ ')[1]
            values = self.cl_struc2str(attr, text_lookup_ids)
            for text_id, value in zip(text_lookup_ids, values):
                text_lookup.setdefault(text_id, {})[label] = value

    # Join the per-attribute columns into one dictionary per cpos.
    attr_keys = list(cpos_infos) + struct_attr_keys
    attr_columns = list(cpos_infos.values()) + struct_attr_values
    dict_cpos_infos = {
        row[0]: dict(zip(attr_keys, row[1:]))
        for row in zip(all_cpos, *attr_columns)
    }
    return dict_cpos_infos, text_lookup