2020-03-23 08:10:35 +00:00
from . api import APIClient
from . constants import CONST_FIELD_MATCH , CONST_FIELD_MATCHEND
2020-03-11 12:34:09 +00:00
import time
2019-11-07 14:48:47 +00:00
2020-03-23 08:10:35 +00:00
class CQiWrapper(APIClient):
    '''
    CQiWrapper object

    High level wrapper that groups and renames some functions of the
    underlying API client for ease of use. Also structures received data
    into python dictionaries.

    Typical call order, as required by the attributes each method reads:
    connect() -> select_corpus() -> query_subcorpus() ->
    show_query_results() / get_sentences() -> disconnect()

    Keyword arguments:
    host -- host IP address or hostname where the cqp server is running
    port -- port of the cqp server
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    '''

    # Class level default, kept so that code reading
    # CQiWrapper.SUBCORPUS_NAMES keeps working. Every instance gets its own
    # list in __init__ so subcorpus names are not shared between clients.
    SUBCORPUS_NAMES = []

    def __init__(self, host='127.0.0.1', port=4877, username='anonymous',
                 password=''):
        super(CQiWrapper, self).__init__(host, port=port)
        self.username = username
        self.password = password
        # Per instance registry of the subcorpora created by this client.
        # Appending to the class attribute would leak names across instances.
        self.SUBCORPUS_NAMES = []

    def connect(self):
        '''
        Connect with CQP server

        Connects to the CQP server using the username and password given at
        class initiation.
        '''
        self.ctrl_connect(self.username, self.password)

    def __create_attribute_strings(self):
        '''
        Creates all needed attribute strings to query for word, lemma etc. in
        the selected corpus.
        For example: CORPUS_NAME.word to query words

        Automatically creates strings for all positional and structural
        attributes the server reports for the corpus and stores them in
        self.attr_strings under the keys 'positional_attrs' and
        'struct_attrs'.
        '''
        p_attrs = self.corpus_positional_attributes(self.corpus_name)
        struct_attrs = self.corpus_structural_attributes(self.corpus_name)
        prefix = self.corpus_name + '.'
        self.attr_strings = {
            'positional_attrs': {attr: prefix + attr for attr in p_attrs},
            'struct_attrs': {attr: prefix + attr for attr in struct_attrs},
        }
        print(('All positional and '
               'structural attributes: {}').format(self.attr_strings))

    def select_corpus(self, corpus_name):
        '''
        Checks if given corpus name exists. If it exists set it as the main
        corpus name used to create the needed query attribute strings like
        CORPUS_NAME.word.

        Raises ValueError (a subclass of Exception) if the corpus is not in
        the list of corpora reported by the server.
        '''
        # NOTE(review): 'corpus_list_coprora' is spelled exactly as the
        # original code calls it; confirm it matches the name in APIClient.
        if corpus_name not in self.corpus_list_coprora():
            print('{} does not exist.'.format(corpus_name))
            raise ValueError('Given Corpus Name is not in corpora list.')
        self.corpus_name = corpus_name
        self.__create_attribute_strings()
        print('{} does exist.'.format(corpus_name))

    def disconnect(self):
        '''
        Disconnect from CQP server

        Sends the bye command to the CQP server.
        '''
        self.ctrl_bye()
        print('Disconnected from cqp server.')

    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
        '''
        Create subcorpus

        Input query will be used to create a subcorpus holding all cpos match
        positions for that query. Stores the query, the full subcorpus name
        (CORPUS_NAME:result_subcorpus_name) and the number of matches on the
        instance.

        Keyword arguments:
        query -- query written in cqp query language
        result_subcorpus_name -- set name of the subcorpus which holds all
        cpos match positions, produced by the query
        '''
        self.query = query
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
        self.result_subcorpus = (self.corpus_name
                                 + ':'
                                 + result_subcorpus_name)
        # Re-running a query under the same name must not register the
        # subcorpus a second time.
        if self.result_subcorpus not in self.SUBCORPUS_NAMES:
            self.SUBCORPUS_NAMES.append(self.result_subcorpus)
        self.match_count = self.cqp_subcorpus_size(self.result_subcorpus)
        print('Nr of all matches is: {}'.format(self.match_count))

    def show_subcorpora(self):
        '''
        Show all subcorpora currently saved by the cqp server for the
        selected corpus.
        '''
        return self.cqp_list_subcorpora(self.corpus_name)

    def _build_results(self, matches, cpos_lookup, text_lookup):
        '''
        Assemble the result dictionary returned by show_query_results.
        '''
        return {'code': 0,
                'result': {'matches': matches,
                           'match_count': self.match_count,
                           'cpos_lookup': cpos_lookup,
                           'text_lookup': text_lookup}
                }

    def show_query_results(self,
                           context_len=10,
                           result_len=1000,
                           result_offset=0):
        '''
        Show query results

        Uses the cpos match boundaries of the last query to build, for every
        requested match, the lists of cpos of the left context ('lc'), the
        match itself ('hit') and the right context ('rc'). Afterwards the
        attribute informations for all those cpos are requested at once.

        Requires select_corpus and query_subcorpus to have been called
        before, and the corpus to have a positional attribute 'word'.

        Keyword arguments:
        context_len -- defines how many words before and after a match will
        be shown (default 10)
        result_len -- defines for how many matches all informations like
        lemma and POS are being grabbed
        result_offset -- index of the first match being requested. Matches
        result_offset to result_offset + result_len - 1 are grabbed, limited
        by the total number of matches.

        Returns (and stores in self.results) a dictionary of the form
        {'code': 0, 'result': {'matches': [...], 'match_count': int,
        'cpos_lookup': {...}, 'text_lookup': {...}}}.
        '''
        t0 = time.time()
        self.context_len = context_len
        self.corpus_max_len = self.cl_attribute_size(
            self.attr_strings['positional_attrs']['word']
        )
        self.nr_matches = min(result_len, self.match_count)
        if self.match_count == 0:
            print('Query resulted in 0 matches.')
            self.results = self._build_results([], {}, {})
            return self.results

        # Index of the first and last requested match inside the subcorpus.
        offset_start = result_offset
        print('Offset start is: {}'.format(offset_start))
        offset_end = min(self.nr_matches + result_offset - 1,
                         self.match_count - 1)
        print('Offset end is: {}'.format(offset_end))
        if offset_start > offset_end:
            # The requested window lies behind the last match (or result_len
            # is not positive). Do not send an inverted range to the server.
            self.results = self._build_results([], {}, {})
            return self.results

        # Get match cpos boundaries
        # Start and end cpos of one match form a pair of cpositions
        # [(1355, 1357), (1477, 1479)] Example for two boundary pairs
        match_starts = self.cqp_dump_subcorpus(self.result_subcorpus,
                                               CONST_FIELD_MATCH,
                                               offset_start,
                                               offset_end)
        match_ends = self.cqp_dump_subcorpus(self.result_subcorpus,
                                             CONST_FIELD_MATCHEND,
                                             offset_start,
                                             offset_end)

        # Generate all cpos between match boundaries including start and end
        # boundaries. Also generate cpos for left and right context.
        # Additionally collect every cpos once for the final request of all
        # cpos informations.
        all_matches = []
        unique_cpos = set()
        for start, end in zip(match_starts, match_ends):
            end += 1  # the reported match end is inclusive
            lc_cpos = list(range(max(0, start - self.context_len), start))
            match_cpos = list(range(start, end))
            rc_cpos = list(range(end, min(self.corpus_max_len,
                                          end + self.context_len)))
            all_matches.append({'lc': lc_cpos,
                                'hit': match_cpos,
                                'rc': rc_cpos})
            unique_cpos.update(lc_cpos, match_cpos, rc_cpos)
        # Sorted so the request and the resulting lookup are deterministic.
        all_cpos = sorted(unique_cpos)
        len_all_cpos = len(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
        print('Time to create all CPOS for query: {}'.format(t_total))
        print('Requesting {} CPOS with one query.'.format(len_all_cpos))

        # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma
        # for all cpos entries in all_cpos
        t2 = time.time()
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t3 = time.time()
        t_final = t3 - t2
        print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
                                                            t_final))
        self.results = self._build_results(all_matches,
                                           all_cpos_infos,
                                           text_lookup)
        return self.results

    def get_cpos_infos(self, all_cpos):
        '''
        Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
        all cpos entries specified in the parameter all_cpos.

        Returns a tuple (dict_cpos_infos, text_lookup):
        dict_cpos_infos -- maps every cpos to a dictionary holding the value
        of each positional attribute and the id of each structural attribute
        without values
        text_lookup -- maps every id of the first structural attribute
        without values to a dictionary holding the values of all structural
        attributes with values
        '''
        positional_attrs = self.attr_strings['positional_attrs']
        struct_attrs = self.attr_strings['struct_attrs']

        # Get all positional attribute informations
        cpos_infos = {}
        for p_attr_key, p_attr in positional_attrs.items():
            cpos_infos[p_attr_key] = self.cl_cpos2str(p_attr, all_cpos)

        # Get all structural attribute informations
        tmp_info = {}
        structs_to_check = []
        for struct_attr_key, key in struct_attrs.items():
            has_value = self.corpus_structural_attribute_has_values(key)
            if has_value is False:
                # Structural elements without values: store the id of the
                # element each cpos belongs to. Only requested in this branch
                # because the ids are not used for attributes with values.
                tmp_info[struct_attr_key] = list(
                    self.cl_cpos2struc(key, all_cpos)
                )
            else:
                structs_to_check.append({key: struct_attr_key})
        print('Structs to check: {}'.format(structs_to_check))
        struct_attr_keys = list(tmp_info.keys())
        struct_attr_values = list(tmp_info.values())

        # Build textlookup dictionary
        # The ids of the first structural attribute without values are used
        # as lookup ids. NOTE(review): this presumably is the text element of
        # the corpus; confirm against the corpus encoding.
        if struct_attr_values:
            text_lookup_ids = sorted(set(struct_attr_values[0]))
        else:
            text_lookup_ids = []
        text_lookup = {}
        if text_lookup_ids:
            for d in structs_to_check:
                s_key, s_value = zip(*d.items())
                print('dict entries: {} : {}'.format(s_key, s_value))
                # Drop the element prefix, e.g. 'text_title' -> 'title'
                s_value = s_value[0].split('_', 1)[-1]
                print('S_VALUE: {}'.format(s_value))
                struct_values = self.cl_struc2str(s_key[0], text_lookup_ids)
                print('Extracted Value with key {} : {}'.format(
                    s_key[0], struct_values))
                for zip_key, zip_value in zip(text_lookup_ids,
                                              struct_values):
                    print('Text id as key is: {}'.format(zip_key))
                    print('Value of this text is: {}'.format(zip_value))
                    check = text_lookup.get(zip_key)
                    print('check: {}'.format(check))
                    if check is None:
                        text_lookup[zip_key] = {s_value: zip_value}
                    else:
                        check[s_value] = zip_value

        # zip keys and values together
        attr_keys_list = list(cpos_infos.keys()) + struct_attr_keys
        attr_values_list = list(cpos_infos.values()) + struct_attr_values
        dict_cpos_infos = {}
        for info in zip(all_cpos, *attr_values_list):
            dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
        return dict_cpos_infos, text_lookup

    def _sentence_cpos(self, s_attr, s_id):
        '''
        Return the list of all cpos of the sentence with the id s_id.
        '''
        s_start, s_end = self.cl_struc2cpos(s_attr, s_id)
        return list(range(s_start, s_end + 1))

    def get_sentences(self,
                      match_cpos_list,
                      get_surrounding_s=False,
                      l_r_s_context_additional_len=1):
        '''
        Get sentence informations for one match.

        Looks up the sentences (structural attribute CORPUS_NAME.s) that
        contain the first and the last cpos of the match.

        Keyword arguments:
        match_cpos_list -- non empty list of the cpos of one match
        get_surrounding_s -- if True also grab the sentences surrounding the
        match sentences
        l_r_s_context_additional_len -- how many sentences to the left and to
        the right are grabbed if get_surrounding_s is True

        Returns a dictionary with the keys 'context_s_cpos' (sentence id ->
        list of cpos), 'cpos_lookup', 'text_lookup' and 'match_cpos_list'.
        '''
        t0 = time.time()
        s_attr = self.corpus_name + '.s'
        first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
        context_sentences = {}
        s_ids = self.cl_cpos2struc(s_attr, [first_cpos, last_cpos])
        print('s id match: {}'.format(s_ids))
        for s_id in s_ids:
            # First and last cpos often share one sentence; fetch it once.
            if s_id not in context_sentences:
                context_sentences[s_id] = self._sentence_cpos(s_attr, s_id)
        if get_surrounding_s:
            max_s_id = self.cl_attribute_size(s_attr) - 1
            print('max sid: {}'.format(max_s_id))
            # Clamp the surrounding range to the valid sentence ids.
            first_s_id = max(s_ids[0] - l_r_s_context_additional_len, 0)
            last_s_id = min(s_ids[-1] + l_r_s_context_additional_len,
                            max_s_id)
            for s_id in range(first_s_id, last_s_id + 1):
                print('s id additional: {}'.format(s_id))
                if s_id not in context_sentences:
                    context_sentences[s_id] = self._sentence_cpos(s_attr,
                                                                  s_id)
        unique_cpos = set()
        for s_cpos in context_sentences.values():
            unique_cpos.update(s_cpos)
        # Sorted so the request and the resulting lookup are deterministic.
        all_cpos = sorted(unique_cpos)
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
        print('Got all sentences informations in {} seconds'.format(t_total))
        match_context = {'context_s_cpos': context_sentences,
                         'cpos_lookup': all_cpos_infos,
                         'text_lookup': text_lookup,
                         'match_cpos_list': match_cpos_list}
        return match_context