Add CQiWrapper

This commit is contained in:
Stephan Porada 2019-11-07 15:48:47 +01:00
parent 13421a9e7f
commit 8e5e8408bd
3 changed files with 1257 additions and 0 deletions

View File

@ -0,0 +1,406 @@
# ########################################################################### #
# IMS CQi specification #
# #
# Version: 0.1a ;o) #
# Author: Stefan Evert (evert@ims.uni-stuttgart.de) #
# Modified by: Patrick Jentsch <p.jentsch@uni-bielefeld.de> #
# Modified date: Thurs Oct 10 <time> #
# ########################################################################### #
""" 1. padding """
# Padding value.
PAD = 0x00
""" 2. CQi responses """
""" 2.1 CQI_STATUS_* """
# Every response family has a one-byte code (STATUS = 0x01, ERROR = 0x02, ...).
# The concrete response words are 16 bit values whose high byte is the family
# code and whose low byte numbers the member within that family.
STATUS = 0x01
STATUS_OK = 0x0101
STATUS_CONNECT_OK = 0x0102
STATUS_BYE_OK = 0x0103
STATUS_PING_OK = 0x0104
""" 2.2 CQI_ERROR_* """
ERROR = 0x02
ERROR_GENERAL_ERROR = 0x0201
ERROR_CONNECT_REFUSED = 0x0202
ERROR_USER_ABORT = 0x0203
ERROR_SYNTAX_ERROR = 0x0204
# includes corpus/attribute/subcorpus specifier syntax
""" 2.3 CQI_DATA_* """
# The DATA_* word announces the type of the payload that follows it.
DATA = 0x03
DATA_BYTE = 0x0301
DATA_BOOL = 0x0302
DATA_INT = 0x0303
DATA_STRING = 0x0304
DATA_BYTE_LIST = 0x0305
DATA_BOOL_LIST = 0x0306
DATA_INT_LIST = 0x0307
DATA_STRING_LIST = 0x0308
DATA_INT_INT = 0x0309
DATA_INT_INT_INT_INT = 0x030A
DATA_INT_TABLE = 0x030B
""" 2.4 CQI_CL_ERROR_* """
"""
" NOTE: some CL error codes are not represented in the CQi specs
" - usually because they're not used in the CL any more
" - CDA_ENOSTRING is not considered an error (returns -1)
" - CDA_EARGS: dynamic attribute calls not yet supported
"""
# Error family 0x04: errors of the low-level corpus access (CL) layer. The
# comment below each code names the CL error(s) it stands for.
CL_ERROR = 0x04
CL_ERROR_NO_SUCH_ATTRIBUTE = 0x0401
# returned if CQi server couldn't open attribute
CL_ERROR_WRONG_ATTRIBUTE_TYPE = 0x0402
# CDA_EATTTYPE
CL_ERROR_OUT_OF_RANGE = 0x0403
# CDA_EIDORNG, CDA_EIDXORNG, CDA_EPOSORNG
CL_ERROR_REGEX = 0x0404
# CDA_EPATTERN (not used), CDA_EBADREGEX
CL_ERROR_CORPUS_ACCESS = 0x0405
# CDA_ENODATA
CL_ERROR_OUT_OF_MEMORY = 0x0406
# CDA_ENOMEM
# this means the CQi server has run out of memory;
# try discarding some other corpora and/or subcorpora
CL_ERROR_INTERNAL = 0x0407
# CDA_EOTHER, CDA_ENYI
# this is the classical 'please contact technical support' error
""" 2.5 CQI_CQP_ERROR_* """
# Error family 0x05: errors reported for CQP (query processor) commands.
CQP_ERROR = 0x05
# CQP error messages yet to be defined
CQP_ERROR_GENERAL = 0x0501
CQP_ERROR_NO_SUCH_CORPUS = 0x0502
CQP_ERROR_INVALID_FIELD = 0x0503
CQP_ERROR_OUT_OF_RANGE = 0x0504
# various cases where a number is out of range
""" 3. CQi commands """
""" 3.1 CQI_CTRL_* """
# Command words follow the same layout as the responses: the high byte is the
# command family (CTRL = 0x11, ASK_FEATURE = 0x12, CORPUS = 0x13) and the low
# byte selects the command. The INPUT/OUTPUT comment below each command lists
# the arguments sent after the command word and the expected response type.
CTRL = 0x11
CTRL_CONNECT = 0x1101
# INPUT: (STRING username, STRING password)
# OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED
CTRL_BYE = 0x1102
# INPUT: ()
# OUTPUT: CQI_STATUS_BYE_OK
CTRL_USER_ABORT = 0x1103
# INPUT: ()
# OUTPUT:
CTRL_PING = 0x1104
# INPUT: ()
# OUTPUT: CQI_STATUS_PING_OK
CTRL_LAST_GENERAL_ERROR = 0x1105
# INPUT: ()
# OUTPUT: CQI_DATA_STRING
# full-text error message for the last general error reported by the CQi server
""" 3.2 CQI_ASK_FEATURE_* """
ASK_FEATURE = 0x12
ASK_FEATURE_CQI_1_0 = 0x1201
# INPUT: ()
# OUTPUT: CQI_DATA_BOOL
ASK_FEATURE_CL_2_3 = 0x1202
# INPUT: ()
# OUTPUT: CQI_DATA_BOOL
ASK_FEATURE_CQP_2_3 = 0x1203
# INPUT: ()
# OUTPUT: CQI_DATA_BOOL
""" 3.3 CQI_CORPUS_* """
CORPUS = 0x13
CORPUS_LIST_CORPORA = 0x1301
# INPUT: ()
# OUTPUT: CQI_DATA_STRING_LIST
# NOTE: no command with the code 0x1302 is defined in this module.
CORPUS_CHARSET = 0x1303
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING
CORPUS_PROPERTIES = 0x1304
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_POSITIONAL_ATTRIBUTES = 0x1305
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_STRUCTURAL_ATTRIBUTES = 0x1306
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES = 0x1307
# INPUT: (STRING attribute)
# OUTPUT: CQI_DATA_BOOL
CORPUS_ALIGNMENT_ATTRIBUTES = 0x1308
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_FULL_NAME = 0x1309
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING
# the full name of <corpus> as specified in its registry entry
CORPUS_INFO = 0x130A
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
# returns the contents of the .info file of <corpus> as a list of lines
CORPUS_DROP_CORPUS = 0x130B
# INPUT: (STRING corpus)
# OUTPUT: CQI_STATUS_OK
# try to unload a corpus and all its attributes from memory
""" 3.4 CQI_CL_* """
# Command family 0x14: low-level corpus access. The INPUT/OUTPUT comment below
# each command lists its arguments and the expected response type.
CL = 0x14
# low-level corpus access (CL functions)
CL_ATTRIBUTE_SIZE = 0x1401
# INPUT: (STRING attribute)
# OUTPUT: CQI_DATA_INT
# returns the size of <attribute>:
# - number of tokens (positional)
# - number of regions (structural)
# - number of alignments (alignment)
CL_LEXICON_SIZE = 0x1402
# INPUT: (STRING attribute)
# OUTPUT: CQI_DATA_INT
# returns the number of entries in the lexicon of a positional attribute;
# valid lexicon IDs range from 0 .. (lexicon_size - 1)
CL_DROP_ATTRIBUTE = 0x1403
# INPUT: (STRING attribute)
# OUTPUT: CQI_STATUS_OK
# unload attribute from memory
"""
" NOTE: simple (scalar) mappings are applied to lists (the returned list has
" exactly the same length as the list passed as an argument)
"""
CL_STR2ID = 0x1404
# INPUT: (STRING attribute, STRING_LIST strings)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every string in <strings> that is not found in the lexicon
CL_ID2STR = 0x1405
# INPUT: (STRING attribute, INT_LIST id)
# OUTPUT: CQI_DATA_STRING_LIST
# returns "" for every ID in <id> that is out of range
CL_ID2FREQ = 0x1406
# INPUT: (STRING attribute, INT_LIST id)
# OUTPUT: CQI_DATA_INT_LIST
# returns 0 for every ID in <id> that is out of range
CL_CPOS2ID = 0x1407
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every corpus position in <cpos> that is out of range
CL_CPOS2STR = 0x1408
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_STRING_LIST
# returns "" for every corpus position in <cpos> that is out of range
CL_CPOS2STRUC = 0x1409
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every corpus position not inside a structure region
"""
" NOTE: temporary addition for the Euralex2000 tutorial, but should probably be
" included in CQi specs
"""
# The two boundary commands use the codes 0x1420/0x1421, outside the
# consecutive 0x1401 .. 0x1410 range used by the other CL commands.
CL_CPOS2LBOUND = 0x1420
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns left boundary of s-attribute region enclosing cpos, -1 if not in
# region
CL_CPOS2RBOUND = 0x1421
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns right boundary of s-attribute region enclosing cpos, -1 if not in
# region
CL_CPOS2ALG = 0x140A
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every corpus position not inside an alignment
CL_STRUC2STR = 0x140B
# INPUT: (STRING attribute, INT_LIST strucs)
# OUTPUT: CQI_DATA_STRING_LIST
# returns annotated string values of structure regions in <strucs>; "" if out
# of range
# check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES(<attribute>) first
"""
" NOTE: the following mappings take a single argument and return multiple
" values, including lists of arbitrary size
"""
CL_ID2CPOS = 0x140C
# INPUT: (STRING attribute, INT id)
# OUTPUT: CQI_DATA_INT_LIST
# returns all corpus positions where the given token occurs
CL_IDLIST2CPOS = 0x140D
# INPUT: (STRING attribute, INT_LIST id_list)
# OUTPUT: CQI_DATA_INT_LIST
# returns all corpus positions where one of the tokens in <id_list>
# occurs; the returned list is sorted as a whole, not per token id
CL_REGEX2ID = 0x140E
# INPUT: (STRING attribute, STRING regex)
# OUTPUT: CQI_DATA_INT_LIST
# returns lexicon IDs of all tokens that match <regex>; the returned
# list may be empty (size 0);
CL_STRUC2CPOS = 0x140F
# INPUT: (STRING attribute, INT struc)
# OUTPUT: CQI_DATA_INT_INT
# returns start and end corpus positions of structure region <struc>
CL_ALG2CPOS = 0x1410
# INPUT: (STRING attribute, INT alg)
# OUTPUT: CQI_DATA_INT_INT_INT_INT
# returns (src_start, src_end, target_start, target_end)
""" 3.5 CQI_CQP_* """
# Command family 0x15: CQP queries and the subcorpora they produce. The
# INPUT/OUTPUT comment below each command lists its arguments and the expected
# response type.
CQP = 0x15
CQP_QUERY = 0x1501
# INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query)
# OUTPUT: CQI_STATUS_OK
# <query> must include the ';' character terminating the query.
CQP_LIST_SUBCORPORA = 0x1502
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CQP_SUBCORPUS_SIZE = 0x1503
# INPUT: (STRING subcorpus)
# OUTPUT: CQI_DATA_INT
CQP_SUBCORPUS_HAS_FIELD = 0x1504
# INPUT: (STRING subcorpus, BYTE field)
# OUTPUT: CQI_DATA_BOOL
CQP_DUMP_SUBCORPUS = 0x1505
# INPUT: (STRING subcorpus, BYTE field, INT first, INT last)
# OUTPUT: CQI_DATA_INT_LIST
# Dump the values of <field> for match ranges <first> .. <last> in <subcorpus>.
# <field> is one of the CQI_CONST_FIELD_* constants.
# NOTE: the codes 0x1506 .. 0x1508 are not defined in this module.
CQP_DROP_SUBCORPUS = 0x1509
# INPUT: (STRING subcorpus)
# OUTPUT: CQI_STATUS_OK
# delete a subcorpus from memory
"""
" NOTE: The following two functions are temporarily included for the Euralex
" 2000 tutorial demo
"""
""" NOTE: frequency distribution of single tokens """
CQP_FDIST_1 = 0x1510
# INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute)
# OUTPUT: CQI_DATA_INT_LIST
# returns <n> (id, frequency) pairs flattened into a list of size 2*<n>
# field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET,
# CQI_CONST_FIELD_KEYWORD
# NB: pairs are sorted by frequency desc.
""" NOTE: frequency distribution of pairs of tokens """
CQP_FDIST_2 = 0x1511
# INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1,
# BYTE field2, STRING attribute2)
# OUTPUT: CQI_DATA_INT_LIST
# returns <n> (id1, id2, frequency) triples flattened into a list of size 3*<n>
# NB: triples are sorted by frequency desc.
""" 4. Constant Definitions """
# Boolean values; NO/YES are synonyms with the same values as FALSE/TRUE.
CONST_FALSE = 0x00
CONST_NO = 0x00
CONST_TRUE = 0x01
CONST_YES = 0x01
"""
" NOTE: The following constants specify which field will be returned by
" CQI_CQP_DUMP_SUBCORPUS and some other subcorpus commands.
"""
CONST_FIELD_MATCH = 0x10
CONST_FIELD_MATCHEND = 0x11
"""
" NOTE: The constants specifiying target0 .. target9 are guaranteed to have the
" numerical values 0 .. 9, so clients do not need to look up the constant
" values if they're handling arbitrary targets.
"""
CONST_FIELD_TARGET_0 = 0x00
CONST_FIELD_TARGET_1 = 0x01
CONST_FIELD_TARGET_2 = 0x02
CONST_FIELD_TARGET_3 = 0x03
CONST_FIELD_TARGET_4 = 0x04
CONST_FIELD_TARGET_5 = 0x05
CONST_FIELD_TARGET_6 = 0x06
CONST_FIELD_TARGET_7 = 0x07
CONST_FIELD_TARGET_8 = 0x08
CONST_FIELD_TARGET_9 = 0x09
"""
" NOTE: The following constants are provided for backward compatibility with
" traditional CQP field names & while the generalised target concept
" isn't yet implemented in the CQPserver.
"""
# CONST_FIELD_TARGET has the same value as CONST_FIELD_TARGET_0 and
# CONST_FIELD_KEYWORD the same value as CONST_FIELD_TARGET_9.
CONST_FIELD_TARGET = 0x00
CONST_FIELD_KEYWORD = 0x09
""" NOTE: CQi version is CQI_MAJOR_VERSION.CQI_MINOR_VERSION """
MAJOR_VERSION = 0x00
MINOR_VERSION = 0x01
""" 5. CQi lookup dictionary. """
# Maps every 16 bit response and command word to its CQi specification name.
# The keys are written in hexadecimal so that they can be compared directly
# with the constant definitions; the high byte is the family, the low byte the
# member within that family.
lookup = {
    # CQI_STATUS_*
    0x0101: 'CQI_STATUS_OK',
    0x0102: 'CQI_STATUS_CONNECT_OK',
    0x0103: 'CQI_STATUS_BYE_OK',
    0x0104: 'CQI_STATUS_PING_OK',
    # CQI_ERROR_*
    0x0201: 'CQI_ERROR_GENERAL_ERROR',
    0x0202: 'CQI_ERROR_CONNECT_REFUSED',
    0x0203: 'CQI_ERROR_USER_ABORT',
    0x0204: 'CQI_ERROR_SYNTAX_ERROR',
    # CQI_DATA_*
    0x0301: 'CQI_DATA_BYTE',
    0x0302: 'CQI_DATA_BOOL',
    0x0303: 'CQI_DATA_INT',
    0x0304: 'CQI_DATA_STRING',
    0x0305: 'CQI_DATA_BYTE_LIST',
    0x0306: 'CQI_DATA_BOOL_LIST',
    0x0307: 'CQI_DATA_INT_LIST',
    0x0308: 'CQI_DATA_STRING_LIST',
    0x0309: 'CQI_DATA_INT_INT',
    0x030A: 'CQI_DATA_INT_INT_INT_INT',
    0x030B: 'CQI_DATA_INT_TABLE',
    # CQI_CL_ERROR_*
    0x0401: 'CQI_CL_ERROR_NO_SUCH_ATTRIBUTE',
    0x0402: 'CQI_CL_ERROR_WRONG_ATTRIBUTE_TYPE',
    0x0403: 'CQI_CL_ERROR_OUT_OF_RANGE',
    0x0404: 'CQI_CL_ERROR_REGEX',
    0x0405: 'CQI_CL_ERROR_CORPUS_ACCESS',
    0x0406: 'CQI_CL_ERROR_OUT_OF_MEMORY',
    0x0407: 'CQI_CL_ERROR_INTERNAL',
    # CQI_CQP_ERROR_*
    0x0501: 'CQI_CQP_ERROR_GENERAL',
    0x0502: 'CQI_CQP_ERROR_NO_SUCH_CORPUS',
    0x0503: 'CQI_CQP_ERROR_INVALID_FIELD',
    0x0504: 'CQI_CQP_ERROR_OUT_OF_RANGE',
    # CQI_CTRL_*
    0x1101: 'CQI_CTRL_CONNECT',
    0x1102: 'CQI_CTRL_BYE',
    0x1103: 'CQI_CTRL_USER_ABORT',
    0x1104: 'CQI_CTRL_PING',
    0x1105: 'CQI_CTRL_LAST_GENERAL_ERROR',
    # CQI_ASK_FEATURE_*
    0x1201: 'CQI_ASK_FEATURE_CQI_1_0',
    0x1202: 'CQI_ASK_FEATURE_CL_2_3',
    0x1203: 'CQI_ASK_FEATURE_CQP_2_3',
    # CQI_CORPUS_*
    0x1301: 'CQI_CORPUS_LIST_CORPORA',
    0x1303: 'CQI_CORPUS_CHARSET',
    0x1304: 'CQI_CORPUS_PROPERTIES',
    0x1305: 'CQI_CORPUS_POSITIONAL_ATTRIBUTES',
    0x1306: 'CQI_CORPUS_STRUCTURAL_ATTRIBUTES',
    0x1307: 'CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES',
    0x1308: 'CQI_CORPUS_ALIGNMENT_ATTRIBUTES',
    0x1309: 'CQI_CORPUS_FULL_NAME',
    0x130A: 'CQI_CORPUS_INFO',
    0x130B: 'CQI_CORPUS_DROP_CORPUS',
    # CQI_CL_*
    0x1401: 'CQI_CL_ATTRIBUTE_SIZE',
    0x1402: 'CQI_CL_LEXICON_SIZE',
    0x1403: 'CQI_CL_DROP_ATTRIBUTE',
    0x1404: 'CQI_CL_STR2ID',
    0x1405: 'CQI_CL_ID2STR',
    0x1406: 'CQI_CL_ID2FREQ',
    0x1407: 'CQI_CL_CPOS2ID',
    0x1408: 'CQI_CL_CPOS2STR',
    0x1409: 'CQI_CL_CPOS2STRUC',
    0x140A: 'CQI_CL_CPOS2ALG',
    0x140B: 'CQI_CL_STRUC2STR',
    0x140C: 'CQI_CL_ID2CPOS',
    0x140D: 'CQI_CL_IDLIST2CPOS',
    0x140E: 'CQI_CL_REGEX2ID',
    0x140F: 'CQI_CL_STRUC2CPOS',
    0x1410: 'CQI_CL_ALG2CPOS',
    0x1420: 'CQI_CL_CPOS2LBOUND',
    0x1421: 'CQI_CL_CPOS2RBOUND',
    # CQI_CQP_*
    0x1501: 'CQI_CQP_QUERY',
    0x1502: 'CQI_CQP_LIST_SUBCORPORA',
    0x1503: 'CQI_CQP_SUBCORPUS_SIZE',
    0x1504: 'CQI_CQP_SUBCORPUS_HAS_FIELD',
    0x1505: 'CQI_CQP_DUMP_SUBCORPUS',
    0x1509: 'CQI_CQP_DROP_SUBCORPUS',
    0x1510: 'CQI_CQP_FDIST_1',
    0x1511: 'CQI_CQP_FDIST_2',
}

View File

@ -0,0 +1,611 @@
from . import CQi
import socket
import struct
class CQiClient:
    """
    CQiClient object
    Low level client for the CQi protocol. Every public method corresponds to
    exactly one CQi command: it sends the command word followed by the
    arguments of the command and (except for ctrl_user_abort) reads the
    response of the server. All values are transferred in network byte order.
    Keyword arguments:
    host -- host name or address of the CQi server (default '127.0.0.1')
    port -- TCP port of the CQi server (default 4877)
    Raises (every command that reads a response):
    Exception -- the server answered with a CQI_ERROR_*, CQI_CL_ERROR_* or
                 CQI_CQP_ERROR_* word (the message is the name of the error)
                 or with an unknown response or data type
    ConnectionError -- the connection was closed in the middle of a response
    """

    def __init__(self, host='127.0.0.1', port=4877):
        self.host = host
        self.port = port
        self.connection = socket.socket()
        try:
            self.connection.connect((self.host, self.port))
        except OSError:
            # Do not leak the socket if the server cannot be reached.
            self.connection.close()
            raise

    # ----------------------------------------------------------------------- #
    # 3.1 CQI_CTRL_*                                                          #
    # ----------------------------------------------------------------------- #
    def ctrl_connect(self, username, password):
        # INPUT: (STRING username, STRING password)
        # OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED
        self.__send_WORD(CQi.CTRL_CONNECT)
        self.__send_STRING(username)
        self.__send_STRING(password)
        self.__recv_response()

    def ctrl_bye(self):
        # INPUT: ()
        # OUTPUT: CQI_STATUS_BYE_OK
        self.__send_WORD(CQi.CTRL_BYE)
        self.__recv_response()

    def ctrl_user_abort(self):
        # INPUT: ()
        # OUTPUT:
        # No response is defined for this command, so none is read.
        self.__send_WORD(CQi.CTRL_USER_ABORT)

    def ctrl_ping(self):
        # INPUT: ()
        # OUTPUT: CQI_STATUS_PING_OK
        self.__send_WORD(CQi.CTRL_PING)
        self.__recv_response()

    def ctrl_last_general_error(self):
        # INPUT: ()
        # OUTPUT: CQI_DATA_STRING
        # full-text error message for the last general error reported by the
        # CQi server
        self.__send_WORD(CQi.CTRL_LAST_GENERAL_ERROR)
        return self.__recv_response()

    # ----------------------------------------------------------------------- #
    # 3.2 CQI_ASK_FEATURE_*                                                   #
    # ----------------------------------------------------------------------- #
    def ask_feature_cqi_1_0(self):
        # INPUT: ()
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(CQi.ASK_FEATURE_CQI_1_0)
        return self.__recv_response()

    def ask_feature_cl_2_3(self):
        # INPUT: ()
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(CQi.ASK_FEATURE_CL_2_3)
        return self.__recv_response()

    def ask_feature_cqp_2_3(self):
        # INPUT: ()
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(CQi.ASK_FEATURE_CQP_2_3)
        return self.__recv_response()

    # ----------------------------------------------------------------------- #
    # 3.3 CQI_CORPUS_*                                                        #
    # ----------------------------------------------------------------------- #
    def corpus_list_corpora(self):
        # INPUT: ()
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CQi.CORPUS_LIST_CORPORA)
        return self.__recv_response()

    def corpus_list_coprora(self):
        # Misspelled original name of corpus_list_corpora, kept so that
        # existing callers continue to work.
        return self.corpus_list_corpora()

    def corpus_charset(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING
        self.__send_WORD(CQi.CORPUS_CHARSET)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_properties(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CQi.CORPUS_PROPERTIES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_positional_attributes(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CQi.CORPUS_POSITIONAL_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_structural_attributes(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_structural_attribute_has_values(self, attribute):
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def corpus_alignment_attributes(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CQi.CORPUS_ALIGNMENT_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_full_name(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING
        # the full name of <corpus> as specified in its registry entry
        self.__send_WORD(CQi.CORPUS_FULL_NAME)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_info(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        # returns the contents of the .info file of <corpus> as a list of lines
        self.__send_WORD(CQi.CORPUS_INFO)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_drop_corpus(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_STATUS_OK
        # try to unload a corpus and all its attributes from memory
        # NOTE: marked as broken by the original author.
        # TODO: Check what type of return value is provided by the server.
        self.__send_WORD(CQi.CORPUS_DROP_CORPUS)
        self.__send_STRING(corpus)
        self.__recv_response()

    # ----------------------------------------------------------------------- #
    # 3.4 CQI_CL_*                                                            #
    # ----------------------------------------------------------------------- #
    def cl_attribute_size(self, attribute):
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_DATA_INT
        # returns the size of <attribute>:
        # number of tokens (positional)
        # number of regions (structural)
        # number of alignments (alignment)
        self.__send_WORD(CQi.CL_ATTRIBUTE_SIZE)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cl_lexicon_size(self, attribute):
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_DATA_INT
        # returns the number of entries in the lexicon of a positional
        # attribute;
        # valid lexicon IDs range from 0 .. (lexicon_size - 1)
        self.__send_WORD(CQi.CL_LEXICON_SIZE)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cl_drop_attribute(self, attribute):
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_STATUS_OK
        # unload attribute from memory
        self.__send_WORD(CQi.CL_DROP_ATTRIBUTE)
        self.__send_STRING(attribute)
        self.__recv_response()

    # NOTE: simple (scalar) mappings are applied to lists (the returned list
    #       has exactly the same length as the list passed as an argument)
    def cl_str2id(self, attribute, strings):
        # INPUT: (STRING attribute, STRING_LIST strings)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns -1 for every string in <strings> that is not found in the
        # lexicon
        self.__send_WORD(CQi.CL_STR2ID)
        self.__send_STRING(attribute)
        self.__send_STRING_LIST(strings)
        return self.__recv_response()

    def cl_id2str(self, attribute, id):
        # INPUT: (STRING attribute, INT_LIST id)
        # OUTPUT: CQI_DATA_STRING_LIST
        # returns "" for every ID in <id> that is out of range
        self.__send_WORD(CQi.CL_ID2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id)
        return self.__recv_response()

    def cl_id2freq(self, attribute, id):
        # INPUT: (STRING attribute, INT_LIST id)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns 0 for every ID in <id> that is out of range
        self.__send_WORD(CQi.CL_ID2FREQ)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id)
        return self.__recv_response()

    def cl_cpos2id(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns -1 for every corpus position in <cpos> that is out of range
        self.__send_WORD(CQi.CL_CPOS2ID)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2str(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_STRING_LIST
        # returns "" for every corpus position in <cpos> that is out of range
        self.__send_WORD(CQi.CL_CPOS2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2struc(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns -1 for every corpus position not inside a structure region
        self.__send_WORD(CQi.CL_CPOS2STRUC)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    # NOTE: temporary addition for the Euralex2000 tutorial, but should
    #       probably be included in CQi specs
    def cl_cpos2lbound(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns left boundary of s-attribute region enclosing cpos, -1 if not
        # in region
        self.__send_WORD(CQi.CL_CPOS2LBOUND)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2rbound(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns right boundary of s-attribute region enclosing cpos, -1 if
        # not in region
        self.__send_WORD(CQi.CL_CPOS2RBOUND)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2alg(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns -1 for every corpus position not inside an alignment
        self.__send_WORD(CQi.CL_CPOS2ALG)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_struc2str(self, attribute, strucs):
        # INPUT: (STRING attribute, INT_LIST strucs)
        # OUTPUT: CQI_DATA_STRING_LIST
        # returns annotated string values of structure regions in <strucs>; ""
        # if out of range
        # check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES(<attribute>) first
        self.__send_WORD(CQi.CL_STRUC2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(strucs)
        return self.__recv_response()

    # NOTE: the following mappings take a single argument and return multiple
    #       values, including lists of arbitrary size
    def cl_id2cpos(self, attribute, id):
        # INPUT: (STRING attribute, INT id)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns all corpus positions where the given token occurs
        self.__send_WORD(CQi.CL_ID2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(id)
        return self.__recv_response()

    def cl_idlist2cpos(self, attribute, id_list):
        # INPUT: (STRING attribute, INT_LIST id_list)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns all corpus positions where one of the tokens in <id_list>
        # occurs; the returned list is sorted as a whole, not per token id
        self.__send_WORD(CQi.CL_IDLIST2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id_list)
        return self.__recv_response()

    def cl_regex2id(self, attribute, regex):
        # INPUT: (STRING attribute, STRING regex)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns lexicon IDs of all tokens that match <regex>; the returned
        # list may be empty (size 0);
        self.__send_WORD(CQi.CL_REGEX2ID)
        self.__send_STRING(attribute)
        self.__send_STRING(regex)
        return self.__recv_response()

    def cl_struc2cpos(self, attribute, struc):
        # INPUT: (STRING attribute, INT struc)
        # OUTPUT: CQI_DATA_INT_INT
        # returns start and end corpus positions of structure region <struc>
        self.__send_WORD(CQi.CL_STRUC2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(struc)
        return self.__recv_response()

    def cl_alg2cpos(self, attribute, alg):
        # INPUT: (STRING attribute, INT alg)
        # OUTPUT: CQI_DATA_INT_INT_INT_INT
        # returns (src_start, src_end, target_start, target_end)
        self.__send_WORD(CQi.CL_ALG2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(alg)
        return self.__recv_response()

    # ----------------------------------------------------------------------- #
    # 3.5 CQI_CQP_*                                                           #
    # ----------------------------------------------------------------------- #
    def cqp_query(self, mother_corpus, subcorpus_name, query):
        # INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query)
        # OUTPUT: CQI_STATUS_OK
        # <query> must include the ';' character terminating the query.
        self.__send_WORD(CQi.CQP_QUERY)
        self.__send_STRING(mother_corpus)
        self.__send_STRING(subcorpus_name)
        self.__send_STRING(query)
        # Interpret the response like every other command does, so that an
        # error reported for the query is raised instead of being dropped.
        self.__recv_response()

    def cqp_list_subcorpora(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CQi.CQP_LIST_SUBCORPORA)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def cqp_subcorpus_size(self, subcorpus):
        # INPUT: (STRING subcorpus)
        # OUTPUT: CQI_DATA_INT
        self.__send_WORD(CQi.CQP_SUBCORPUS_SIZE)
        self.__send_STRING(subcorpus)
        return self.__recv_response()

    def cqp_subcorpus_has_field(self, subcorpus, field):
        # INPUT: (STRING subcorpus, BYTE field)
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(CQi.CQP_SUBCORPUS_HAS_FIELD)
        self.__send_STRING(subcorpus)
        self.__send_BYTE(field)
        return self.__recv_response()

    def cqp_dump_subcorpus(self, subcorpus, field, first, last):
        # INPUT: (STRING subcorpus, BYTE field, INT first, INT last)
        # OUTPUT: CQI_DATA_INT_LIST
        # Dump the values of <field> for match ranges <first> .. <last>
        # in <subcorpus>. <field> is one of the CQI_CONST_FIELD_* constants.
        self.__send_WORD(CQi.CQP_DUMP_SUBCORPUS)
        self.__send_STRING(subcorpus)
        self.__send_BYTE(field)
        self.__send_INT(first)
        self.__send_INT(last)
        return self.__recv_response()

    def cqp_drop_subcorpus(self, subcorpus):
        # INPUT: (STRING subcorpus)
        # OUTPUT: CQI_STATUS_OK
        # delete a subcorpus from memory
        self.__send_WORD(CQi.CQP_DROP_SUBCORPUS)
        self.__send_STRING(subcorpus)
        self.__recv_response()

    # NOTE: The following two functions are temporarily included for the
    #       Euralex 2000 tutorial demo
    def cqp_fdist_1(self, subcorpus, cutoff, field, attribute):
        """ NOTE: frequency distribution of single tokens """
        # INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns <n> (id, frequency) pairs flattened into a list of size 2*<n>
        # field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET,
        # CQI_CONST_FIELD_KEYWORD
        # NB: pairs are sorted by frequency desc.
        self.__send_WORD(CQi.CQP_FDIST_1)
        self.__send_STRING(subcorpus)
        self.__send_INT(cutoff)
        self.__send_BYTE(field)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cqp_fdist_2(self, subcorpus, cutoff, field1, attribute1, field2,
                    attribute2):
        """ NOTE: frequency distribution of pairs of tokens """
        # INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1,
        #         BYTE field2, STRING attribute2)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns <n> (id1, id2, frequency) triples flattened into a list of
        # size 3*<n>
        # NB: triples are sorted by frequency desc.
        self.__send_WORD(CQi.CQP_FDIST_2)
        self.__send_STRING(subcorpus)
        self.__send_INT(cutoff)
        self.__send_BYTE(field1)
        self.__send_STRING(attribute1)
        self.__send_BYTE(field2)
        self.__send_STRING(attribute2)
        return self.__recv_response()

    # ----------------------------------------------------------------------- #
    # Receiving                                                               #
    # ----------------------------------------------------------------------- #
    def __recv_response(self):
        # Reads the response word and dispatches on its high byte (the
        # response family). Returns the status word for CQI_STATUS_* and the
        # decoded payload for CQI_DATA_*; raises for every error family.
        word = self.__recv_WORD()
        response_type = word >> 8
        if response_type == CQi.STATUS:
            return word
        if response_type == CQi.DATA:
            return self.__recv_DATA(word)
        if response_type in (CQi.ERROR, CQi.CL_ERROR, CQi.CQP_ERROR):
            # An error code missing from the lookup table must still surface
            # as the protocol error it is, not as a KeyError.
            raise Exception(
                CQi.lookup.get(
                    word, 'Unknown error code: {}'.format(hex(word))
                )
            )
        raise Exception(
            'Unknown response type: {}'.format(hex(response_type))
        )

    def __recv_DATA(self, data_type):
        # Reads the payload announced by the CQI_DATA_* word <data_type>.
        readers = {
            CQi.DATA_BYTE: self.__recv_DATA_BYTE,
            CQi.DATA_BOOL: self.__recv_DATA_BOOL,
            CQi.DATA_INT: self.__recv_DATA_INT,
            CQi.DATA_STRING: self.__recv_DATA_STRING,
            CQi.DATA_BYTE_LIST: self.__recv_DATA_BYTE_LIST,
            CQi.DATA_BOOL_LIST: self.__recv_DATA_BOOL_LIST,
            CQi.DATA_INT_LIST: self.__recv_DATA_INT_LIST,
            CQi.DATA_STRING_LIST: self.__recv_DATA_STRING_LIST,
            CQi.DATA_INT_INT: self.__recv_DATA_INT_INT,
            CQi.DATA_INT_INT_INT_INT: self.__recv_DATA_INT_INT_INT_INT,
            CQi.DATA_INT_TABLE: self.__recv_DATA_INT_TABLE,
        }
        reader = readers.get(data_type)
        if reader is None:
            raise Exception('Unknown data type: {}'.format(hex(data_type)))
        return reader()

    def __recv_exact(self, n):
        # socket.recv may return fewer bytes than requested, so keep reading
        # until exactly <n> bytes have arrived.
        chunks = []
        remaining = n
        while remaining > 0:
            chunk = self.connection.recv(remaining)
            if not chunk:
                raise ConnectionError(
                    'Connection closed by the CQi server, {} more byte(s) '
                    'expected'.format(remaining)
                )
            chunks.append(chunk)
            remaining -= len(chunk)
        return b''.join(chunks)

    def __recv_DATA_BYTE(self):
        return struct.unpack('!B', self.__recv_exact(1))[0]

    def __recv_DATA_BOOL(self):
        return struct.unpack('!?', self.__recv_exact(1))[0]

    def __recv_DATA_INT(self):
        return struct.unpack('!i', self.__recv_exact(4))[0]

    def __recv_DATA_STRING(self):
        # A string is a 16 bit length followed by that many encoded bytes.
        n = self.__recv_WORD()
        return self.__recv_exact(n).decode()

    def __recv_DATA_BYTE_LIST(self):
        # Every list is a 32 bit element count followed by the elements.
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_BYTE() for _ in range(n)]

    def __recv_DATA_BOOL_LIST(self):
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_BOOL() for _ in range(n)]

    def __recv_DATA_INT_LIST(self):
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_INT() for _ in range(n)]

    def __recv_DATA_STRING_LIST(self):
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_STRING() for _ in range(n)]

    def __recv_DATA_INT_INT(self):
        first = self.__recv_DATA_INT()
        second = self.__recv_DATA_INT()
        return (first, second)

    def __recv_DATA_INT_INT_INT_INT(self):
        # Read one by one so that the order of the values is explicit.
        first = self.__recv_DATA_INT()
        second = self.__recv_DATA_INT()
        third = self.__recv_DATA_INT()
        fourth = self.__recv_DATA_INT()
        return (first, second, third, fourth)

    def __recv_DATA_INT_TABLE(self):
        # The table is sent as row count, column count and then the cells row
        # by row; it is returned as a list of rows.
        rows = self.__recv_DATA_INT()
        columns = self.__recv_DATA_INT()
        return [[self.__recv_DATA_INT() for _ in range(columns)]
                for _ in range(rows)]

    def __recv_WORD(self):
        return struct.unpack('!H', self.__recv_exact(2))[0]

    # ----------------------------------------------------------------------- #
    # Sending                                                                 #
    # ----------------------------------------------------------------------- #
    @staticmethod
    def __pack_STRING(string_data):
        # A string is sent UTF-8 encoded, prefixed with its 16 bit byte length.
        encoded_string_data = string_data.encode('utf-8')
        return struct.pack('!H', len(encoded_string_data)) + encoded_string_data

    def __send_BYTE(self, byte_data):
        self.connection.sendall(struct.pack('!B', byte_data))

    def __send_BOOL(self, bool_data):
        self.connection.sendall(struct.pack('!?', bool_data))

    def __send_INT(self, int_data):
        self.connection.sendall(struct.pack('!i', int_data))

    def __send_STRING(self, string_data):
        self.connection.sendall(self.__pack_STRING(string_data))

    def __send_INT_LIST(self, int_list_data):
        # Pack the count and all elements first, so that nothing is sent when
        # one of the elements cannot be packed.
        n = len(int_list_data)
        data = struct.pack('!i{}i'.format(n), n, *int_list_data)
        self.connection.sendall(data)

    def __send_STRING_LIST(self, string_list_data):
        parts = [struct.pack('!i', len(string_list_data))]
        parts.extend(self.__pack_STRING(string_data)
                     for string_data in string_list_data)
        self.connection.sendall(b''.join(parts))

    def __send_WORD(self, word_data):
        self.connection.sendall(struct.pack('!H', word_data))

View File

@ -0,0 +1,240 @@
from .CQiClient import CQiClient
import multiprocessing
import collections
import socket
class CQiWrapper(CQiClient):
"""
CQiWrapper object
High level wrapper that groups and renames some functions of CQiClient
for ease of use. Also structures received data into python dictionaries.
Keyword arguments:
username -- username used to connect to the cqp server
password -- password of the user to connect to the cqp server
"""
SUBCORPUS_NAMES = []
def __init__(self, host='127.0.0.1', port=4877, username='opaque',
             password='opaque'):
    """
    Create the wrapper and open the socket to the CQP server
    The socket is opened by the CQiClient constructor. The credentials are
    only stored here; they are sent to the server by connect().
    Keyword arguments:
    host -- host name or address of the CQP server (default '127.0.0.1')
    port -- TCP port of the CQP server (default 4877)
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    """
    super(CQiWrapper, self).__init__(host=host, port=port)
    self.username = username
    self.password = password
    # Each wrapper keeps its own list. Appending to the class level list
    # would make every instance see the subcorpora created through all
    # other instances.
    self.SUBCORPUS_NAMES = []
def connect(self):
    """
    Log in to the CQP server
    Sends the username and password that were stored when the object was
    created to the server by calling ctrl_connect.
    """
    credentials = (self.username, self.password)
    self.ctrl_connect(*credentials)
def create_attribute_strings(self, corpus_name):
    """
    Build the attribute names of a corpus
    Stores the names '<corpus_name>.word', '.lemma', '.pos', '.sem' and
    '.entry' as well as '<corpus_name>.entry_author' and
    '<corpus_name>.entry_title' in the corresponding *_str attributes and
    collects all seven of them, in that order, in self.attributes.
    Keyword arguments:
    corpus_name -- name of the corpus the attribute names are built for
    """
    for suffix in ('word', 'lemma', 'pos', 'sem', 'entry'):
        setattr(self, suffix + '_str', corpus_name + '.' + suffix)
    # The two entry sub-attributes extend the name of the entry attribute.
    self.entry_author_str = self.entry_str + '_author'
    self.entry_title_str = self.entry_str + '_title'
    ordered_names = ('word_str', 'lemma_str', 'pos_str', 'sem_str',
                     'entry_str', 'entry_author_str', 'entry_title_str')
    self.attributes = [getattr(self, name) for name in ordered_names]
def disconnect(self):
    """
    Disconnect from CQP server
    Disconnects from the CQP server. Closes used socket after disconnect,
    even if the server does not acknowledge the disconnect.
    """
    try:
        self.ctrl_bye()
    finally:
        # Close the socket in every case, so that a failing CTRL_BYE does
        # not leak the connection. The error itself is still raised.
        self.connection.close()
def query_subcorpus(self, corpus_name, result_subcorpus_name, query):
"""
Create subcorpus
Input query will be used to create a subcorpus holding all cpos match
positions for that query.
Keyword arguments:
corpus_name -- name of the corpus the query will be used on
result_subcorpus_name -- user set name of the subcorpus which holds all
cpos match positions, produced by the query
query -- query written in cqp query language
"""
self.cqp_query(corpus_name, result_subcorpus_name, query)
self.result_subcorpus_ns = (corpus_name
+ ':'
+ result_subcorpus_name)
self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns)
print('Nr of all matches is:', self.nr_matches)
def show_subcorpora(self):
print('Known subcorpora:', self.SUBCORPUS_NAMES)
return self.SUBCORPUS_NAMES
def show_results(self,
corpus_name,
result_start_count=0,
result_max_count=50,
context_len=10,):
"""
Show query results
Shows the actual matched strings produce by the query. Uses the cpos
match indexes to grab those strings. saves them into an orderd
dictionary. Also saves coresponding tags, lemmas and context:
OrderedDict([
(0,
{
'tokens': ['Big', 'Brother', 'himself'],
'lemmas': ['big', 'brother', 'himself'],
'pos_tags': ['JJ', 'NN1', 'PPX1'],
'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|',
'|Z8m|'],
'context_before': ['figures', 'of', 'the', 'Party', ',',
'almost', 'on', 'a', 'level', 'with'],
'context_after': [',', 'and', 'then', 'had', 'engaged',
'in', 'counter-revolu-', 'tionary',
'activities', ','],
'entry_title': '1984', 'entry_author':
'george_orwell',
'cpos_start': 110490,
'cpos_end': 110492
}
)
])
Keyword arguments:
corpus_name -- name of the parent corpus the subcorpus is part of
result_start_count -- start position of the dumped subcorpus.
(default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
matches 50 to 100 will be shown.
result_max_count -- defines how many matches at once will be shown.
(default 50)
context_len -- defines how many words before and after a match will be
shown (default 10)
"""
self.context_len = context_len
word_str = corpus_name + '.word'
self.corpus_max_len = self.cl_attribute_size(word_str)
if self.nr_matches == 0:
print('Query resulted in 0 matches.')
else:
if self.nr_matches <= 50:
matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
0x10,
0,
self.nr_matches - 1)
matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
0x11,
0, self.nr_matches - 1)
else:
matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
0x10,
result_start_count,
result_max_count - 1)
matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
0x11,
result_start_count,
result_max_count - 1)
match_indexes = zip(matches_start, matches_end)
matches = []
manager = multiprocessing.Manager()
return_dict = manager.dict()
for i, index_pair in enumerate(match_indexes):
match = multiprocessing.Process(target=self.__get_matches,
args=(i,
index_pair,
corpus_name,
return_dict))
matches.append(match)
match.start()
for match in matches:
match.join()
# sort matches into ordered dict
ordered_results = collections.OrderedDict()
for key in sorted(return_dict.keys()):
ordered_results[key] = return_dict[key]
print('ORDERED_RESULTS', ordered_results)
def __get_matches(self, i, index_pair, corpus_name, return_dict):
"""
Get matches as readable output
Gets the actual match strings of cpos match indexes. Private helper
method used in show_results.
Keyword arguments:
i -- serial number for match at given cpos
index_pair -- match start and match end cpos
corpus_name -- name of the parent corpus
return_dict -- dictionary created with manager.dict() that holds the
extracted strings tags etc.
"""
print('START:', index_pair[0])
print('END:', index_pair[1])
print('=============================')
tmp_session = CQiWrapper(username=self.username, password=self.password,
host=self.host, port=self.port)
tmp_session.connect()
tokens = tmp_session.cl_cpos2str(self.word_str,
range(index_pair[0],
index_pair[1] + 1))
lemmas = tmp_session.cl_cpos2str(self.lemma_str,
range(index_pair[0],
index_pair[1] + 1))
pos_tags = tmp_session.cl_cpos2str(self.pos_str,
range(index_pair[0],
index_pair[1] + 1))
sem_tags = tmp_session.cl_cpos2str(self.sem_str,
range(index_pair[0],
index_pair[1] + 1))
struc_entry = tmp_session.cl_cpos2struc(self.entry_str,
range(index_pair[0],
index_pair[1] + 1))
before_index = max([0, index_pair[0] - self.context_len])
after_index = min([self.corpus_max_len,
index_pair[1] + self.context_len])
context_before = tmp_session.cl_cpos2str(self.word_str,
range(before_index,
index_pair[0]))
context_after = tmp_session.cl_cpos2str(self.word_str,
range(index_pair[1] + 1,
after_index + 1))
entry_titles = tmp_session.cl_struc2str(self.entry_title_str,
struc_entry)
entry_authors = tmp_session.cl_struc2str(self.entry_author_str,
struc_entry)
return_dict[i] = {'tokens': tokens,
'lemmas': lemmas,
'pos_tags': pos_tags,
'sem_tags': sem_tags,
'context_before': context_before,
'context_after': context_after,
'entry_title': entry_titles[0],
'entry_author': entry_authors[0],
'cpos_start': index_pair[0],
'cpos_end': index_pair[1]}
tmp_session.disconnect()
def get_cpos_info(self, cpos):
match_dict = collections.OrderedDict()
for attribute in self.attributes:
if '.entry' not in attribute:
match_str = self.cl_cpos2str(attribute, range(cpos[0], cpos[1]))
match_dict[attribute] = match_str
else:
continue
print(match_dict)