diff --git a/app/corpora/CQiClient/CQi.py b/app/corpora/CQiClient/CQi.py deleted file mode 100644 index 5d39395a..00000000 --- a/app/corpora/CQiClient/CQi.py +++ /dev/null @@ -1,406 +0,0 @@ -# ########################################################################### # -# IMS CQi specification # -# # -# Version: 0.1a ;o) # -# Author: Stefan Evert (evert@ims.uni-stuttgart.de) # -# Modified by: Patrick Jentsch # -# ########################################################################### # - - -""" 1. padding """ -PAD = 0x00 - - -""" 2. CQi responses """ -""" 2.1 CQI_STATUS_* """ -STATUS = 0x01 -STATUS_OK = 0x0101 -STATUS_CONNECT_OK = 0x0102 -STATUS_BYE_OK = 0x0103 -STATUS_PING_OK = 0x0104 - -""" 2.2 CQI_ERROR_* """ -ERROR = 0x02 -ERROR_GENERAL_ERROR = 0x0201 -ERROR_CONNECT_REFUSED = 0x0202 -ERROR_USER_ABORT = 0x0203 -ERROR_SYNTAX_ERROR = 0x0204 -# includes corpus/attribute/subcorpus specifier syntax - -""" 2.3 CQI_DATA_* """ -DATA = 0x03 -DATA_BYTE = 0x0301 -DATA_BOOL = 0x0302 -DATA_INT = 0x0303 -DATA_STRING = 0x0304 -DATA_BYTE_LIST = 0x0305 -DATA_BOOL_LIST = 0x0306 -DATA_INT_LIST = 0x0307 -DATA_STRING_LIST = 0x0308 -DATA_INT_INT = 0x0309 -DATA_INT_INT_INT_INT = 0x030A -DATA_INT_TABLE = 0x030B - -""" 2.4 CQI_CL_ERROR_* """ -""" -" NOTE: some CL error codes are not represented in the CQi specs -" - usually because they're not used in the CL any more -" - CDA_ENOSTRING is not considered an error (returns -1) -" - CDA_EARGS: dynamic attribute calls not yet supported -""" -CL_ERROR = 0x04 -CL_ERROR_NO_SUCH_ATTRIBUTE = 0x0401 -# returned if CQi server couldn't open attribute -CL_ERROR_WRONG_ATTRIBUTE_TYPE = 0x0402 -# CDA_EATTTYPE -CL_ERROR_OUT_OF_RANGE = 0x0403 -# CDA_EIDORNG, CDA_EIDXORNG, CDA_EPOSORNG -CL_ERROR_REGEX = 0x0404 -# CDA_EPATTERN (not used), CDA_EBADREGEX -CL_ERROR_CORPUS_ACCESS = 0x0405 -# CDA_ENODATA -CL_ERROR_OUT_OF_MEMORY = 0x0406 -# CDA_ENOMEM -# this means the CQi server has run out of memory; -# try discarding some other corpora and/or subcorpora -CL_ERROR_INTERNAL = 0x0407 -# CDA_EOTHER, CDA_ENYI -# this is the classical 'please contact technical support' error - -""" 2.5 CQI_CQP_ERROR_* """ -CQP_ERROR = 0x05 -# CQP error messages yet to be defined -CQP_ERROR_GENERAL = 0x0501 -CQP_ERROR_NO_SUCH_CORPUS = 0x0502 -CQP_ERROR_INVALID_FIELD = 0x0503 -CQP_ERROR_OUT_OF_RANGE = 0x0504 -# various cases where a number is out of range - - -""" 3. CQi commands """ -""" 3.1 CQI_CTRL_* """ -CTRL = 0x11 -CTRL_CONNECT = 0x1101 -# INPUT: (STRING username, STRING password) -# OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED -CTRL_BYE = 0x1102 -# INPUT: () -# OUTPUT: CQI_STATUS_BYE_OK -CTRL_USER_ABORT = 0x1103 -# INPUT: () -# OUTPUT: -CTRL_PING = 0x1104 -# INPUT: () -# OUTPUT: CQI_STATUS_PING_OK -CTRL_LAST_GENERAL_ERROR = 0x1105 -# INPUT: () -# OUTPUT: CQI_DATA_STRING -# full-text error message for the last general error reported by the CQi server - -""" 3.2 CQI_ASK_FEATURE_* """ -ASK_FEATURE = 0x12 -ASK_FEATURE_CQI_1_0 = 0x1201 -# INPUT: () -# OUTPUT: CQI_DATA_BOOL -ASK_FEATURE_CL_2_3 = 0x1202 -# INPUT: () -# OUTPUT: CQI_DATA_BOOL -ASK_FEATURE_CQP_2_3 = 0x1203 -# INPUT: () -# OUTPUT: CQI_DATA_BOOL - -""" 3.3 CQI_CORPUS_* """ -CORPUS = 0x13 -CORPUS_LIST_CORPORA = 0x1301 -# INPUT: () -# OUTPUT: CQI_DATA_STRING_LIST -CORPUS_CHARSET = 0x1303 -# INPUT: (STRING corpus) -# OUTPUT: CQI_DATA_STRING -CORPUS_PROPERTIES = 0x1304 -# INPUT: (STRING corpus) -# OUTPUT: CQI_DATA_STRING_LIST -CORPUS_POSITIONAL_ATTRIBUTES = 0x1305 -# INPUT: (STRING corpus) -# OUTPUT: CQI_DATA_STRING_LIST -CORPUS_STRUCTURAL_ATTRIBUTES = 0x1306 -# INPUT: (STRING corpus) -# OUTPUT: CQI_DATA_STRING_LIST -CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES = 0x1307 -# INPUT: (STRING attribute) -# OUTPUT: CQI_DATA_BOOL -CORPUS_ALIGNMENT_ATTRIBUTES = 0x1308 -# INPUT: (STRING corpus) -# OUTPUT: CQI_DATA_STRING_LIST -CORPUS_FULL_NAME = 0x1309 -# INPUT: (STRING corpus) -# OUTPUT: CQI_DATA_STRING -# the full name of as specified in its registry entry -CORPUS_INFO = 0x130A -# INPUT: (STRING corpus) -# OUTPUT: CQI_DATA_STRING_LIST -# returns the contents of the .info file of as a list of lines -CORPUS_DROP_CORPUS = 0x130B -# INPUT: (STRING corpus) -# OUTPUT: CQI_STATUS_OK -# try to unload a corpus and all its attributes from memory - -""" 3.4 CQI_CL_* """ -CL = 0x14 -# low-level corpus access (CL functions) -CL_ATTRIBUTE_SIZE = 0x1401 -# INPUT: (STRING attribute) -# OUTPUT: CQI_DATA_INT -# returns the size of : -# - number of tokens (positional) -# - number of regions (structural) -# - number of alignments (alignment) -CL_LEXICON_SIZE = 0x1402 -# INPUT: (STRING attribute) -# OUTPUT: CQI_DATA_INT -# returns the number of entries in the lexicon of a positional attribute; -# valid lexicon IDs range from 0 .. (lexicon_size - 1) -CL_DROP_ATTRIBUTE = 0x1403 -# INPUT: (STRING attribute) -# OUTPUT: CQI_STATUS_OK -# unload attribute from memory -""" -" NOTE: simple (scalar) mappings are applied to lists (the returned list has -" exactly the same length as the list passed as an argument) -""" -CL_STR2ID = 0x1404 -# INPUT: (STRING attribute, STRING_LIST strings) -# OUTPUT: CQI_DATA_INT_LIST -# returns -1 for every string in that is not found in the lexicon -CL_ID2STR = 0x1405 -# INPUT: (STRING attribute, INT_LIST id) -# OUTPUT: CQI_DATA_STRING_LIST -# returns "" for every ID in that is out of range -CL_ID2FREQ = 0x1406 -# INPUT: (STRING attribute, INT_LIST id) -# OUTPUT: CQI_DATA_INT_LIST -# returns 0 for every ID in that is out of range -CL_CPOS2ID = 0x1407 -# INPUT: (STRING attribute, INT_LIST cpos) -# OUTPUT: CQI_DATA_INT_LIST -# returns -1 for every corpus position in that is out of range -CL_CPOS2STR = 0x1408 -# INPUT: (STRING attribute, INT_LIST cpos) -# OUTPUT: CQI_DATA_STRING_LIST -# returns "" for every corpus position in that is out of range -CL_CPOS2STRUC = 0x1409 -# INPUT: (STRING attribute, INT_LIST cpos) -# OUTPUT: CQI_DATA_INT_LIST -# returns -1 for every corpus position not inside a structure region -""" -" NOTE: temporary addition for the Euralex2000 tutorial, but should probably be -" included in CQi specs -""" -CL_CPOS2LBOUND = 0x1420 -# INPUT: (STRING attribute, INT_LIST cpos) -# OUTPUT: CQI_DATA_INT_LIST -# returns left boundary of s-attribute region enclosing cpos, -1 if not in -# region -CL_CPOS2RBOUND = 0x1421 -# INPUT: (STRING attribute, INT_LIST cpos) -# OUTPUT: CQI_DATA_INT_LIST -# returns right boundary of s-attribute region enclosing cpos, -1 if not in -# region -CL_CPOS2ALG = 0x140A -# INPUT: (STRING attribute, INT_LIST cpos) -# OUTPUT: CQI_DATA_INT_LIST -# returns -1 for every corpus position not inside an alignment -CL_STRUC2STR = 0x140B -# INPUT: (STRING attribute, INT_LIST strucs) -# OUTPUT: CQI_DATA_STRING_LIST -# returns annotated string values of structure regions in ; "" if out -# of range -# check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES() first -""" -" NOTE: the following mappings take a single argument and return multiple -" values, including lists of arbitrary size -""" -CL_ID2CPOS = 0x140C -# INPUT: (STRING attribute, INT id) -# OUTPUT: CQI_DATA_INT_LIST -# returns all corpus positions where the given token occurs -CL_IDLIST2CPOS = 0x140D -# INPUT: (STRING attribute, INT_LIST id_list) -# OUTPUT: CQI_DATA_INT_LIST -# returns all corpus positions where one of the tokens in -# occurs; the returned list is sorted as a whole, not per token id -CL_REGEX2ID = 0x140E -# INPUT: (STRING attribute, STRING regex) -# OUTPUT: CQI_DATA_INT_LIST -# returns lexicon IDs of all tokens that match ; the returned -# list may be empty (size 0); -CL_STRUC2CPOS = 0x140F -# INPUT: (STRING attribute, INT struc) -# OUTPUT: CQI_DATA_INT_INT -# returns start and end corpus positions of structure region -CL_ALG2CPOS = 0x1410 -# INPUT: (STRING attribute, INT alg) -# OUTPUT: CQI_DATA_INT_INT_INT_INT -# returns (src_start, src_end, target_start, target_end) - -""" 3.5 CQI_CQP_* """ -CQP = 0x15 -CQP_QUERY = 0x1501 -# INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query) -# OUTPUT: CQI_STATUS_OK -# must include the ';' character terminating the query. -CQP_LIST_SUBCORPORA = 0x1502 -# INPUT: (STRING corpus) -# OUTPUT: CQI_DATA_STRING_LIST -CQP_SUBCORPUS_SIZE = 0x1503 -# INPUT: (STRING subcorpus) -# OUTPUT: CQI_DATA_INT -CQP_SUBCORPUS_HAS_FIELD = 0x1504 -# INPUT: (STRING subcorpus, BYTE field) -# OUTPUT: CQI_DATA_BOOL -CQP_DUMP_SUBCORPUS = 0x1505 -# INPUT: (STRING subcorpus, BYTE field, INT first, INT last) -# OUTPUT: CQI_DATA_INT_LIST -# Dump the values of for match ranges .. in . -# is one of the CQI_CONST_FIELD_* constants. -CQP_DROP_SUBCORPUS = 0x1509 -# INPUT: (STRING subcorpus) -# OUTPUT: CQI_STATUS_OK -# delete a subcorpus from memory -""" -" NOTE: The following two functions are temporarily included for the Euralex -" 2000 tutorial demo -""" -""" NOTE: frequency distribution of single tokens """ -CQP_FDIST_1 = 0x1510 -# INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute) -# OUTPUT: CQI_DATA_INT_LIST -# returns (id, frequency) pairs flattened into a list of size 2* -# field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET, -# CQI_CONST_FIELD_KEYWORD -# NB: pairs are sorted by frequency desc. -""" NOTE: frequency distribution of pairs of tokens """ -CQP_FDIST_2 = 0x1511 -# INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1, -# BYTE field2, STRING attribute2) -# OUTPUT: CQI_DATA_INT_LIST -# returns (id1, id2, frequency) pairs flattened into a list of size 3* -# NB: triples are sorted by frequency desc. - - -""" 4. Constant Definitions """ -CONST_FALSE = 0x00 -CONST_NO = 0x00 -CONST_TRUE = 0x01 -CONST_YES = 0x01 -""" -" NOTE: The following constants specify which field will be returned by -" CQI_CQP_DUMP_SUBCORPUS and some other subcorpus commands. -""" -CONST_FIELD_MATCH = 0x10 -CONST_FIELD_MATCHEND = 0x11 -""" -" NOTE: The constants specifiying target0 .. target9 are guaranteed to have the -" numerical values 0 .. 9, so clients do not need to look up the constant -" values if they're handling arbitrary targets. -""" -CONST_FIELD_TARGET_0 = 0x00 -CONST_FIELD_TARGET_1 = 0x01 -CONST_FIELD_TARGET_2 = 0x02 -CONST_FIELD_TARGET_3 = 0x03 -CONST_FIELD_TARGET_4 = 0x04 -CONST_FIELD_TARGET_5 = 0x05 -CONST_FIELD_TARGET_6 = 0x06 -CONST_FIELD_TARGET_7 = 0x07 -CONST_FIELD_TARGET_8 = 0x08 -CONST_FIELD_TARGET_9 = 0x09 -""" -" NOTE: The following constants are provided for backward compatibility with -" traditional CQP field names & while the generalised target concept -" isn't yet implemented in the CQPserver. -""" -CONST_FIELD_TARGET = 0x00 -CONST_FIELD_KEYWORD = 0x09 -""" NOTE: CQi version is CQI_MAJOR_VERSION.CQI_MINOR_VERSION """ -MAJOR_VERSION = 0x00 -MINOR_VERSION = 0x01 - - -""" 5. CQi lookup dictionary. """ -lookup = { - 257: 'CQI_STATUS_OK', - 258: 'CQI_STATUS_CONNECT_OK', - 259: 'CQI_STATUS_BYE_OK', - 260: 'CQI_STATUS_PING_OK', - 513: 'CQI_ERROR_GENERAL_ERROR', - 514: 'CQI_ERROR_CONNECT_REFUSED', - 515: 'CQI_ERROR_USER_ABORT', - 516: 'CQI_ERROR_SYNTAX_ERROR', - 769: 'CQI_DATA_BYTE', - 770: 'CQI_DATA_BOOL', - 771: 'CQI_DATA_INT', - 772: 'CQI_DATA_STRING', - 773: 'CQI_DATA_BYTE_LIST', - 774: 'CQI_DATA_BOOL_LIST', - 775: 'CQI_DATA_INT_LIST', - 776: 'CQI_DATA_STRING_LIST', - 777: 'CQI_DATA_INT_INT', - 778: 'CQI_DATA_INT_INT_INT_INT', - 779: 'CQI_DATA_INT_TABLE', - 1025: 'CQI_CL_ERROR_NO_SUCH_ATTRIBUTE', - 1026: 'CQI_CL_ERROR_WRONG_ATTRIBUTE_TYPE', - 1027: 'CQI_CL_ERROR_OUT_OF_RANGE', - 1028: 'CQI_CL_ERROR_REGEX', - 1029: 'CQI_CL_ERROR_CORPUS_ACCESS', - 1030: 'CQI_CL_ERROR_OUT_OF_MEMORY', - 1031: 'CQI_CL_ERROR_INTERNAL', - 1281: 'CQI_CQP_ERROR_GENERAL', - 1282: 'CQI_CQP_ERROR_NO_SUCH_CORPUS', - 1283: 'CQI_CQP_ERROR_INVALID_FIELD', - 1284: 'CQI_CQP_ERROR_OUT_OF_RANGE', - 4353: 'CQI_CTRL_CONNECT', - 4354: 'CQI_CTRL_BYE', - 4355: 'CQI_CTRL_USER_ABORT', - 4356: 'CQI_CTRL_PING', - 4357: 'CQI_CTRL_LAST_GENERAL_ERROR', - 4609: 'CQI_ASK_FEATURE_CQI_1_0', - 4610: 'CQI_ASK_FEATURE_CL_2_3', - 4611: 'CQI_ASK_FEATURE_CQP_2_3', - 4865: 'CQI_CORPUS_LIST_CORPORA', - 4867: 'CQI_CORPUS_CHARSET', - 4868: 'CQI_CORPUS_PROPERTIES', - 4869: 'CQI_CORPUS_POSITIONAL_ATTRIBUTES', - 4870: 'CQI_CORPUS_STRUCTURAL_ATTRIBUTES', - 4871: 'CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES', - 4872: 'CQI_CORPUS_ALIGNMENT_ATTRIBUTES', - 4873: 'CQI_CORPUS_FULL_NAME', - 4874: 'CQI_CORPUS_INFO', - 4875: 'CQI_CORPUS_DROP_CORPUS', - 5121: 'CQI_CL_ATTRIBUTE_SIZE', - 5122: 'CQI_CL_LEXICON_SIZE', - 5123: 'CQI_CL_DROP_ATTRIBUTE', - 5124: 'CQI_CL_STR2ID', - 5125: 'CQI_CL_ID2STR', - 5126: 'CQI_CL_ID2FREQ', - 5127: 'CQI_CL_CPOS2ID', - 5128: 'CQI_CL_CPOS2STR', - 5129: 'CQI_CL_CPOS2STRUC', - 5130: 'CQI_CL_CPOS2ALG', - 5131: 'CQI_CL_STRUC2STR', - 5132: 'CQI_CL_ID2CPOS', - 5133: 'CQI_CL_IDLIST2CPOS', - 5134: 'CQI_CL_REGEX2ID', - 5135: 'CQI_CL_STRUC2CPOS', - 5136: 'CQI_CL_ALG2CPOS', - 5152: 'CQI_CL_CPOS2LBOUND', - 5153: 'CQI_CL_CPOS2RBOUND', - 5377: 'CQI_CQP_QUERY', - 5378: 'CQI_CQP_LIST_SUBCORPORA', - 5379: 'CQI_CQP_SUBCORPUS_SIZE', - 5380: 'CQI_CQP_SUBCORPUS_HAS_FIELD', - 5381: 'CQI_CQP_DUMP_SUBCORPUS', - 5385: 'CQI_CQP_DROP_SUBCORPUS', - 5392: 'CQI_CQP_FDIST_1', - 5393: 'CQI_CQP_FDIST_2' -} diff --git a/app/corpora/CQiClient/CQiClient.py b/app/corpora/CQiClient/CQiClient.py deleted file mode 100644 index 39a24c4c..00000000 --- a/app/corpora/CQiClient/CQiClient.py +++ /dev/null @@ -1,611 +0,0 @@ -from . import CQi -import socket -import struct - - -class CQiClient: - def __init__(self, host='127.0.0.1', port=4877): - self.host = host - self.port = port - self.connection = socket.socket() - self.connection.connect((self.host, self.port)) - - def ctrl_connect(self, username, password): - # INPUT: (STRING username, STRING password) - # OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED - # print('CTRL_CONNECT') - self.__send_WORD(CQi.CTRL_CONNECT) - self.__send_STRING(username) - self.__send_STRING(password) - self.__recv_response() - - def ctrl_bye(self): - # INPUT: () - # OUTPUT: CQI_STATUS_BYE_OK - # print('CTRL_BYE') - self.__send_WORD(CQi.CTRL_BYE) - self.__recv_response() - - def ctrl_user_abort(self): - # INPUT: () - # OUTPUT: - # print('CTRL_USER_ABORT') - self.__send_WORD(CQi.CTRL_USER_ABORT) - - def ctrl_ping(self): - # INPUT: () - # OUTPUT: CQI_STATUS_PING_OK - # print('CTRL_PING') - self.__send_WORD(CQi.CTRL_PING) - self.__recv_response() - - def ctrl_last_general_error(self): - # INPUT: () - # OUTPUT: CQI_DATA_STRING - # full-text error message for the last general error reported by the - # CQi server - # print('CTRL_LAST_GENERAL_ERROR') - self.__send_WORD(CQi.CTRL_LAST_GENERAL_ERROR) - return self.__recv_response() - - def ask_feature_cqi_1_0(self): - # INPUT: () - # OUTPUT: CQI_DATA_BOOL - # print('ASK_FEATURE_CQI_1_0') - self.__send_WORD(CQi.ASK_FEATURE_CQI_1_0) - return self.__recv_response() - - def ask_feature_cl_2_3(self): - # INPUT: () - # OUTPUT: CQI_DATA_BOOL - # print('ASK_FEATURE_CL_2_3') - self.__send_WORD(CQi.ASK_FEATURE_CL_2_3) - return self.__recv_response() - - def ask_feature_cqp_2_3(self): - # INPUT: () - # OUTPUT: CQI_DATA_BOOL - # print('ASK_FEATURE_CL_2_3') - self.__send_WORD(CQi.ASK_FEATURE_CL_2_3) - return self.__recv_response() - - def corpus_list_coprora(self): - # INPUT: () - # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_LIST_CORPORA') - self.__send_WORD(CQi.CORPUS_LIST_CORPORA) - return self.__recv_response() - - def corpus_charset(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING - # print('CORPUS_CHARSET') - self.__send_WORD(CQi.CORPUS_CHARSET) - self.__send_STRING(corpus) - return self.__recv_response() - - def corpus_properties(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_PROPERTIES') - self.__send_WORD(CQi.CORPUS_PROPERTIES) - self.__send_STRING(corpus) - return self.__recv_response() - - def corpus_positional_attributes(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_POSITIONAL_ATTRIBUTES') - self.__send_WORD(CQi.CORPUS_POSITIONAL_ATTRIBUTES) - self.__send_STRING(corpus) - return self.__recv_response() - - def corpus_structural_attributes(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_STRUCTURAL_ATTRIBUTES') - self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTES) - self.__send_STRING(corpus) - return self.__recv_response() - - def corpus_structural_attribute_has_values(self, attribute): - # INPUT: (STRING attribute) - # OUTPUT: CQI_DATA_BOOL - # print('CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES') - self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES) - self.__send_STRING(attribute) - return self.__recv_response() - - def corpus_alignment_attributes(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_ALIGNMENT_ATTRIBUTES') - self.__send_WORD(CQi.CORPUS_ALIGNMENT_ATTRIBUTES) - self.__send_STRING(corpus) - return self.__recv_response() - - def corpus_full_name(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING - # the full name of as specified in its registry entry - # print('CORPUS_FULL_NAME') - self.__send_WORD(CQi.CORPUS_FULL_NAME) - self.__send_STRING(corpus) - return self.__recv_response() - - def corpus_info(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # returns the contents of the .info file of as a list of lines - # print('CORPUS_INFO') - self.__send_WORD(CQi.CORPUS_INFO) - self.__send_STRING(corpus) - return self.__recv_response() - - def corpus_drop_corpus(self, corpus): - ''' - ' Broken - ' TODO: Check what type of return value is provided by the server. - ''' - # INPUT: (STRING corpus) - # OUTPUT: CQI_STATUS_OK - # try to unload a corpus and all its attributes from memory - # print('CORPUS_DROP_CORPUS') - self.__send_WORD(CQi.CORPUS_DROP_CORPUS) - self.__send_STRING(corpus) - self.__recv_response() - - def cl_attribute_size(self, attribute): - # INPUT: (STRING attribute) - # OUTPUT: CQI_DATA_INT - # returns the size of : - # number of tokens (positional) - # number of regions (structural) - # number of alignments (alignment) - # print('CL_ATTRIBUTE_SIZE') - self.__send_WORD(CQi.CL_ATTRIBUTE_SIZE) - self.__send_STRING(attribute) - return self.__recv_response() - - def cl_lexicon_size(self, attribute): - # INPUT: (STRING attribute) - # OUTPUT: CQI_DATA_INT - # returns the number of entries in the lexicon of a positional - # attribute; - # valid lexicon IDs range from 0 .. (lexicon_size - 1) - # print('CL_LEXICON_SIZE') - self.__send_WORD(CQi.CL_LEXICON_SIZE) - self.__send_STRING(attribute) - return self.__recv_response() - - def cl_drop_attribute(self, attribute): - # INPUT: (STRING attribute) - # OUTPUT: CQI_STATUS_OK - # unload attribute from memory - # print('CL_DROP_ATTRIBUTE') - self.__send_WORD(CQi.CL_LEXICON_SIZE) - self.__send_STRING(attribute) - self.__recv_response() - - """ - " NOTE: simple (scalar) mappings are applied to lists (the returned list - " has exactly the same length as the list passed as an argument) - """ - - def cl_str2id(self, attribute, strings): - # INPUT: (STRING attribute, STRING_LIST strings) - # OUTPUT: CQI_DATA_INT_LIST - # returns -1 for every string in that is not found in the - # lexicon - # print('CL_STR2ID') - self.__send_WORD(CQi.CL_LEXICON_SIZE) - self.__send_STRING(attribute) - self.__send_STRING_LIST(strings) - return self.__recv_response() - - def cl_id2str(self, attribute, id): - # INPUT: (STRING attribute, INT_LIST id) - # OUTPUT: CQI_DATA_STRING_LIST - # returns "" for every ID in that is out of range - # print('CL_ID2STR') - self.__send_WORD(CQi.CL_ID2STR) - self.__send_STRING(attribute) - self.__send_INT_LIST(id) - return self.__recv_response() - - def cl_id2freq(self, attribute, id): - # INPUT: (STRING attribute, INT_LIST id) - # OUTPUT: CQI_DATA_INT_LIST - # returns 0 for every ID in that is out of range - # print('CL_ID2FREQ') - self.__send_WORD(CQi.CL_ID2FREQ) - self.__send_STRING(attribute) - self.__send_INT_LIST(id) - return self.__recv_response() - - def cl_cpos2id(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_INT_LIST - # returns -1 for every corpus position in that is out of range - # print('CL_CPOS2ID') - self.__send_WORD(CQi.CL_ID2FREQ) - self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response() - - def cl_cpos2str(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_STRING_LIST - # returns "" for every corpus position in that is out of range - # print('CL_CPOS2STR') - self.__send_WORD(CQi.CL_CPOS2STR) - self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response() - - def cl_cpos2struc(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_INT_LIST - # returns -1 for every corpus position not inside a structure region - # print('CL_CPOS2STRUC') - self.__send_WORD(CQi.CL_CPOS2STRUC) - self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response() - - """ - " NOTE: temporary addition for the Euralex2000 tutorial, but should - " probably be included in CQi specs - """ - - def cl_cpos2lbound(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_INT_LIST - # returns left boundary of s-attribute region enclosing cpos, -1 if not - # in region - # print('CL_CPOS2LBOUND') - self.__send_WORD(CQi.CL_CPOS2LBOUND) - self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response() - - def cl_cpos2rbound(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_INT_LIST - # returns right boundary of s-attribute region enclosing cpos, -1 if - # not in region - # print('CL_CPOS2RBOUND') - self.__send_WORD(CQi.CL_CPOS2RBOUND) - self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response() - - def cl_cpos2alg(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_INT_LIST - # returns -1 for every corpus position not inside an alignment - # print('CL_CPOS2ALG') - self.__send_WORD(CQi.CL_CPOS2ALG) - self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response() - - def cl_struc2str(self, attribute, strucs): - # INPUT: (STRING attribute, INT_LIST strucs) - # OUTPUT: CQI_DATA_STRING_LIST - # returns annotated string values of structure regions in ; "" - # if out of range - # check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES() first - # print('CL_STRUC2STR') - self.__send_WORD(CQi.CL_STRUC2STR) - self.__send_STRING(attribute) - self.__send_INT_LIST(strucs) - return self.__recv_response() - - """ - " NOTE: the following mappings take a single argument and return multiple - " values, including lists of arbitrary size - """ - - def cl_id2cpos(self, attribute, id): - # INPUT: (STRING attribute, INT id) - # OUTPUT: CQI_DATA_INT_LIST - # returns all corpus positions where the given token occurs - # print('CL_ID2CPOS') - self.__send_WORD(CQi.CL_ID2CPOS) - self.__send_STRING(attribute) - self.__send_INT(id) - return self.__recv_response() - - def cl_idlist2cpos(self, attribute, id_list): - # INPUT: (STRING attribute, INT_LIST id_list) - # OUTPUT: CQI_DATA_INT_LIST - # returns all corpus positions where one of the tokens in - # occurs; the returned list is sorted as a whole, not per token id - # print('CL_IDLIST2CPOS') - self.__send_WORD(CQi.CL_IDLIST2CPOS) - self.__send_STRING(attribute) - self.__send_INT_LIST(id_list) - return self.__recv_response() - - def cl_regex2id(self, attribute, regex): - # INPUT: (STRING attribute, STRING regex) - # OUTPUT: CQI_DATA_INT_LIST - # returns lexicon IDs of all tokens that match ; the returned - # list may be empty (size 0); - # print('CL_REGEX2ID') - self.__send_WORD(CQi.CL_REGEX2ID) - self.__send_STRING(attribute) - self.__send_STRING(regex) - return self.__recv_response() - - def cl_struc2cpos(self, attribute, struc): - # INPUT: (STRING attribute, INT struc) - # OUTPUT: CQI_DATA_INT_INT - # returns start and end corpus positions of structure region - # print('CL_STRUC2CPOS') - self.__send_WORD(CQi.CL_STRUC2CPOS) - self.__send_STRING(attribute) - self.__send_INT(struc) - return self.__recv_response() - - def cl_alg2cpos(self, attribute, alg): - # INPUT: (STRING attribute, INT alg) - # OUTPUT: CQI_DATA_INT_INT_INT_INT - # returns (src_start, src_end, target_start, target_end) - # print('CL_ALG2CPOS') - self.__send_WORD(CQi.CL_ALG2CPOS) - self.__send_STRING(attribute) - self.__send_INT(alg) - return self.__recv_response() - - def cqp_query(self, mother_corpus, subcorpus_name, query): - # INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query) - # OUTPUT: CQI_STATUS_OK - # must include the ';' character terminating the query. - # print('CQP_QUERY') - self.__send_WORD(CQi.CQP_QUERY) - self.__send_STRING(mother_corpus) - self.__send_STRING(subcorpus_name) - self.__send_STRING(query) - self.__recv_WORD() - - def cqp_list_subcorpora(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # print('CQP_LIST_SUBCORPORA') - self.__send_WORD(CQi.CQP_LIST_SUBCORPORA) - self.__send_STRING(corpus) - return self.__recv_response() - - def cqp_subcorpus_size(self, subcorpus): - # INPUT: (STRING subcorpus) - # OUTPUT: CQI_DATA_INT - # print('CQP_SUBCORPUS_SIZE') - self.__send_WORD(CQi.CQP_SUBCORPUS_SIZE) - self.__send_STRING(subcorpus) - return self.__recv_response() - - def cqp_subcorpus_has_field(self, subcorpus, field): - # INPUT: (STRING subcorpus, BYTE field) - # OUTPUT: CQI_DATA_BOOL - # print('CQP_SUBCORPUS_HAS_FIELD') - self.__send_WORD(CQi.CQP_SUBCORPUS_HAS_FIELD) - self.__send_STRING(subcorpus) - self.__send_BYTE(field) - return self.__recv_response() - - def cqp_dump_subcorpus(self, subcorpus, field, first, last): - # INPUT: (STRING subcorpus, BYTE field, INT first, INT last) - # OUTPUT: CQI_DATA_INT_LIST - # Dump the values of for match ranges .. - # in . is one of the CQI_CONST_FIELD_* constants. - # print('CQP_DUMP_SUBCORPUS') - self.__send_WORD(CQi.CQP_DUMP_SUBCORPUS) - self.__send_STRING(subcorpus) - self.__send_BYTE(field) - self.__send_INT(first) - self.__send_INT(last) - return self.__recv_response() - - def cqp_drop_subcorpus(self, subcorpus): - # INPUT: (STRING subcorpus) - # OUTPUT: CQI_STATUS_OK - # delete a subcorpus from memory - # print('CQP_DROP_SUBCORPUS') - self.__send_WORD(CQi.CQP_DROP_SUBCORPUS) - self.__send_STRING(subcorpus) - self.__recv_response() - - """ - " NOTE: The following two functions are temporarily included for the - " Euralex 2000 tutorial demo - """ - - def cqp_fdist_1(self, subcorpus, cutoff, field, attribute): - """ NOTE: frequency distribution of single tokens """ - # INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute) - # OUTPUT: CQI_DATA_INT_LIST - # returns (id, frequency) pairs flattened into a list of size 2* - # field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET, - # CQI_CONST_FIELD_KEYWORD - # NB: pairs are sorted by frequency desc. - # print('CQP_FDIST_1') - self.__send_WORD(CQi.CQP_FDIST_1) - self.__send_STRING(subcorpus) - self.__send_INT(cutoff) - self.__send_BYTE(field) - self.__send_STRING(attribute) - return self.__recv_response() - - def cqp_fdist_2(self, subcorpus, cutoff, field1, attribute1, field2, - attribute2): - """ NOTE: frequency distribution of pairs of tokens """ - # INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1, - # BYTE field2, STRING attribute2) - # OUTPUT: CQI_DATA_INT_LIST - # returns (id1, id2, frequency) pairs flattened into a list of size - # 3* - # NB: triples are sorted by frequency desc. - # print('CQP_FDIST_2') - self.__send_WORD(CQi.CQP_FDIST_2) - self.__send_STRING(subcorpus) - self.__send_INT(cutoff) - self.__send_BYTE(field1) - self.__send_STRING(attribute1) - self.__send_BYTE(field2) - self.__send_STRING(attribute2) - return self.__recv_response() - - def __recv_response(self): - byte_data = self.__recv_WORD() - response_type = byte_data >> 8 - if response_type == CQi.STATUS: - response = byte_data - elif response_type == CQi.ERROR: - raise Exception(CQi.lookup[byte_data]) - elif response_type == CQi.DATA: - response = self.__recv_DATA(byte_data) - elif response_type == CQi.CL_ERROR: - raise Exception(CQi.lookup[byte_data]) - elif response_type == CQi.CQP_ERROR: - raise Exception(CQi.lookup[byte_data]) - else: - raise Exception( - 'Unknown response type: {}'.format(hex(response_type)) - ) - return response - - def __recv_DATA(self, data_type): - if data_type == CQi.DATA_BYTE: - data = self.__recv_DATA_BYTE() - elif data_type == CQi.DATA_BOOL: - data = self.__recv_DATA_BOOL() - elif data_type == CQi.DATA_INT: - data = self.__recv_DATA_INT() - elif data_type == CQi.DATA_STRING: - data = self.__recv_DATA_STRING() - elif data_type == CQi.DATA_BYTE_LIST: - data = self.__recv_DATA_BYTE_LIST() - elif data_type == CQi.DATA_BOOL_LIST: - data = self.__recv_DATA_BOOL_LIST() - elif data_type == CQi.DATA_INT_LIST: - data = self.__recv_DATA_INT_LIST() - elif data_type == CQi.DATA_STRING_LIST: - data = self.__recv_DATA_STRING_LIST() - elif data_type == CQi.DATA_INT_INT: - data = self.__recv_DATA_INT_INT() - elif data_type == CQi.DATA_INT_INT_INT_INT: - data = self.__recv_DATA_INT_INT_INT_INT() - elif data_type == CQi.DATA_INT_TABLE: - data = self.__recv_DATA_INT_TABLE() - else: - raise Exception('Unknown data type: {}'.format(hex(data_type))) - return data - - def __recv_DATA_BYTE(self): - byte_data = self.connection.recv(1) - return struct.unpack('!B', byte_data)[0] - - def __recv_DATA_BOOL(self): - byte_data = self.connection.recv(1) - return struct.unpack('!?', byte_data)[0] - - def __recv_DATA_INT(self): - byte_data = self.connection.recv(4) - return struct.unpack('!i', byte_data)[0] - - def __recv_DATA_STRING(self): - n = self.__recv_WORD() - byte_data = self.connection.recv(n) - return struct.unpack('!{}s'.format(n), byte_data)[0].decode() - - def __recv_DATA_BYTE_LIST(self): - data = [] - n = self.__recv_DATA_INT() - while n > 0: - data.append(self.__recv_DATA_BYTE()) - n -= 1 - return data - - def __recv_DATA_BOOL_LIST(self): - data = [] - n = self.__recv_DATA_INT() - while n > 0: - data.append(self.__recv_DATA_BOOL()) - n -= 1 - return data - - def __recv_DATA_INT_LIST(self): - data = [] - n = self.__recv_DATA_INT() - while n > 0: - data.append(self.__recv_DATA_INT()) - n -= 1 - return data - - def __recv_DATA_STRING_LIST(self): - data = [] - n = self.__recv_DATA_INT() - while n > 0: - data.append(self.__recv_DATA_STRING()) - n -= 1 - return data - - def __recv_DATA_INT_INT(self): - return (self.__recv_INT(), self.__recv_INT()) - - def __recv_DATA_INT_INT_INT_INT(self): - return (self.__recv_INT(), - self.__recv_INT(), - self.__recv_INT(), - self.__recv_INT()) - - def __recv_DATA_INT_TABLE(self): - rows = self.__recv_DATA_INT() - columns = self.__recv_DATA_INT() - data = [] - for i in range(0, rows): - row = [] - for j in range(0, columns): - row.append(self.__recv_DATA_INT()) - data.append(row) - return data - - def __recv_WORD(self): - byte_data = self.connection.recv(2) - return struct.unpack('!H', byte_data)[0] - - def __send_BYTE(self, byte_data): - data = struct.pack('!B', byte_data) - self.connection.sendall(data) - - def __send_BOOL(self, bool_data): - data = struct.pack('!?', bool_data) - self.connection.sendall(data) - - def __send_INT(self, int_data): - data = struct.pack('!i', int_data) - self.connection.sendall(data) - - def __send_STRING(self, string_data): - encoded_string_data = string_data.encode('utf-8') - n = len(encoded_string_data) - data = struct.pack('!H{}s'.format(n), n, encoded_string_data) - self.connection.sendall(data) - - def __send_INT_LIST(self, int_list_data): - n = len(int_list_data) - self.__send_INT(n) - for int_data in int_list_data: - self.__send_INT(int_data) - - def __send_STRING_LIST(self, string_list_data): - n = len(string_list_data) - self.__send_INT(n) - for string_data in string_list_data: - self.__send_STRING(string_data) - - def __send_WORD(self, word_data): - data = struct.pack('!H', word_data) - self.connection.sendall(data)