From 82ce08b938d31412d2e84d63779070002f23e956 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Wed, 6 Nov 2019 15:44:06 +0100 Subject: [PATCH] Add CQiClient --- app/corpora/CQiClient/CQi.py | 406 +++++++++++++++++++ app/corpora/CQiClient/CQiClient.py | 611 +++++++++++++++++++++++++++++ app/corpora/events.py | 5 + 3 files changed, 1022 insertions(+) create mode 100644 app/corpora/CQiClient/CQi.py create mode 100644 app/corpora/CQiClient/CQiClient.py diff --git a/app/corpora/CQiClient/CQi.py b/app/corpora/CQiClient/CQi.py new file mode 100644 index 00000000..5d39395a --- /dev/null +++ b/app/corpora/CQiClient/CQi.py @@ -0,0 +1,406 @@ +# ########################################################################### # +# IMS CQi specification # +# # +# Version: 0.1a ;o) # +# Author: Stefan Evert (evert@ims.uni-stuttgart.de) # +# Modified by: Patrick Jentsch # +# ########################################################################### # + + +""" 1. padding """ +PAD = 0x00 + + +""" 2. CQi responses """ +""" 2.1 CQI_STATUS_* """ +STATUS = 0x01 +STATUS_OK = 0x0101 +STATUS_CONNECT_OK = 0x0102 +STATUS_BYE_OK = 0x0103 +STATUS_PING_OK = 0x0104 + +""" 2.2 CQI_ERROR_* """ +ERROR = 0x02 +ERROR_GENERAL_ERROR = 0x0201 +ERROR_CONNECT_REFUSED = 0x0202 +ERROR_USER_ABORT = 0x0203 +ERROR_SYNTAX_ERROR = 0x0204 +# includes corpus/attribute/subcorpus specifier syntax + +""" 2.3 CQI_DATA_* """ +DATA = 0x03 +DATA_BYTE = 0x0301 +DATA_BOOL = 0x0302 +DATA_INT = 0x0303 +DATA_STRING = 0x0304 +DATA_BYTE_LIST = 0x0305 +DATA_BOOL_LIST = 0x0306 +DATA_INT_LIST = 0x0307 +DATA_STRING_LIST = 0x0308 +DATA_INT_INT = 0x0309 +DATA_INT_INT_INT_INT = 0x030A +DATA_INT_TABLE = 0x030B + +""" 2.4 CQI_CL_ERROR_* """ +""" +" NOTE: some CL error codes are not represented in the CQi specs +" - usually because they're not used in the CL any more +" - CDA_ENOSTRING is not considered an error (returns -1) +" - CDA_EARGS: dynamic attribute calls not yet supported +""" +CL_ERROR = 0x04 +CL_ERROR_NO_SUCH_ATTRIBUTE = 
0x0401 +# returned if CQi server couldn't open attribute +CL_ERROR_WRONG_ATTRIBUTE_TYPE = 0x0402 +# CDA_EATTTYPE +CL_ERROR_OUT_OF_RANGE = 0x0403 +# CDA_EIDORNG, CDA_EIDXORNG, CDA_EPOSORNG +CL_ERROR_REGEX = 0x0404 +# CDA_EPATTERN (not used), CDA_EBADREGEX +CL_ERROR_CORPUS_ACCESS = 0x0405 +# CDA_ENODATA +CL_ERROR_OUT_OF_MEMORY = 0x0406 +# CDA_ENOMEM +# this means the CQi server has run out of memory; +# try discarding some other corpora and/or subcorpora +CL_ERROR_INTERNAL = 0x0407 +# CDA_EOTHER, CDA_ENYI +# this is the classical 'please contact technical support' error + +""" 2.5 CQI_CQP_ERROR_* """ +CQP_ERROR = 0x05 +# CQP error messages yet to be defined +CQP_ERROR_GENERAL = 0x0501 +CQP_ERROR_NO_SUCH_CORPUS = 0x0502 +CQP_ERROR_INVALID_FIELD = 0x0503 +CQP_ERROR_OUT_OF_RANGE = 0x0504 +# various cases where a number is out of range + + +""" 3. CQi commands """ +""" 3.1 CQI_CTRL_* """ +CTRL = 0x11 +CTRL_CONNECT = 0x1101 +# INPUT: (STRING username, STRING password) +# OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED +CTRL_BYE = 0x1102 +# INPUT: () +# OUTPUT: CQI_STATUS_BYE_OK +CTRL_USER_ABORT = 0x1103 +# INPUT: () +# OUTPUT: +CTRL_PING = 0x1104 +# INPUT: () +# OUTPUT: CQI_STATUS_PING_OK +CTRL_LAST_GENERAL_ERROR = 0x1105 +# INPUT: () +# OUTPUT: CQI_DATA_STRING +# full-text error message for the last general error reported by the CQi server + +""" 3.2 CQI_ASK_FEATURE_* """ +ASK_FEATURE = 0x12 +ASK_FEATURE_CQI_1_0 = 0x1201 +# INPUT: () +# OUTPUT: CQI_DATA_BOOL +ASK_FEATURE_CL_2_3 = 0x1202 +# INPUT: () +# OUTPUT: CQI_DATA_BOOL +ASK_FEATURE_CQP_2_3 = 0x1203 +# INPUT: () +# OUTPUT: CQI_DATA_BOOL + +""" 3.3 CQI_CORPUS_* """ +CORPUS = 0x13 +CORPUS_LIST_CORPORA = 0x1301 +# INPUT: () +# OUTPUT: CQI_DATA_STRING_LIST +CORPUS_CHARSET = 0x1303 +# INPUT: (STRING corpus) +# OUTPUT: CQI_DATA_STRING +CORPUS_PROPERTIES = 0x1304 +# INPUT: (STRING corpus) +# OUTPUT: CQI_DATA_STRING_LIST +CORPUS_POSITIONAL_ATTRIBUTES = 0x1305 +# INPUT: (STRING corpus) +# OUTPUT: CQI_DATA_STRING_LIST 
+CORPUS_STRUCTURAL_ATTRIBUTES = 0x1306 +# INPUT: (STRING corpus) +# OUTPUT: CQI_DATA_STRING_LIST +CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES = 0x1307 +# INPUT: (STRING attribute) +# OUTPUT: CQI_DATA_BOOL +CORPUS_ALIGNMENT_ATTRIBUTES = 0x1308 +# INPUT: (STRING corpus) +# OUTPUT: CQI_DATA_STRING_LIST +CORPUS_FULL_NAME = 0x1309 +# INPUT: (STRING corpus) +# OUTPUT: CQI_DATA_STRING +# the full name of the corpus as specified in its registry entry +CORPUS_INFO = 0x130A +# INPUT: (STRING corpus) +# OUTPUT: CQI_DATA_STRING_LIST +# returns the contents of the .info file of the corpus as a list of lines +CORPUS_DROP_CORPUS = 0x130B +# INPUT: (STRING corpus) +# OUTPUT: CQI_STATUS_OK +# try to unload a corpus and all its attributes from memory + +""" 3.4 CQI_CL_* """ +CL = 0x14 +# low-level corpus access (CL functions) +CL_ATTRIBUTE_SIZE = 0x1401 +# INPUT: (STRING attribute) +# OUTPUT: CQI_DATA_INT +# returns the size of the attribute: +# - number of tokens (positional) +# - number of regions (structural) +# - number of alignments (alignment) +CL_LEXICON_SIZE = 0x1402 +# INPUT: (STRING attribute) +# OUTPUT: CQI_DATA_INT +# returns the number of entries in the lexicon of a positional attribute; +# valid lexicon IDs range from 0 ..
(lexicon_size - 1) +CL_DROP_ATTRIBUTE = 0x1403 +# INPUT: (STRING attribute) +# OUTPUT: CQI_STATUS_OK +# unload attribute from memory +""" +" NOTE: simple (scalar) mappings are applied to lists (the returned list has +" exactly the same length as the list passed as an argument) +""" +CL_STR2ID = 0x1404 +# INPUT: (STRING attribute, STRING_LIST strings) +# OUTPUT: CQI_DATA_INT_LIST +# returns -1 for every string in strings that is not found in the lexicon +CL_ID2STR = 0x1405 +# INPUT: (STRING attribute, INT_LIST id) +# OUTPUT: CQI_DATA_STRING_LIST +# returns "" for every ID in id that is out of range +CL_ID2FREQ = 0x1406 +# INPUT: (STRING attribute, INT_LIST id) +# OUTPUT: CQI_DATA_INT_LIST +# returns 0 for every ID in id that is out of range +CL_CPOS2ID = 0x1407 +# INPUT: (STRING attribute, INT_LIST cpos) +# OUTPUT: CQI_DATA_INT_LIST +# returns -1 for every corpus position in cpos that is out of range +CL_CPOS2STR = 0x1408 +# INPUT: (STRING attribute, INT_LIST cpos) +# OUTPUT: CQI_DATA_STRING_LIST +# returns "" for every corpus position in cpos that is out of range +CL_CPOS2STRUC = 0x1409 +# INPUT: (STRING attribute, INT_LIST cpos) +# OUTPUT: CQI_DATA_INT_LIST +# returns -1 for every corpus position not inside a structure region +""" +" NOTE: temporary addition for the Euralex2000 tutorial, but should probably be +" included in CQi specs +""" +CL_CPOS2LBOUND = 0x1420 +# INPUT: (STRING attribute, INT_LIST cpos) +# OUTPUT: CQI_DATA_INT_LIST +# returns left boundary of s-attribute region enclosing cpos, -1 if not in +# region +CL_CPOS2RBOUND = 0x1421 +# INPUT: (STRING attribute, INT_LIST cpos) +# OUTPUT: CQI_DATA_INT_LIST +# returns right boundary of s-attribute region enclosing cpos, -1 if not in +# region +CL_CPOS2ALG = 0x140A +# INPUT: (STRING attribute, INT_LIST cpos) +# OUTPUT: CQI_DATA_INT_LIST +# returns -1 for every corpus position not inside an alignment +CL_STRUC2STR = 0x140B +# INPUT: (STRING attribute, INT_LIST strucs) +# OUTPUT: CQI_DATA_STRING_LIST +# returns annotated string
values of structure regions in strucs; "" if out +# of range +# check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES() first +""" +" NOTE: the following mappings take a single argument and return multiple +" values, including lists of arbitrary size +""" +CL_ID2CPOS = 0x140C +# INPUT: (STRING attribute, INT id) +# OUTPUT: CQI_DATA_INT_LIST +# returns all corpus positions where the given token occurs +CL_IDLIST2CPOS = 0x140D +# INPUT: (STRING attribute, INT_LIST id_list) +# OUTPUT: CQI_DATA_INT_LIST +# returns all corpus positions where one of the tokens in id_list +# occurs; the returned list is sorted as a whole, not per token id +CL_REGEX2ID = 0x140E +# INPUT: (STRING attribute, STRING regex) +# OUTPUT: CQI_DATA_INT_LIST +# returns lexicon IDs of all tokens that match regex; the returned +# list may be empty (size 0); +CL_STRUC2CPOS = 0x140F +# INPUT: (STRING attribute, INT struc) +# OUTPUT: CQI_DATA_INT_INT +# returns start and end corpus positions of structure region +CL_ALG2CPOS = 0x1410 +# INPUT: (STRING attribute, INT alg) +# OUTPUT: CQI_DATA_INT_INT_INT_INT +# returns (src_start, src_end, target_start, target_end) + +""" 3.5 CQI_CQP_* """ +CQP = 0x15 +CQP_QUERY = 0x1501 +# INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query) +# OUTPUT: CQI_STATUS_OK +# query must include the ';' character terminating the query. +CQP_LIST_SUBCORPORA = 0x1502 +# INPUT: (STRING corpus) +# OUTPUT: CQI_DATA_STRING_LIST +CQP_SUBCORPUS_SIZE = 0x1503 +# INPUT: (STRING subcorpus) +# OUTPUT: CQI_DATA_INT +CQP_SUBCORPUS_HAS_FIELD = 0x1504 +# INPUT: (STRING subcorpus, BYTE field) +# OUTPUT: CQI_DATA_BOOL +CQP_DUMP_SUBCORPUS = 0x1505 +# INPUT: (STRING subcorpus, BYTE field, INT first, INT last) +# OUTPUT: CQI_DATA_INT_LIST +# Dump the values of field for match ranges first .. last in subcorpus. +# field is one of the CQI_CONST_FIELD_* constants.
+CQP_DROP_SUBCORPUS = 0x1509 +# INPUT: (STRING subcorpus) +# OUTPUT: CQI_STATUS_OK +# delete a subcorpus from memory +""" +" NOTE: The following two functions are temporarily included for the Euralex +" 2000 tutorial demo +""" +""" NOTE: frequency distribution of single tokens """ +CQP_FDIST_1 = 0x1510 +# INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute) +# OUTPUT: CQI_DATA_INT_LIST +# returns n (id, frequency) pairs flattened into a list of size 2*n +# field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET, +# CQI_CONST_FIELD_KEYWORD +# NB: pairs are sorted by frequency desc. +""" NOTE: frequency distribution of pairs of tokens """ +CQP_FDIST_2 = 0x1511 +# INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1, +# BYTE field2, STRING attribute2) +# OUTPUT: CQI_DATA_INT_LIST +# returns n (id1, id2, frequency) triples flattened into a list of size 3*n +# NB: triples are sorted by frequency desc. + + +""" 4. Constant Definitions """ +CONST_FALSE = 0x00 +CONST_NO = 0x00 +CONST_TRUE = 0x01 +CONST_YES = 0x01 +""" +" NOTE: The following constants specify which field will be returned by +" CQI_CQP_DUMP_SUBCORPUS and some other subcorpus commands. +""" +CONST_FIELD_MATCH = 0x10 +CONST_FIELD_MATCHEND = 0x11 +""" +" NOTE: The constants specifiying target0 .. target9 are guaranteed to have the +" numerical values 0 .. 9, so clients do not need to look up the constant +" values if they're handling arbitrary targets. +""" +CONST_FIELD_TARGET_0 = 0x00 +CONST_FIELD_TARGET_1 = 0x01 +CONST_FIELD_TARGET_2 = 0x02 +CONST_FIELD_TARGET_3 = 0x03 +CONST_FIELD_TARGET_4 = 0x04 +CONST_FIELD_TARGET_5 = 0x05 +CONST_FIELD_TARGET_6 = 0x06 +CONST_FIELD_TARGET_7 = 0x07 +CONST_FIELD_TARGET_8 = 0x08 +CONST_FIELD_TARGET_9 = 0x09 +""" +" NOTE: The following constants are provided for backward compatibility with +" traditional CQP field names & while the generalised target concept +" isn't yet implemented in the CQPserver.
+""" +CONST_FIELD_TARGET = 0x00 +CONST_FIELD_KEYWORD = 0x09 +""" NOTE: CQi version is CQI_MAJOR_VERSION.CQI_MINOR_VERSION """ +MAJOR_VERSION = 0x00 +MINOR_VERSION = 0x01 + + +""" 5. CQi lookup dictionary. """ +lookup = { + 257: 'CQI_STATUS_OK', + 258: 'CQI_STATUS_CONNECT_OK', + 259: 'CQI_STATUS_BYE_OK', + 260: 'CQI_STATUS_PING_OK', + 513: 'CQI_ERROR_GENERAL_ERROR', + 514: 'CQI_ERROR_CONNECT_REFUSED', + 515: 'CQI_ERROR_USER_ABORT', + 516: 'CQI_ERROR_SYNTAX_ERROR', + 769: 'CQI_DATA_BYTE', + 770: 'CQI_DATA_BOOL', + 771: 'CQI_DATA_INT', + 772: 'CQI_DATA_STRING', + 773: 'CQI_DATA_BYTE_LIST', + 774: 'CQI_DATA_BOOL_LIST', + 775: 'CQI_DATA_INT_LIST', + 776: 'CQI_DATA_STRING_LIST', + 777: 'CQI_DATA_INT_INT', + 778: 'CQI_DATA_INT_INT_INT_INT', + 779: 'CQI_DATA_INT_TABLE', + 1025: 'CQI_CL_ERROR_NO_SUCH_ATTRIBUTE', + 1026: 'CQI_CL_ERROR_WRONG_ATTRIBUTE_TYPE', + 1027: 'CQI_CL_ERROR_OUT_OF_RANGE', + 1028: 'CQI_CL_ERROR_REGEX', + 1029: 'CQI_CL_ERROR_CORPUS_ACCESS', + 1030: 'CQI_CL_ERROR_OUT_OF_MEMORY', + 1031: 'CQI_CL_ERROR_INTERNAL', + 1281: 'CQI_CQP_ERROR_GENERAL', + 1282: 'CQI_CQP_ERROR_NO_SUCH_CORPUS', + 1283: 'CQI_CQP_ERROR_INVALID_FIELD', + 1284: 'CQI_CQP_ERROR_OUT_OF_RANGE', + 4353: 'CQI_CTRL_CONNECT', + 4354: 'CQI_CTRL_BYE', + 4355: 'CQI_CTRL_USER_ABORT', + 4356: 'CQI_CTRL_PING', + 4357: 'CQI_CTRL_LAST_GENERAL_ERROR', + 4609: 'CQI_ASK_FEATURE_CQI_1_0', + 4610: 'CQI_ASK_FEATURE_CL_2_3', + 4611: 'CQI_ASK_FEATURE_CQP_2_3', + 4865: 'CQI_CORPUS_LIST_CORPORA', + 4867: 'CQI_CORPUS_CHARSET', + 4868: 'CQI_CORPUS_PROPERTIES', + 4869: 'CQI_CORPUS_POSITIONAL_ATTRIBUTES', + 4870: 'CQI_CORPUS_STRUCTURAL_ATTRIBUTES', + 4871: 'CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES', + 4872: 'CQI_CORPUS_ALIGNMENT_ATTRIBUTES', + 4873: 'CQI_CORPUS_FULL_NAME', + 4874: 'CQI_CORPUS_INFO', + 4875: 'CQI_CORPUS_DROP_CORPUS', + 5121: 'CQI_CL_ATTRIBUTE_SIZE', + 5122: 'CQI_CL_LEXICON_SIZE', + 5123: 'CQI_CL_DROP_ATTRIBUTE', + 5124: 'CQI_CL_STR2ID', + 5125: 'CQI_CL_ID2STR', + 5126: 'CQI_CL_ID2FREQ', + 5127: 
'CQI_CL_CPOS2ID', + 5128: 'CQI_CL_CPOS2STR', + 5129: 'CQI_CL_CPOS2STRUC', + 5130: 'CQI_CL_CPOS2ALG', + 5131: 'CQI_CL_STRUC2STR', + 5132: 'CQI_CL_ID2CPOS', + 5133: 'CQI_CL_IDLIST2CPOS', + 5134: 'CQI_CL_REGEX2ID', + 5135: 'CQI_CL_STRUC2CPOS', + 5136: 'CQI_CL_ALG2CPOS', + 5152: 'CQI_CL_CPOS2LBOUND', + 5153: 'CQI_CL_CPOS2RBOUND', + 5377: 'CQI_CQP_QUERY', + 5378: 'CQI_CQP_LIST_SUBCORPORA', + 5379: 'CQI_CQP_SUBCORPUS_SIZE', + 5380: 'CQI_CQP_SUBCORPUS_HAS_FIELD', + 5381: 'CQI_CQP_DUMP_SUBCORPUS', + 5385: 'CQI_CQP_DROP_SUBCORPUS', + 5392: 'CQI_CQP_FDIST_1', + 5393: 'CQI_CQP_FDIST_2' +} diff --git a/app/corpora/CQiClient/CQiClient.py b/app/corpora/CQiClient/CQiClient.py new file mode 100644 index 00000000..39a24c4c --- /dev/null +++ b/app/corpora/CQiClient/CQiClient.py @@ -0,0 +1,611 @@ +from . import CQi +import socket +import struct + + +class CQiClient: + def __init__(self, host='127.0.0.1', port=4877): + self.host = host + self.port = port + self.connection = socket.socket() + self.connection.connect((self.host, self.port)) + + def ctrl_connect(self, username, password): + # INPUT: (STRING username, STRING password) + # OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED + # print('CTRL_CONNECT') + self.__send_WORD(CQi.CTRL_CONNECT) + self.__send_STRING(username) + self.__send_STRING(password) + self.__recv_response() + + def ctrl_bye(self): + # INPUT: () + # OUTPUT: CQI_STATUS_BYE_OK + # print('CTRL_BYE') + self.__send_WORD(CQi.CTRL_BYE) + self.__recv_response() + + def ctrl_user_abort(self): + # INPUT: () + # OUTPUT: + # print('CTRL_USER_ABORT') + self.__send_WORD(CQi.CTRL_USER_ABORT) + + def ctrl_ping(self): + # INPUT: () + # OUTPUT: CQI_STATUS_PING_OK + # the status word is returned because callers log the ping result + self.__send_WORD(CQi.CTRL_PING) + return self.__recv_response() + + def ctrl_last_general_error(self): + # INPUT: () + # OUTPUT: CQI_DATA_STRING + # full-text error message for the last general error reported by the + # CQi server + # print('CTRL_LAST_GENERAL_ERROR') +
self.__send_WORD(CQi.CTRL_LAST_GENERAL_ERROR) + return self.__recv_response() + + def ask_feature_cqi_1_0(self): + # INPUT: () + # OUTPUT: CQI_DATA_BOOL + # print('ASK_FEATURE_CQI_1_0') + self.__send_WORD(CQi.ASK_FEATURE_CQI_1_0) + return self.__recv_response() + + def ask_feature_cl_2_3(self): + # INPUT: () + # OUTPUT: CQI_DATA_BOOL + # print('ASK_FEATURE_CL_2_3') + self.__send_WORD(CQi.ASK_FEATURE_CL_2_3) + return self.__recv_response() + + def ask_feature_cqp_2_3(self): + # INPUT: () + # OUTPUT: CQI_DATA_BOOL + # print('ASK_FEATURE_CQP_2_3') + self.__send_WORD(CQi.ASK_FEATURE_CQP_2_3) + return self.__recv_response() + + def corpus_list_coprora(self): + # INPUT: () + # OUTPUT: CQI_DATA_STRING_LIST + # NOTE: 'coprora' is a typo of 'corpora'; kept so callers do not break + self.__send_WORD(CQi.CORPUS_LIST_CORPORA) + return self.__recv_response() + + def corpus_charset(self, corpus): + # INPUT: (STRING corpus) + # OUTPUT: CQI_DATA_STRING + # print('CORPUS_CHARSET') + self.__send_WORD(CQi.CORPUS_CHARSET) + self.__send_STRING(corpus) + return self.__recv_response() + + def corpus_properties(self, corpus): + # INPUT: (STRING corpus) + # OUTPUT: CQI_DATA_STRING_LIST + # print('CORPUS_PROPERTIES') + self.__send_WORD(CQi.CORPUS_PROPERTIES) + self.__send_STRING(corpus) + return self.__recv_response() + + def corpus_positional_attributes(self, corpus): + # INPUT: (STRING corpus) + # OUTPUT: CQI_DATA_STRING_LIST + # print('CORPUS_POSITIONAL_ATTRIBUTES') + self.__send_WORD(CQi.CORPUS_POSITIONAL_ATTRIBUTES) + self.__send_STRING(corpus) + return self.__recv_response() + + def corpus_structural_attributes(self, corpus): + # INPUT: (STRING corpus) + # OUTPUT: CQI_DATA_STRING_LIST + # print('CORPUS_STRUCTURAL_ATTRIBUTES') + self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTES) + self.__send_STRING(corpus) + return self.__recv_response() + + def corpus_structural_attribute_has_values(self, attribute): + # INPUT: (STRING attribute) + # OUTPUT: CQI_DATA_BOOL + # print('CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES') +
self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES) + self.__send_STRING(attribute) + return self.__recv_response() + + def corpus_alignment_attributes(self, corpus): + # INPUT: (STRING corpus) + # OUTPUT: CQI_DATA_STRING_LIST + # print('CORPUS_ALIGNMENT_ATTRIBUTES') + self.__send_WORD(CQi.CORPUS_ALIGNMENT_ATTRIBUTES) + self.__send_STRING(corpus) + return self.__recv_response() + + def corpus_full_name(self, corpus): + # INPUT: (STRING corpus) + # OUTPUT: CQI_DATA_STRING + # the full name of the corpus as specified in its registry entry + # print('CORPUS_FULL_NAME') + self.__send_WORD(CQi.CORPUS_FULL_NAME) + self.__send_STRING(corpus) + return self.__recv_response() + + def corpus_info(self, corpus): + # INPUT: (STRING corpus) + # OUTPUT: CQI_DATA_STRING_LIST + # returns the contents of the .info file of the corpus as a list of lines + # print('CORPUS_INFO') + self.__send_WORD(CQi.CORPUS_INFO) + self.__send_STRING(corpus) + return self.__recv_response() + + def corpus_drop_corpus(self, corpus): + ''' + ' Broken + ' TODO: Check what type of return value is provided by the server. + ''' + # INPUT: (STRING corpus) + # OUTPUT: CQI_STATUS_OK + # try to unload a corpus and all its attributes from memory + # print('CORPUS_DROP_CORPUS') + self.__send_WORD(CQi.CORPUS_DROP_CORPUS) + self.__send_STRING(corpus) + self.__recv_response() + + def cl_attribute_size(self, attribute): + # INPUT: (STRING attribute) + # OUTPUT: CQI_DATA_INT + # returns the size of the attribute: + # number of tokens (positional) + # number of regions (structural) + # number of alignments (alignment) + # print('CL_ATTRIBUTE_SIZE') + self.__send_WORD(CQi.CL_ATTRIBUTE_SIZE) + self.__send_STRING(attribute) + return self.__recv_response() + + def cl_lexicon_size(self, attribute): + # INPUT: (STRING attribute) + # OUTPUT: CQI_DATA_INT + # returns the number of entries in the lexicon of a positional + # attribute; + # valid lexicon IDs range from 0 ..
(lexicon_size - 1) + # print('CL_LEXICON_SIZE') + self.__send_WORD(CQi.CL_LEXICON_SIZE) + self.__send_STRING(attribute) + return self.__recv_response() + + def cl_drop_attribute(self, attribute): + # INPUT: (STRING attribute) + # OUTPUT: CQI_STATUS_OK + # unload attribute from memory + # print('CL_DROP_ATTRIBUTE') + self.__send_WORD(CQi.CL_DROP_ATTRIBUTE) + self.__send_STRING(attribute) + self.__recv_response() + + """ + " NOTE: simple (scalar) mappings are applied to lists (the returned list + " has exactly the same length as the list passed as an argument) + """ + + def cl_str2id(self, attribute, strings): + # INPUT: (STRING attribute, STRING_LIST strings) + # OUTPUT: CQI_DATA_INT_LIST + # returns -1 for every string in strings that is not found in the + # lexicon + # print('CL_STR2ID') + self.__send_WORD(CQi.CL_STR2ID) + self.__send_STRING(attribute) + self.__send_STRING_LIST(strings) + return self.__recv_response() + + def cl_id2str(self, attribute, id): + # INPUT: (STRING attribute, INT_LIST id) + # OUTPUT: CQI_DATA_STRING_LIST + # returns "" for every ID in id that is out of range + # print('CL_ID2STR') + self.__send_WORD(CQi.CL_ID2STR) + self.__send_STRING(attribute) + self.__send_INT_LIST(id) + return self.__recv_response() + + def cl_id2freq(self, attribute, id): + # INPUT: (STRING attribute, INT_LIST id) + # OUTPUT: CQI_DATA_INT_LIST + # returns 0 for every ID in id that is out of range + # print('CL_ID2FREQ') + self.__send_WORD(CQi.CL_ID2FREQ) + self.__send_STRING(attribute) + self.__send_INT_LIST(id) + return self.__recv_response() + + def cl_cpos2id(self, attribute, cpos): + # INPUT: (STRING attribute, INT_LIST cpos) + # OUTPUT: CQI_DATA_INT_LIST + # returns -1 for every corpus position in cpos that is out of range + # print('CL_CPOS2ID') + self.__send_WORD(CQi.CL_CPOS2ID) + self.__send_STRING(attribute) + self.__send_INT_LIST(cpos) + return self.__recv_response() + + def cl_cpos2str(self, attribute, cpos): + # INPUT: (STRING attribute, INT_LIST cpos) + # OUTPUT:
CQI_DATA_STRING_LIST + # returns "" for every corpus position in cpos that is out of range + # print('CL_CPOS2STR') + self.__send_WORD(CQi.CL_CPOS2STR) + self.__send_STRING(attribute) + self.__send_INT_LIST(cpos) + return self.__recv_response() + + def cl_cpos2struc(self, attribute, cpos): + # INPUT: (STRING attribute, INT_LIST cpos) + # OUTPUT: CQI_DATA_INT_LIST + # returns -1 for every corpus position not inside a structure region + # print('CL_CPOS2STRUC') + self.__send_WORD(CQi.CL_CPOS2STRUC) + self.__send_STRING(attribute) + self.__send_INT_LIST(cpos) + return self.__recv_response() + + """ + " NOTE: temporary addition for the Euralex2000 tutorial, but should + " probably be included in CQi specs + """ + + def cl_cpos2lbound(self, attribute, cpos): + # INPUT: (STRING attribute, INT_LIST cpos) + # OUTPUT: CQI_DATA_INT_LIST + # returns left boundary of s-attribute region enclosing cpos, -1 if not + # in region + # print('CL_CPOS2LBOUND') + self.__send_WORD(CQi.CL_CPOS2LBOUND) + self.__send_STRING(attribute) + self.__send_INT_LIST(cpos) + return self.__recv_response() + + def cl_cpos2rbound(self, attribute, cpos): + # INPUT: (STRING attribute, INT_LIST cpos) + # OUTPUT: CQI_DATA_INT_LIST + # returns right boundary of s-attribute region enclosing cpos, -1 if + # not in region + # print('CL_CPOS2RBOUND') + self.__send_WORD(CQi.CL_CPOS2RBOUND) + self.__send_STRING(attribute) + self.__send_INT_LIST(cpos) + return self.__recv_response() + + def cl_cpos2alg(self, attribute, cpos): + # INPUT: (STRING attribute, INT_LIST cpos) + # OUTPUT: CQI_DATA_INT_LIST + # returns -1 for every corpus position not inside an alignment + # print('CL_CPOS2ALG') + self.__send_WORD(CQi.CL_CPOS2ALG) + self.__send_STRING(attribute) + self.__send_INT_LIST(cpos) + return self.__recv_response() + + def cl_struc2str(self, attribute, strucs): + # INPUT: (STRING attribute, INT_LIST strucs) + # OUTPUT: CQI_DATA_STRING_LIST + # returns annotated string values of structure regions in strucs; "" + # if out of
range + # check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES() first + # print('CL_STRUC2STR') + self.__send_WORD(CQi.CL_STRUC2STR) + self.__send_STRING(attribute) + self.__send_INT_LIST(strucs) + return self.__recv_response() + + """ + " NOTE: the following mappings take a single argument and return multiple + " values, including lists of arbitrary size + """ + + def cl_id2cpos(self, attribute, id): + # INPUT: (STRING attribute, INT id) + # OUTPUT: CQI_DATA_INT_LIST + # returns all corpus positions where the given token occurs + # print('CL_ID2CPOS') + self.__send_WORD(CQi.CL_ID2CPOS) + self.__send_STRING(attribute) + self.__send_INT(id) + return self.__recv_response() + + def cl_idlist2cpos(self, attribute, id_list): + # INPUT: (STRING attribute, INT_LIST id_list) + # OUTPUT: CQI_DATA_INT_LIST + # returns all corpus positions where one of the tokens in id_list + # occurs; the returned list is sorted as a whole, not per token id + # print('CL_IDLIST2CPOS') + self.__send_WORD(CQi.CL_IDLIST2CPOS) + self.__send_STRING(attribute) + self.__send_INT_LIST(id_list) + return self.__recv_response() + + def cl_regex2id(self, attribute, regex): + # INPUT: (STRING attribute, STRING regex) + # OUTPUT: CQI_DATA_INT_LIST + # returns lexicon IDs of all tokens that match regex; the returned + # list may be empty (size 0); + # print('CL_REGEX2ID') + self.__send_WORD(CQi.CL_REGEX2ID) + self.__send_STRING(attribute) + self.__send_STRING(regex) + return self.__recv_response() + + def cl_struc2cpos(self, attribute, struc): + # INPUT: (STRING attribute, INT struc) + # OUTPUT: CQI_DATA_INT_INT + # returns start and end corpus positions of structure region + # print('CL_STRUC2CPOS') + self.__send_WORD(CQi.CL_STRUC2CPOS) + self.__send_STRING(attribute) + self.__send_INT(struc) + return self.__recv_response() + + def cl_alg2cpos(self, attribute, alg): + # INPUT: (STRING attribute, INT alg) + # OUTPUT: CQI_DATA_INT_INT_INT_INT + # returns (src_start, src_end, target_start, target_end) + #
print('CL_ALG2CPOS') + self.__send_WORD(CQi.CL_ALG2CPOS) + self.__send_STRING(attribute) + self.__send_INT(alg) + return self.__recv_response() + + def cqp_query(self, mother_corpus, subcorpus_name, query): + # INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query) + # OUTPUT: CQI_STATUS_OK + # query must include the ';' character terminating the query. + # read through __recv_response() so CQi error replies raise + self.__send_WORD(CQi.CQP_QUERY) + self.__send_STRING(mother_corpus) + self.__send_STRING(subcorpus_name) + self.__send_STRING(query) + self.__recv_response() + + def cqp_list_subcorpora(self, corpus): + # INPUT: (STRING corpus) + # OUTPUT: CQI_DATA_STRING_LIST + # print('CQP_LIST_SUBCORPORA') + self.__send_WORD(CQi.CQP_LIST_SUBCORPORA) + self.__send_STRING(corpus) + return self.__recv_response() + + def cqp_subcorpus_size(self, subcorpus): + # INPUT: (STRING subcorpus) + # OUTPUT: CQI_DATA_INT + # print('CQP_SUBCORPUS_SIZE') + self.__send_WORD(CQi.CQP_SUBCORPUS_SIZE) + self.__send_STRING(subcorpus) + return self.__recv_response() + + def cqp_subcorpus_has_field(self, subcorpus, field): + # INPUT: (STRING subcorpus, BYTE field) + # OUTPUT: CQI_DATA_BOOL + # print('CQP_SUBCORPUS_HAS_FIELD') + self.__send_WORD(CQi.CQP_SUBCORPUS_HAS_FIELD) + self.__send_STRING(subcorpus) + self.__send_BYTE(field) + return self.__recv_response() + + def cqp_dump_subcorpus(self, subcorpus, field, first, last): + # INPUT: (STRING subcorpus, BYTE field, INT first, INT last) + # OUTPUT: CQI_DATA_INT_LIST + # Dump the values of field for match ranges first .. last + # in subcorpus. field is one of the CQI_CONST_FIELD_* constants.
+ # print('CQP_DUMP_SUBCORPUS') + self.__send_WORD(CQi.CQP_DUMP_SUBCORPUS) + self.__send_STRING(subcorpus) + self.__send_BYTE(field) + self.__send_INT(first) + self.__send_INT(last) + return self.__recv_response() + + def cqp_drop_subcorpus(self, subcorpus): + # INPUT: (STRING subcorpus) + # OUTPUT: CQI_STATUS_OK + # delete a subcorpus from memory + # print('CQP_DROP_SUBCORPUS') + self.__send_WORD(CQi.CQP_DROP_SUBCORPUS) + self.__send_STRING(subcorpus) + self.__recv_response() + + """ + " NOTE: The following two functions are temporarily included for the + " Euralex 2000 tutorial demo + """ + + def cqp_fdist_1(self, subcorpus, cutoff, field, attribute): + """ NOTE: frequency distribution of single tokens """ + # INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute) + # OUTPUT: CQI_DATA_INT_LIST + # returns n (id, frequency) pairs flattened into a list of size 2*n + # field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET, + # CQI_CONST_FIELD_KEYWORD + # NB: pairs are sorted by frequency desc. + # print('CQP_FDIST_1') + self.__send_WORD(CQi.CQP_FDIST_1) + self.__send_STRING(subcorpus) + self.__send_INT(cutoff) + self.__send_BYTE(field) + self.__send_STRING(attribute) + return self.__recv_response() + + def cqp_fdist_2(self, subcorpus, cutoff, field1, attribute1, field2, + attribute2): + """ NOTE: frequency distribution of pairs of tokens """ + # INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1, + # BYTE field2, STRING attribute2) + # OUTPUT: CQI_DATA_INT_LIST + # returns n (id1, id2, frequency) triples flattened into a list of size + # 3*n + # NB: triples are sorted by frequency desc.
+ # print('CQP_FDIST_2') + self.__send_WORD(CQi.CQP_FDIST_2) + self.__send_STRING(subcorpus) + self.__send_INT(cutoff) + self.__send_BYTE(field1) + self.__send_STRING(attribute1) + self.__send_BYTE(field2) + self.__send_STRING(attribute2) + return self.__recv_response() + + def __recv_response(self): + byte_data = self.__recv_WORD() + response_type = byte_data >> 8  # high byte = response family + if response_type == CQi.STATUS: + response = byte_data + elif response_type == CQi.ERROR: + raise Exception(CQi.lookup.get(byte_data, hex(byte_data))) + elif response_type == CQi.DATA: + response = self.__recv_DATA(byte_data) + elif response_type == CQi.CL_ERROR: + raise Exception(CQi.lookup.get(byte_data, hex(byte_data))) + elif response_type == CQi.CQP_ERROR: + raise Exception(CQi.lookup.get(byte_data, hex(byte_data))) + else: + raise Exception( + 'Unknown response type: {}'.format(hex(response_type)) + ) + return response + + def __recv_DATA(self, data_type): + if data_type == CQi.DATA_BYTE: + data = self.__recv_DATA_BYTE() + elif data_type == CQi.DATA_BOOL: + data = self.__recv_DATA_BOOL() + elif data_type == CQi.DATA_INT: + data = self.__recv_DATA_INT() + elif data_type == CQi.DATA_STRING: + data = self.__recv_DATA_STRING() + elif data_type == CQi.DATA_BYTE_LIST: + data = self.__recv_DATA_BYTE_LIST() + elif data_type == CQi.DATA_BOOL_LIST: + data = self.__recv_DATA_BOOL_LIST() + elif data_type == CQi.DATA_INT_LIST: + data = self.__recv_DATA_INT_LIST() + elif data_type == CQi.DATA_STRING_LIST: + data = self.__recv_DATA_STRING_LIST() + elif data_type == CQi.DATA_INT_INT: + data = self.__recv_DATA_INT_INT() + elif data_type == CQi.DATA_INT_INT_INT_INT: + data = self.__recv_DATA_INT_INT_INT_INT() + elif data_type == CQi.DATA_INT_TABLE: + data = self.__recv_DATA_INT_TABLE() + else: + raise Exception('Unknown data type: {}'.format(hex(data_type))) + return data + + def __recv_DATA_BYTE(self): + byte_data = self.connection.recv(1) + return struct.unpack('!B', byte_data)[0] + + def __recv_DATA_BOOL(self): + byte_data = self.connection.recv(1) + return
struct.unpack('!?', byte_data)[0] + + def __recv_DATA_INT(self): + byte_data = self.connection.recv(4, socket.MSG_WAITALL) + return struct.unpack('!i', byte_data)[0] + + def __recv_DATA_STRING(self): + n = self.__recv_WORD()  # 2-byte length prefix + byte_data = self.connection.recv(n, socket.MSG_WAITALL) + return struct.unpack('!{}s'.format(n), byte_data)[0].decode() + + def __recv_DATA_BYTE_LIST(self): + data = [] + n = self.__recv_DATA_INT() + while n > 0: + data.append(self.__recv_DATA_BYTE()) + n -= 1 + return data + + def __recv_DATA_BOOL_LIST(self): + data = [] + n = self.__recv_DATA_INT() + while n > 0: + data.append(self.__recv_DATA_BOOL()) + n -= 1 + return data + + def __recv_DATA_INT_LIST(self): + data = [] + n = self.__recv_DATA_INT() + while n > 0: + data.append(self.__recv_DATA_INT()) + n -= 1 + return data + + def __recv_DATA_STRING_LIST(self): + data = [] + n = self.__recv_DATA_INT() + while n > 0: + data.append(self.__recv_DATA_STRING()) + n -= 1 + return data + + def __recv_DATA_INT_INT(self): + return (self.__recv_DATA_INT(), self.__recv_DATA_INT())  # 2 x INT + + def __recv_DATA_INT_INT_INT_INT(self): + return (self.__recv_DATA_INT(), + self.__recv_DATA_INT(), + self.__recv_DATA_INT(), + self.__recv_DATA_INT()) + + def __recv_DATA_INT_TABLE(self): + rows = self.__recv_DATA_INT() + columns = self.__recv_DATA_INT() + data = [] + for i in range(0, rows): + row = [] + for j in range(0, columns): + row.append(self.__recv_DATA_INT()) + data.append(row) + return data + + def __recv_WORD(self): + byte_data = self.connection.recv(2, socket.MSG_WAITALL) + return struct.unpack('!H', byte_data)[0] + + def __send_BYTE(self, byte_data): + data = struct.pack('!B', byte_data) + self.connection.sendall(data) + + def __send_BOOL(self, bool_data): + data = struct.pack('!?', bool_data) + self.connection.sendall(data) + + def __send_INT(self, int_data): + data = struct.pack('!i', int_data) + self.connection.sendall(data) + + def __send_STRING(self, string_data): + encoded_string_data = string_data.encode('utf-8') + n = len(encoded_string_data) + data = struct.pack('!H{}s'.format(n), n,
encoded_string_data) + self.connection.sendall(data) + + def __send_INT_LIST(self, int_list_data): + n = len(int_list_data) + self.__send_INT(n)  # element count is sent first + for int_data in int_list_data: + self.__send_INT(int_data) + + def __send_STRING_LIST(self, string_list_data): + n = len(string_list_data) + self.__send_INT(n)  # element count is sent first + for string_data in string_list_data: + self.__send_STRING(string_data) + + def __send_WORD(self, word_data): + data = struct.pack('!H', word_data)  # unsigned 16-bit, network order + self.connection.sendall(data) diff --git a/app/corpora/events.py b/app/corpora/events.py index 51b240ac..f513052e 100644 --- a/app/corpora/events.py +++ b/app/corpora/events.py @@ -3,12 +3,15 @@ from app.events import connected_sessions from app.models import Corpus from flask import current_app, request from flask_login import login_required +from .CQiClient.CQiClient import CQiClient import logging @socketio.on('init_corpus_analysis') @login_required def init_corpus_analysis(corpus_id): + logger = logging.getLogger(__name__) + logger.warning('init_corpus_analysis') ''' TODO: Check if current_user is allowed to subscribe to this ''' socketio.start_background_task(observe_corpus_analysis_connection, current_app._get_current_object(), @@ -25,7 +28,9 @@ def recv_query(message): def observe_corpus_analysis_connection(app, corpus_id, session_id): logger = logging.getLogger(__name__) with app.app_context(): + cqi_client = CQiClient(host='172.25.0.2')  # TODO: host from config while session_id in connected_sessions: + logger.warning(cqi_client.ctrl_ping()) logger.warning('Run container, run!') socketio.sleep(3) corpus = Corpus.query.filter_by(id=corpus_id).first()