import socket
import struct


class Client:
    """Blocking client for the CQi (Corpus Query interface) wire protocol.

    Each public method sends one CQi command word followed by its
    arguments and, unless noted otherwise, returns the decoded server
    response. Server-side errors are raised as ``Exception`` carrying the
    symbolic error name taken from the module-level ``lookup`` table.

    The command / response constants (``CTRL_CONNECT``, ``DATA_INT`` ...)
    and ``lookup`` are module-level names defined alongside this class.
    """

    def __init__(self, host='127.0.0.1', port=4877):
        """Open a TCP connection to the CQi server at ``host``:``port``."""
        self.host = host
        self.port = port
        self.connection = socket.socket()
        try:
            self.connection.connect((self.host, self.port))
        except OSError:
            # Do not leak the descriptor when the server is unreachable.
            self.connection.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying socket (does not send CTRL_BYE)."""
        self.connection.close()

    # ------------------------------------------------------------------ #
    # CTRL commands                                                      #
    # ------------------------------------------------------------------ #

    def ctrl_connect(self, username, password):
        """Authenticate against the CQi server."""
        # INPUT: (STRING username, STRING password)
        # OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED
        self.__send_WORD(CTRL_CONNECT)
        self.__send_STRING(username)
        self.__send_STRING(password)
        return self.__recv_response()

    def ctrl_bye(self):
        """Tell the server that the session ends."""
        # INPUT: ()
        # OUTPUT: CQI_STATUS_BYE_OK
        self.__send_WORD(CTRL_BYE)
        return self.__recv_response()

    def ctrl_user_abort(self):
        """Send a user abort; no response is read for this command."""
        # INPUT: ()
        # OUTPUT:
        self.__send_WORD(CTRL_USER_ABORT)

    def ctrl_ping(self):
        """Ping the server."""
        # INPUT: ()
        # OUTPUT: CQI_STATUS_PING_OK
        self.__send_WORD(CTRL_PING)
        return self.__recv_response()

    def ctrl_last_general_error(self):
        """Return the full-text message of the last general server error."""
        # INPUT: ()
        # OUTPUT: CQI_DATA_STRING
        self.__send_WORD(CTRL_LAST_GENERAL_ERROR)
        return self.__recv_response()

    # ------------------------------------------------------------------ #
    # ASK_FEATURE commands                                               #
    # ------------------------------------------------------------------ #

    def ask_feature_cqi_1_0(self):
        """Ask whether the server supports the CQi 1.0 feature set."""
        # INPUT: ()
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(ASK_FEATURE_CQI_1_0)
        return self.__recv_response()

    def ask_feature_cl_2_3(self):
        """Ask whether the server supports the CL 2.3 feature set."""
        # INPUT: ()
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(ASK_FEATURE_CL_2_3)
        return self.__recv_response()

    def ask_feature_cqp_2_3(self):
        """Ask whether the server supports the CQP 2.3 feature set."""
        # INPUT: ()
        # OUTPUT: CQI_DATA_BOOL
        # Fixed: this used to send ASK_FEATURE_CL_2_3 (copy-paste slip).
        self.__send_WORD(ASK_FEATURE_CQP_2_3)
        return self.__recv_response()

    # ------------------------------------------------------------------ #
    # CORPUS commands                                                    #
    # ------------------------------------------------------------------ #

    def corpus_list_corpora(self):
        """Return the list of corpora known to the server."""
        # INPUT: ()
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CORPUS_LIST_CORPORA)
        return self.__recv_response()

    # Historical misspelling, kept so existing callers keep working.
    corpus_list_coprora = corpus_list_corpora

    def corpus_charset(self, corpus):
        """Return the charset of <corpus>."""
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING
        self.__send_WORD(CORPUS_CHARSET)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_properties(self, corpus):
        """Return the properties of <corpus>."""
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CORPUS_PROPERTIES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_positional_attributes(self, corpus):
        """Return the positional attributes of <corpus>."""
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CORPUS_POSITIONAL_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_structural_attributes(self, corpus):
        """Return the structural attributes of <corpus>."""
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CORPUS_STRUCTURAL_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_structural_attribute_has_values(self, attribute):
        """Return whether structural <attribute> carries values."""
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def corpus_alignment_attributes(self, corpus):
        """Return the alignment attributes of <corpus>."""
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CORPUS_ALIGNMENT_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_full_name(self, corpus):
        """Return the full name of <corpus> as given in its registry entry."""
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING
        self.__send_WORD(CORPUS_FULL_NAME)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_info(self, corpus):
        """Return the .info file of <corpus> as a list of lines."""
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CORPUS_INFO)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_drop_corpus(self, corpus):
        """Try to unload <corpus> and all its attributes from memory.

        NOTE: marked as broken by the original author.
        TODO: Check what type of return value is provided by the server.
        """
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_STATUS_OK
        self.__send_WORD(CORPUS_DROP_CORPUS)
        self.__send_STRING(corpus)
        return self.__recv_response()

    # ------------------------------------------------------------------ #
    # CL commands                                                        #
    # ------------------------------------------------------------------ #

    def cl_attribute_size(self, attribute):
        """Return the size of <attribute>.

        Number of tokens (positional), regions (structural) or
        alignments (alignment).
        """
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_DATA_INT
        self.__send_WORD(CL_ATTRIBUTE_SIZE)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cl_lexicon_size(self, attribute):
        """Return the number of lexicon entries of a positional attribute.

        Valid lexicon IDs range from 0 .. (lexicon_size - 1).
        """
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_DATA_INT
        self.__send_WORD(CL_LEXICON_SIZE)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cl_drop_attribute(self, attribute):
        """Unload <attribute> from memory."""
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_STATUS_OK
        # Fixed: this used to send CL_LEXICON_SIZE, so nothing was dropped.
        self.__send_WORD(CL_DROP_ATTRIBUTE)
        self.__send_STRING(attribute)
        return self.__recv_response()

    # NOTE: simple (scalar) mappings are applied to lists (the returned
    #       list has exactly the same length as the list passed as an
    #       argument)

    def cl_str2id(self, attribute, strings):
        """Map strings to lexicon IDs (-1 for strings not in the lexicon)."""
        # INPUT: (STRING attribute, STRING_LIST strings)
        # OUTPUT: CQI_DATA_INT_LIST
        # Fixed: this used to send CL_LEXICON_SIZE followed by a string
        # list the server did not expect.
        self.__send_WORD(CL_STR2ID)
        self.__send_STRING(attribute)
        self.__send_STRING_LIST(strings)
        return self.__recv_response()

    def cl_id2str(self, attribute, id):
        """Map lexicon IDs to strings ("" for IDs that are out of range)."""
        # INPUT: (STRING attribute, INT_LIST id)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CL_ID2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id)
        return self.__recv_response()

    def cl_id2freq(self, attribute, id):
        """Map lexicon IDs to frequencies (0 for IDs out of range)."""
        # INPUT: (STRING attribute, INT_LIST id)
        # OUTPUT: CQI_DATA_INT_LIST
        self.__send_WORD(CL_ID2FREQ)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id)
        return self.__recv_response()

    def cl_cpos2id(self, attribute, cpos):
        """Map corpus positions to lexicon IDs (-1 if out of range)."""
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        # Fixed: this used to send CL_ID2FREQ and therefore returned
        # frequencies instead of lexicon IDs.
        self.__send_WORD(CL_CPOS2ID)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2str(self, attribute, cpos):
        """Map corpus positions to strings ("" if out of range)."""
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CL_CPOS2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2struc(self, attribute, cpos):
        """Map corpus positions to structure numbers.

        Returns -1 for every corpus position not inside a structure
        region.
        """
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        self.__send_WORD(CL_CPOS2STRUC)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    # NOTE: temporary addition for the Euralex2000 tutorial, but should
    #       probably be included in CQi specs

    def cl_cpos2lbound(self, attribute, cpos):
        """Return the left boundary of the s-attribute region around cpos.

        Returns -1 for positions that are not in a region.
        """
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        self.__send_WORD(CL_CPOS2LBOUND)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2rbound(self, attribute, cpos):
        """Return the right boundary of the s-attribute region around cpos.

        Returns -1 for positions that are not in a region.
        """
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        self.__send_WORD(CL_CPOS2RBOUND)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2alg(self, attribute, cpos):
        """Map corpus positions to alignments (-1 if not in an alignment)."""
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        self.__send_WORD(CL_CPOS2ALG)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_struc2str(self, attribute, strucs):
        """Return the annotated string values of structure regions.

        Returns "" for regions out of range; check
        corpus_structural_attribute_has_values() first.
        """
        # INPUT: (STRING attribute, INT_LIST strucs)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CL_STRUC2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(strucs)
        return self.__recv_response()

    # NOTE: the following mappings take a single argument and return
    #       multiple values, including lists of arbitrary size

    def cl_id2cpos(self, attribute, id):
        """Return all corpus positions where the given token occurs."""
        # INPUT: (STRING attribute, INT id)
        # OUTPUT: CQI_DATA_INT_LIST
        self.__send_WORD(CL_ID2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(id)
        return self.__recv_response()

    def cl_idlist2cpos(self, attribute, id_list):
        """Return all corpus positions where one of the tokens occurs.

        The returned list is sorted as a whole, not per token id.
        """
        # INPUT: (STRING attribute, INT_LIST id_list)
        # OUTPUT: CQI_DATA_INT_LIST
        self.__send_WORD(CL_IDLIST2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id_list)
        return self.__recv_response()

    def cl_regex2id(self, attribute, regex):
        """Return lexicon IDs of all tokens matching <regex>.

        The returned list may be empty.
        """
        # INPUT: (STRING attribute, STRING regex)
        # OUTPUT: CQI_DATA_INT_LIST
        self.__send_WORD(CL_REGEX2ID)
        self.__send_STRING(attribute)
        self.__send_STRING(regex)
        return self.__recv_response()

    def cl_struc2cpos(self, attribute, struc):
        """Return start and end corpus positions of structure region."""
        # INPUT: (STRING attribute, INT struc)
        # OUTPUT: CQI_DATA_INT_INT
        self.__send_WORD(CL_STRUC2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(struc)
        return self.__recv_response()

    def cl_alg2cpos(self, attribute, alg):
        """Return (src_start, src_end, target_start, target_end)."""
        # INPUT: (STRING attribute, INT alg)
        # OUTPUT: CQI_DATA_INT_INT_INT_INT
        self.__send_WORD(CL_ALG2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(alg)
        return self.__recv_response()

    # ------------------------------------------------------------------ #
    # CQP commands                                                       #
    # ------------------------------------------------------------------ #

    def cqp_query(self, mother_corpus, subcorpus_name, query):
        """Run <query> on <mother_corpus>, storing it as <subcorpus_name>.

        <query> must include the ';' character terminating the query.
        """
        # INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query)
        # OUTPUT: CQI_STATUS_OK
        self.__send_WORD(CQP_QUERY)
        self.__send_STRING(mother_corpus)
        self.__send_STRING(subcorpus_name)
        self.__send_STRING(query)
        return self.__recv_response()

    def cqp_list_subcorpora(self, corpus):
        """Return the subcorpora of <corpus>."""
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CQP_LIST_SUBCORPORA)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def cqp_subcorpus_size(self, subcorpus):
        """Return the size of <subcorpus>."""
        # INPUT: (STRING subcorpus)
        # OUTPUT: CQI_DATA_INT
        self.__send_WORD(CQP_SUBCORPUS_SIZE)
        self.__send_STRING(subcorpus)
        return self.__recv_response()

    def cqp_subcorpus_has_field(self, subcorpus, field):
        """Return whether <subcorpus> has the given <field>."""
        # INPUT: (STRING subcorpus, BYTE field)
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(CQP_SUBCORPUS_HAS_FIELD)
        self.__send_STRING(subcorpus)
        self.__send_BYTE(field)
        return self.__recv_response()

    def cqp_dump_subcorpus(self, subcorpus, field, first, last):
        """Dump the values of <field> for match ranges <first> .. <last>.

        <field> is one of the CQI_CONST_FIELD_* constants.
        """
        # INPUT: (STRING subcorpus, BYTE field, INT first, INT last)
        # OUTPUT: CQI_DATA_INT_LIST
        self.__send_WORD(CQP_DUMP_SUBCORPUS)
        self.__send_STRING(subcorpus)
        self.__send_BYTE(field)
        self.__send_INT(first)
        self.__send_INT(last)
        return self.__recv_response()

    def cqp_drop_subcorpus(self, subcorpus):
        """Delete <subcorpus> from memory."""
        # INPUT: (STRING subcorpus)
        # OUTPUT: CQI_STATUS_OK
        self.__send_WORD(CQP_DROP_SUBCORPUS)
        self.__send_STRING(subcorpus)
        return self.__recv_response()

    # NOTE: The following two functions are temporarily included for the
    #       Euralex 2000 tutorial demo

    def cqp_fdist_1(self, subcorpus, cutoff, field, attribute):
        """Frequency distribution of single tokens.

        Returns (id, frequency) pairs flattened into one list; <field>
        is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET,
        CQI_CONST_FIELD_KEYWORD. Pairs are sorted by frequency desc.
        """
        # INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute)
        # OUTPUT: CQI_DATA_INT_LIST
        self.__send_WORD(CQP_FDIST_1)
        self.__send_STRING(subcorpus)
        self.__send_INT(cutoff)
        self.__send_BYTE(field)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cqp_fdist_2(self, subcorpus, cutoff, field1, attribute1, field2,
                    attribute2):
        """Frequency distribution of pairs of tokens.

        Returns (id1, id2, frequency) triples flattened into one list;
        triples are sorted by frequency desc.
        """
        # INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1,
        #         BYTE field2, STRING attribute2)
        # OUTPUT: CQI_DATA_INT_LIST
        self.__send_WORD(CQP_FDIST_2)
        self.__send_STRING(subcorpus)
        self.__send_INT(cutoff)
        self.__send_BYTE(field1)
        self.__send_STRING(attribute1)
        self.__send_BYTE(field2)
        self.__send_STRING(attribute2)
        return self.__recv_response()

    # ------------------------------------------------------------------ #
    # Response decoding                                                  #
    # ------------------------------------------------------------------ #

    def __recv_response(self):
        """Read one response word and decode whatever follows it.

        The high byte of the word selects the response class: errors are
        raised, DATA responses are decoded into Python values and STATUS
        responses are returned as the raw 16-bit code.
        """
        byte_data = self.__recv_WORD()
        response_type = byte_data >> 8
        if response_type in (CL_ERROR, CQP_ERROR, ERROR):
            # Fall back to the numeric code so that an error missing from
            # the lookup table is not masked by a KeyError.
            raise Exception(
                lookup.get(byte_data,
                           'Unknown error code: {}'.format(hex(byte_data)))
            )
        if response_type == DATA:
            return self.__recv_DATA(byte_data)
        if response_type == STATUS:
            return byte_data
        raise Exception(
            'Unknown response type: {}'.format(hex(response_type))
        )

    def __recv_DATA(self, data_type):
        """Decode the payload announced by the DATA word ``data_type``."""
        if data_type == DATA_BYTE:
            return self.__recv_DATA_BYTE()
        if data_type == DATA_BOOL:
            return self.__recv_DATA_BOOL()
        if data_type == DATA_INT:
            return self.__recv_DATA_INT()
        if data_type == DATA_STRING:
            return self.__recv_DATA_STRING()
        if data_type == DATA_BYTE_LIST:
            return self.__recv_DATA_BYTE_LIST()
        if data_type == DATA_BOOL_LIST:
            return self.__recv_DATA_BOOL_LIST()
        if data_type == DATA_INT_LIST:
            return self.__recv_DATA_INT_LIST()
        if data_type == DATA_STRING_LIST:
            return self.__recv_DATA_STRING_LIST()
        if data_type == DATA_INT_INT:
            return self.__recv_DATA_INT_INT()
        if data_type == DATA_INT_INT_INT_INT:
            return self.__recv_DATA_INT_INT_INT_INT()
        if data_type == DATA_INT_TABLE:
            return self.__recv_DATA_INT_TABLE()
        raise Exception('Unknown data type: {}'.format(hex(data_type)))

    def __recv_exact(self, n):
        """Read exactly ``n`` bytes from the connection.

        Even with MSG_WAITALL a recv() call may return fewer bytes than
        requested (peer closed, interruption), so loop until complete.

        Raises:
            ConnectionError: the peer closed the connection early.
        """
        chunks = []
        remaining = n
        while remaining > 0:
            chunk = self.connection.recv(remaining, socket.MSG_WAITALL)
            if not chunk:
                raise ConnectionError(
                    'Connection closed by CQi server while {} more byte(s) '
                    'were expected'.format(remaining)
                )
            chunks.append(chunk)
            remaining -= len(chunk)
        return b''.join(chunks)

    def __recv_DATA_BYTE(self):
        """Read an unsigned 8-bit value."""
        return struct.unpack('!B', self.__recv_exact(1))[0]

    def __recv_DATA_BOOL(self):
        """Read a one-byte boolean."""
        return struct.unpack('!?', self.__recv_exact(1))[0]

    def __recv_DATA_INT(self):
        """Read a signed big-endian 32-bit integer."""
        return struct.unpack('!i', self.__recv_exact(4))[0]

    def __recv_DATA_STRING(self):
        """Read a string: 16-bit length prefix followed by its bytes."""
        n = self.__recv_WORD()
        return self.__recv_exact(n).decode()

    def __recv_DATA_BYTE_LIST(self):
        """Read a 32-bit count followed by that many bytes."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_BYTE() for _ in range(n)]

    def __recv_DATA_BOOL_LIST(self):
        """Read a 32-bit count followed by that many booleans."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_BOOL() for _ in range(n)]

    def __recv_DATA_INT_LIST(self):
        """Read a 32-bit count followed by that many integers."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_INT() for _ in range(n)]

    def __recv_DATA_STRING_LIST(self):
        """Read a 32-bit count followed by that many strings."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_STRING() for _ in range(n)]

    def __recv_DATA_INT_INT(self):
        """Read a pair of integers."""
        return (self.__recv_DATA_INT(), self.__recv_DATA_INT())

    def __recv_DATA_INT_INT_INT_INT(self):
        """Read a 4-tuple of integers."""
        return (self.__recv_DATA_INT(),
                self.__recv_DATA_INT(),
                self.__recv_DATA_INT(),
                self.__recv_DATA_INT())

    def __recv_DATA_INT_TABLE(self):
        """Read a rows x columns integer table as a list of row lists."""
        rows = self.__recv_DATA_INT()
        columns = self.__recv_DATA_INT()
        return [[self.__recv_DATA_INT() for _ in range(columns)]
                for _ in range(rows)]

    def __recv_WORD(self):
        """Read an unsigned big-endian 16-bit word."""
        return struct.unpack('!H', self.__recv_exact(2))[0]

    # ------------------------------------------------------------------ #
    # Request encoding                                                   #
    # ------------------------------------------------------------------ #

    def __send_BYTE(self, byte_data):
        """Send an unsigned 8-bit value."""
        self.connection.sendall(struct.pack('!B', byte_data))

    def __send_BOOL(self, bool_data):
        """Send a one-byte boolean."""
        self.connection.sendall(struct.pack('!?', bool_data))

    def __send_INT(self, int_data):
        """Send a signed big-endian 32-bit integer."""
        self.connection.sendall(struct.pack('!i', int_data))

    def __send_STRING(self, string_data):
        """Send a UTF-8 string prefixed with its 16-bit byte length."""
        encoded_string_data = string_data.encode('utf-8')
        # The prefix counts encoded bytes, not characters.
        n = len(encoded_string_data)
        data = struct.pack('!H{}s'.format(n), n, encoded_string_data)
        self.connection.sendall(data)

    def __send_INT_LIST(self, int_list_data):
        """Send a 32-bit count followed by each integer."""
        self.__send_INT(len(int_list_data))
        for int_data in int_list_data:
            self.__send_INT(int_data)

    def __send_STRING_LIST(self, string_list_data):
        """Send a 32-bit count followed by each string."""
        self.__send_INT(len(string_list_data))
        for string_data in string_list_data:
            self.__send_STRING(string_data)

    def __send_WORD(self, word_data):
        """Send an unsigned big-endian 16-bit word."""
        self.connection.sendall(struct.pack('!H', word_data))
self.__send_WORD(CQi.CTRL_LAST_GENERAL_ERROR) - return self.__recv_response(CQi.DATA_STRING) - - def ask_feature_cqi_1_0(self): - # INPUT: () - # OUTPUT: CQI_DATA_BOOL - # print('ASK_FEATURE_CQI_1_0') - self.__send_WORD(CQi.ASK_FEATURE_CQI_1_0) - return self.__recv_response(CQi.DATA_BOOL) - - def ask_feature_cl_2_3(self): - # INPUT: () - # OUTPUT: CQI_DATA_BOOL - # print('ASK_FEATURE_CL_2_3') - self.__send_WORD(CQi.ASK_FEATURE_CL_2_3) - return self.__recv_response(CQi.DATA_BOOL) - - def ask_feature_cqp_2_3(self): - # INPUT: () - # OUTPUT: CQI_DATA_BOOL - # print('ASK_FEATURE_CL_2_3') - self.__send_WORD(CQi.ASK_FEATURE_CL_2_3) - return self.__recv_response(CQi.DATA_BOOL) - - def corpus_list_coprora(self): - # INPUT: () - # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_LIST_CORPORA') - self.__send_WORD(CQi.CORPUS_LIST_CORPORA) - return self.__recv_response(CQi.DATA_STRING_LIST) - - def corpus_charset(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING - # print('CORPUS_CHARSET') - self.__send_WORD(CQi.CORPUS_CHARSET) - self.__send_STRING(corpus) - return self.__recv_response(CQi.DATA_STRING) - - def corpus_properties(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_PROPERTIES') - self.__send_WORD(CQi.CORPUS_PROPERTIES) - self.__send_STRING(corpus) - return self.__recv_response(CQi.DATA_STRING_LIST) - - def corpus_positional_attributes(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_POSITIONAL_ATTRIBUTES') - self.__send_WORD(CQi.CORPUS_POSITIONAL_ATTRIBUTES) - self.__send_STRING(corpus) - return self.__recv_response(CQi.DATA_STRING_LIST) - - def corpus_structural_attributes(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_STRUCTURAL_ATTRIBUTES') - self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTES) - self.__send_STRING(corpus) - return self.__recv_response(CQi.DATA_STRING_LIST) - - def 
corpus_structural_attribute_has_values(self, attribute): - # INPUT: (STRING attribute) - # OUTPUT: CQI_DATA_BOOL - # print('CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES') - self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES) - self.__send_STRING(attribute) - return self.__recv_response(CQi.DATA_BOOL) - - def corpus_alignment_attributes(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_ALIGNMENT_ATTRIBUTES') - self.__send_WORD(CQi.CORPUS_ALIGNMENT_ATTRIBUTES) - self.__send_STRING(corpus) - return self.__recv_response(CQi.DATA_STRING_LIST) - - def corpus_full_name(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING - # the full name of as specified in its registry entry - # print('CORPUS_FULL_NAME') - self.__send_WORD(CQi.CORPUS_FULL_NAME) - self.__send_STRING(corpus) - return self.__recv_response(CQi.DATA_STRING) - - def corpus_info(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # returns the contents of the .info file of as a list of lines - # print('CORPUS_INFO') - self.__send_WORD(CQi.CORPUS_INFO) - self.__send_STRING(corpus) - return self.__recv_response(CQi.DATA_STRING_LIST) - - def corpus_drop_corpus(self, corpus): - ''' - ' Broken - ' TODO: Check what type of return value is provided by the server. 
- ''' - # INPUT: (STRING corpus) - # OUTPUT: CQI_STATUS_OK - # try to unload a corpus and all its attributes from memory - # print('CORPUS_DROP_CORPUS') - self.__send_WORD(CQi.CORPUS_DROP_CORPUS) - self.__send_STRING(corpus) - return self.__recv_response(CQi.STATUS_OK) - - def cl_attribute_size(self, attribute): - # INPUT: (STRING attribute) - # OUTPUT: CQI_DATA_INT - # returns the size of : - # number of tokens (positional) - # number of regions (structural) - # number of alignments (alignment) - # print('CL_ATTRIBUTE_SIZE') - self.__send_WORD(CQi.CL_ATTRIBUTE_SIZE) - self.__send_STRING(attribute) - return self.__recv_response(CQi.DATA_INT) - - def cl_lexicon_size(self, attribute): - # INPUT: (STRING attribute) - # OUTPUT: CQI_DATA_INT - # returns the number of entries in the lexicon of a positional - # attribute; - # valid lexicon IDs range from 0 .. (lexicon_size - 1) - # print('CL_LEXICON_SIZE') - self.__send_WORD(CQi.CL_LEXICON_SIZE) - self.__send_STRING(attribute) - return self.__recv_response(CQi.DATA_INT) - - def cl_drop_attribute(self, attribute): - # INPUT: (STRING attribute) - # OUTPUT: CQI_STATUS_OK - # unload attribute from memory - # print('CL_DROP_ATTRIBUTE') - self.__send_WORD(CQi.CL_LEXICON_SIZE) - self.__send_STRING(attribute) - return self.__recv_response(CQi.STATUS_OK) - - """ - " NOTE: simple (scalar) mappings are applied to lists (the returned list - " has exactly the same length as the list passed as an argument) - """ - - def cl_str2id(self, attribute, strings): - # INPUT: (STRING attribute, STRING_LIST strings) - # OUTPUT: CQI_DATA_INT_LIST - # returns -1 for every string in that is not found in the - # lexicon - # print('CL_STR2ID') - self.__send_WORD(CQi.CL_LEXICON_SIZE) - self.__send_STRING(attribute) - self.__send_STRING_LIST(strings) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cl_id2str(self, attribute, id): - # INPUT: (STRING attribute, INT_LIST id) - # OUTPUT: CQI_DATA_STRING_LIST - # returns "" for every ID in that is 
out of range - # print('CL_ID2STR') - self.__send_WORD(CQi.CL_ID2STR) - self.__send_STRING(attribute) - self.__send_INT_LIST(id) - return self.__recv_response(CQi.DATA_STRING_LIST) - - def cl_id2freq(self, attribute, id): - # INPUT: (STRING attribute, INT_LIST id) - # OUTPUT: CQI_DATA_INT_LIST - # returns 0 for every ID in that is out of range - # print('CL_ID2FREQ') - self.__send_WORD(CQi.CL_ID2FREQ) - self.__send_STRING(attribute) - self.__send_INT_LIST(id) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cl_cpos2id(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_INT_LIST - # returns -1 for every corpus position in that is out of range - # print('CL_CPOS2ID') - self.__send_WORD(CQi.CL_ID2FREQ) - self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cl_cpos2str(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_STRING_LIST - # returns "" for every corpus position in that is out of range - # print('CL_CPOS2STR') - self.__send_WORD(CQi.CL_CPOS2STR) - self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response(CQi.DATA_STRING_LIST) - - def cl_cpos2struc(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_INT_LIST - # returns -1 for every corpus position not inside a structure region - # print('CL_CPOS2STRUC') - self.__send_WORD(CQi.CL_CPOS2STRUC) - self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response(CQi.DATA_INT_LIST) - - """ - " NOTE: temporary addition for the Euralex2000 tutorial, but should - " probably be included in CQi specs - """ - - def cl_cpos2lbound(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_INT_LIST - # returns left boundary of s-attribute region enclosing cpos, -1 if not - # in region - # print('CL_CPOS2LBOUND') - self.__send_WORD(CQi.CL_CPOS2LBOUND) - 
self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cl_cpos2rbound(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_INT_LIST - # returns right boundary of s-attribute region enclosing cpos, -1 if - # not in region - # print('CL_CPOS2RBOUND') - self.__send_WORD(CQi.CL_CPOS2RBOUND) - self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cl_cpos2alg(self, attribute, cpos): - # INPUT: (STRING attribute, INT_LIST cpos) - # OUTPUT: CQI_DATA_INT_LIST - # returns -1 for every corpus position not inside an alignment - # print('CL_CPOS2ALG') - self.__send_WORD(CQi.CL_CPOS2ALG) - self.__send_STRING(attribute) - self.__send_INT_LIST(cpos) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cl_struc2str(self, attribute, strucs): - # INPUT: (STRING attribute, INT_LIST strucs) - # OUTPUT: CQI_DATA_STRING_LIST - # returns annotated string values of structure regions in ; "" - # if out of range - # check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES() first - # print('CL_STRUC2STR') - self.__send_WORD(CQi.CL_STRUC2STR) - self.__send_STRING(attribute) - self.__send_INT_LIST(strucs) - return self.__recv_response(CQi.DATA_STRING_LIST) - - """ - " NOTE: the following mappings take a single argument and return multiple - " values, including lists of arbitrary size - """ - - def cl_id2cpos(self, attribute, id): - # INPUT: (STRING attribute, INT id) - # OUTPUT: CQI_DATA_INT_LIST - # returns all corpus positions where the given token occurs - # print('CL_ID2CPOS') - self.__send_WORD(CQi.CL_ID2CPOS) - self.__send_STRING(attribute) - self.__send_INT(id) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cl_idlist2cpos(self, attribute, id_list): - # INPUT: (STRING attribute, INT_LIST id_list) - # OUTPUT: CQI_DATA_INT_LIST - # returns all corpus positions where one of the tokens in - # occurs; the returned list is sorted 
as a whole, not per token id - # print('CL_IDLIST2CPOS') - self.__send_WORD(CQi.CL_IDLIST2CPOS) - self.__send_STRING(attribute) - self.__send_INT_LIST(id_list) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cl_regex2id(self, attribute, regex): - # INPUT: (STRING attribute, STRING regex) - # OUTPUT: CQI_DATA_INT_LIST - # returns lexicon IDs of all tokens that match ; the returned - # list may be empty (size 0); - # print('CL_REGEX2ID') - self.__send_WORD(CQi.CL_REGEX2ID) - self.__send_STRING(attribute) - self.__send_STRING(regex) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cl_struc2cpos(self, attribute, struc): - # INPUT: (STRING attribute, INT struc) - # OUTPUT: CQI_DATA_INT_INT - # returns start and end corpus positions of structure region - # print('CL_STRUC2CPOS') - self.__send_WORD(CQi.CL_STRUC2CPOS) - self.__send_STRING(attribute) - self.__send_INT(struc) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cl_alg2cpos(self, attribute, alg): - # INPUT: (STRING attribute, INT alg) - # OUTPUT: CQI_DATA_INT_INT_INT_INT - # returns (src_start, src_end, target_start, target_end) - # print('CL_ALG2CPOS') - self.__send_WORD(CQi.CL_ALG2CPOS) - self.__send_STRING(attribute) - self.__send_INT(alg) - return self.__recv_response(CQi.DATA_INT_INT_INT_INT) - - def cqp_query(self, mother_corpus, subcorpus_name, query): - # INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query) - # OUTPUT: CQI_STATUS_OK - # must include the ';' character terminating the query. 
- # print('CQP_QUERY') - self.__send_WORD(CQi.CQP_QUERY) - self.__send_STRING(mother_corpus) - self.__send_STRING(subcorpus_name) - self.__send_STRING(query) - return self.__recv_response(CQi.STATUS_OK) - - def cqp_list_subcorpora(self, corpus): - # INPUT: (STRING corpus) - # OUTPUT: CQI_DATA_STRING_LIST - # print('CQP_LIST_SUBCORPORA') - self.__send_WORD(CQi.CQP_LIST_SUBCORPORA) - self.__send_STRING(corpus) - return self.__recv_response(CQi.DATA_STRING_LIST) - - def cqp_subcorpus_size(self, subcorpus): - # INPUT: (STRING subcorpus) - # OUTPUT: CQI_DATA_INT - # print('CQP_SUBCORPUS_SIZE') - self.__send_WORD(CQi.CQP_SUBCORPUS_SIZE) - self.__send_STRING(subcorpus) - return self.__recv_response(CQi.DATA_INT) - - def cqp_subcorpus_has_field(self, subcorpus, field): - # INPUT: (STRING subcorpus, BYTE field) - # OUTPUT: CQI_DATA_BOOL - # print('CQP_SUBCORPUS_HAS_FIELD') - self.__send_WORD(CQi.CQP_SUBCORPUS_HAS_FIELD) - self.__send_STRING(subcorpus) - self.__send_BYTE(field) - return self.__recv_response(CQi.DATA_BOOL) - - def cqp_dump_subcorpus(self, subcorpus, field, first, last): - # INPUT: (STRING subcorpus, BYTE field, INT first, INT last) - # OUTPUT: CQI_DATA_INT_LIST - # Dump the values of for match ranges .. - # in . is one of the CQI_CONST_FIELD_* constants. 
- # print('CQP_DUMP_SUBCORPUS') - self.__send_WORD(CQi.CQP_DUMP_SUBCORPUS) - self.__send_STRING(subcorpus) - self.__send_BYTE(field) - self.__send_INT(first) - self.__send_INT(last) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cqp_drop_subcorpus(self, subcorpus): - # INPUT: (STRING subcorpus) - # OUTPUT: CQI_STATUS_OK - # delete a subcorpus from memory - # print('CQP_DROP_SUBCORPUS') - self.__send_WORD(CQi.CQP_DROP_SUBCORPUS) - self.__send_STRING(subcorpus) - return self.__recv_response(CQi.STATUS_OK) - - """ - " NOTE: The following two functions are temporarily included for the - " Euralex 2000 tutorial demo - """ - - def cqp_fdist_1(self, subcorpus, cutoff, field, attribute): - """ NOTE: frequency distribution of single tokens """ - # INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute) - # OUTPUT: CQI_DATA_INT_LIST - # returns (id, frequency) pairs flattened into a list of size 2* - # field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET, - # CQI_CONST_FIELD_KEYWORD - # NB: pairs are sorted by frequency desc. - # print('CQP_FDIST_1') - self.__send_WORD(CQi.CQP_FDIST_1) - self.__send_STRING(subcorpus) - self.__send_INT(cutoff) - self.__send_BYTE(field) - self.__send_STRING(attribute) - return self.__recv_response(CQi.DATA_INT_LIST) - - def cqp_fdist_2(self, subcorpus, cutoff, field1, attribute1, field2, - attribute2): - """ NOTE: frequency distribution of pairs of tokens """ - # INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1, - # BYTE field2, STRING attribute2) - # OUTPUT: CQI_DATA_INT_LIST - # returns (id1, id2, frequency) pairs flattened into a list of size - # 3* - # NB: triples are sorted by frequency desc. 
- # print('CQP_FDIST_2') - self.__send_WORD(CQi.CQP_FDIST_2) - self.__send_STRING(subcorpus) - self.__send_INT(cutoff) - self.__send_BYTE(field1) - self.__send_STRING(attribute1) - self.__send_BYTE(field2) - self.__send_STRING(attribute2) - return self.__recv_response(CQi.DATA_INT_LIST) - - def __recv_response(self, expected_response_type=None): - byte_data = self.__recv_WORD() - response_type = byte_data >> 8 - if response_type == CQi.CL_ERROR: - raise Exception(CQi.lookup[byte_data]) - elif response_type == CQi.CQP_ERROR: - raise Exception(CQi.lookup[byte_data]) - elif response_type == CQi.DATA: - if (expected_response_type is not None - and byte_data != expected_response_type): - raise Exception("Expected different response type." - + "Received: {}, ".format(byte_data) - + "Expected {}".format(expected_response_type)) - return self.__recv_DATA(byte_data) - elif response_type == CQi.ERROR: - raise Exception(CQi.lookup[byte_data]) - elif response_type == CQi.STATUS: - if (expected_response_type is not None - and byte_data != expected_response_type): - raise Exception("Expected different response type." 
- + "Received: {}, ".format(byte_data) - + "Expected {}".format(expected_response_type)) - return byte_data - else: - raise Exception( - 'Unknown response type: {}'.format(hex(response_type)) - ) - - def __recv_DATA(self, data_type): - if data_type == CQi.DATA_BYTE: - data = self.__recv_DATA_BYTE() - elif data_type == CQi.DATA_BOOL: - data = self.__recv_DATA_BOOL() - elif data_type == CQi.DATA_INT: - data = self.__recv_DATA_INT() - elif data_type == CQi.DATA_STRING: - data = self.__recv_DATA_STRING() - elif data_type == CQi.DATA_BYTE_LIST: - data = self.__recv_DATA_BYTE_LIST() - elif data_type == CQi.DATA_BOOL_LIST: - data = self.__recv_DATA_BOOL_LIST() - elif data_type == CQi.DATA_INT_LIST: - data = self.__recv_DATA_INT_LIST() - elif data_type == CQi.DATA_STRING_LIST: - data = self.__recv_DATA_STRING_LIST() - elif data_type == CQi.DATA_INT_INT: - data = self.__recv_DATA_INT_INT() - elif data_type == CQi.DATA_INT_INT_INT_INT: - data = self.__recv_DATA_INT_INT_INT_INT() - elif data_type == CQi.DATA_INT_TABLE: - data = self.__recv_DATA_INT_TABLE() - else: - raise Exception('Unknown data type: {}'.format(hex(data_type))) - return data - - def __recv_DATA_BYTE(self): - byte_data = self.connection.recv(1) - return struct.unpack('!B', byte_data)[0] - - def __recv_DATA_BOOL(self): - byte_data = self.connection.recv(1) - return struct.unpack('!?', byte_data)[0] - - def __recv_DATA_INT(self): - byte_data = self.connection.recv(4) - return struct.unpack('!i', byte_data)[0] - - def __recv_DATA_STRING(self): - n = self.__recv_WORD() - byte_data = self.connection.recv(n) - return struct.unpack('!{}s'.format(n), byte_data)[0].decode() - - def __recv_DATA_BYTE_LIST(self): - data = [] - n = self.__recv_DATA_INT() - while n > 0: - data.append(self.__recv_DATA_BYTE()) - n -= 1 - return data - - def __recv_DATA_BOOL_LIST(self): - data = [] - n = self.__recv_DATA_INT() - while n > 0: - data.append(self.__recv_DATA_BOOL()) - n -= 1 - return data - - def __recv_DATA_INT_LIST(self): - 
data = [] - n = self.__recv_DATA_INT() - while n > 0: - data.append(self.__recv_DATA_INT()) - n -= 1 - return data - - def __recv_DATA_STRING_LIST(self): - data = [] - n = self.__recv_DATA_INT() - while n > 0: - data.append(self.__recv_DATA_STRING()) - n -= 1 - return data - - def __recv_DATA_INT_INT(self): - return (self.__recv_DATA_INT(), self.__recv_DATA_INT()) - - def __recv_DATA_INT_INT_INT_INT(self): - return (self.__recv_DATA_INT(), - self.__recv_DATA_INT(), - self.__recv_DATA_INT(), - self.__recv_DATA_INT()) - - def __recv_DATA_INT_TABLE(self): - rows = self.__recv_DATA_INT() - columns = self.__recv_DATA_INT() - data = [] - for i in range(0, rows): - row = [] - for j in range(0, columns): - row.append(self.__recv_DATA_INT()) - data.append(row) - return data - - def __recv_WORD(self): - byte_data = self.connection.recv(2) - return struct.unpack('!H', byte_data)[0] - - def __send_BYTE(self, byte_data): - data = struct.pack('!B', byte_data) - self.connection.sendall(data) - - def __send_BOOL(self, bool_data): - data = struct.pack('!?', bool_data) - self.connection.sendall(data) - - def __send_INT(self, int_data): - data = struct.pack('!i', int_data) - self.connection.sendall(data) - - def __send_STRING(self, string_data): - encoded_string_data = string_data.encode('utf-8') - n = len(encoded_string_data) - data = struct.pack('!H{}s'.format(n), n, encoded_string_data) - self.connection.sendall(data) - - def __send_INT_LIST(self, int_list_data): - n = len(int_list_data) - self.__send_INT(n) - for int_data in int_list_data: - self.__send_INT(int_data) - - def __send_STRING_LIST(self, string_list_data): - n = len(string_list_data) - self.__send_INT(n) - for string_data in string_list_data: - self.__send_STRING(string_data) - - def __send_WORD(self, word_data): - data = struct.pack('!H', word_data) - self.connection.sendall(data) diff --git a/app/corpora/CQiWrapper/CQiWrapper.py b/app/corpora/CQiWrapper/CQiWrapper.py index dd0d29bc..dcd5d8f0 100644 --- 
a/app/corpora/CQiWrapper/CQiWrapper.py +++ b/app/corpora/CQiWrapper/CQiWrapper.py @@ -1,10 +1,9 @@ -from .CQiClient import CQiClient -from .CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND -import time from app import logger # only works if imported into opaque web app +from . import CQi +import time -class CQiWrapper(CQiClient): +class CQiWrapper(CQi.Client): ''' CQIiWrapper object @@ -155,11 +154,11 @@ class CQiWrapper(CQiClient): offset_end = min((self.nr_matches + result_offset - 1), self.match_count - 1) logger.warning('Offset end is: {}'.format(offset_end)) match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus, - CONST_FIELD_MATCH, + CQi.CONST_FIELD_MATCH, offset_start, offset_end), self.cqp_dump_subcorpus(self.result_subcorpus, - CONST_FIELD_MATCHEND, + CQi.CONST_FIELD_MATCHEND, offset_start, offset_end))