"""IMS CQi specification: numeric constants of the CQi protocol.

Version:     0.1a ;o)
Author:      Stefan Evert (evert@ims.uni-stuttgart.de)
Modified by: Patrick Jentsch

Every constant carries the name used in the CQi specification without its
``CQI_`` prefix (``STATUS_OK`` is ``CQI_STATUS_OK``).  One-byte values are
group codes (``STATUS``, ``DATA``, ``CTRL``, ...) or ``CONST_*`` values;
two-byte values are message codes whose high byte is the code of the group
they belong to.  ``lookup`` maps every two-byte message code back to its
full ``CQI_*`` name.
"""

# ########################################################################### #
# 1. padding                                                                  #
# ########################################################################### #
PAD = 0x00


# ########################################################################### #
# 2. CQi responses                                                            #
# ########################################################################### #

# 2.1 CQI_STATUS_*
STATUS = 0x01
STATUS_OK = 0x0101
STATUS_CONNECT_OK = 0x0102
STATUS_BYE_OK = 0x0103
STATUS_PING_OK = 0x0104

# 2.2 CQI_ERROR_*
ERROR = 0x02
ERROR_GENERAL_ERROR = 0x0201
ERROR_CONNECT_REFUSED = 0x0202
ERROR_USER_ABORT = 0x0203
ERROR_SYNTAX_ERROR = 0x0204
# includes corpus/attribute/subcorpus specifier syntax

# 2.3 CQI_DATA_*
DATA = 0x03
DATA_BYTE = 0x0301
DATA_BOOL = 0x0302
DATA_INT = 0x0303
DATA_STRING = 0x0304
DATA_BYTE_LIST = 0x0305
DATA_BOOL_LIST = 0x0306
DATA_INT_LIST = 0x0307
DATA_STRING_LIST = 0x0308
DATA_INT_INT = 0x0309
DATA_INT_INT_INT_INT = 0x030A
DATA_INT_TABLE = 0x030B

# 2.4 CQI_CL_ERROR_*
#
# NOTE: some CL error codes are not represented in the CQi specs
#   - usually because they're not used in the CL any more
#   - CDA_ENOSTRING is not considered an error (returns -1)
#   - CDA_EARGS: dynamic attribute calls not yet supported
CL_ERROR = 0x04
CL_ERROR_NO_SUCH_ATTRIBUTE = 0x0401
# returned if CQi server couldn't open attribute
CL_ERROR_WRONG_ATTRIBUTE_TYPE = 0x0402
# CDA_EATTTYPE
CL_ERROR_OUT_OF_RANGE = 0x0403
# CDA_EIDORNG, CDA_EIDXORNG, CDA_EPOSORNG
CL_ERROR_REGEX = 0x0404
# CDA_EPATTERN (not used), CDA_EBADREGEX
CL_ERROR_CORPUS_ACCESS = 0x0405
# CDA_ENODATA
CL_ERROR_OUT_OF_MEMORY = 0x0406
# CDA_ENOMEM
# this means the CQi server has run out of memory;
# try discarding some other corpora and/or subcorpora
CL_ERROR_INTERNAL = 0x0407
# CDA_EOTHER, CDA_ENYI
# this is the classical 'please contact technical support' error

# 2.5 CQI_CQP_ERROR_*
CQP_ERROR = 0x05
# CQP error messages yet to be defined
CQP_ERROR_GENERAL = 0x0501
CQP_ERROR_NO_SUCH_CORPUS = 0x0502
CQP_ERROR_INVALID_FIELD = 0x0503
CQP_ERROR_OUT_OF_RANGE = 0x0504
# various cases where a number is out of range


# ########################################################################### #
# 3. CQi commands                                                             #
# ########################################################################### #

# 3.1 CQI_CTRL_*
CTRL = 0x11
CTRL_CONNECT = 0x1101
# INPUT: (STRING username, STRING password)
# OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED
CTRL_BYE = 0x1102
# INPUT: ()
# OUTPUT: CQI_STATUS_BYE_OK
CTRL_USER_ABORT = 0x1103
# INPUT: ()
# OUTPUT: (none listed)
CTRL_PING = 0x1104
# INPUT: ()
# OUTPUT: CQI_STATUS_PING_OK
CTRL_LAST_GENERAL_ERROR = 0x1105
# INPUT: ()
# OUTPUT: CQI_DATA_STRING
# full-text error message for the last general error reported by the CQi server

# 3.2 CQI_ASK_FEATURE_*
ASK_FEATURE = 0x12
ASK_FEATURE_CQI_1_0 = 0x1201
# INPUT: ()
# OUTPUT: CQI_DATA_BOOL
ASK_FEATURE_CL_2_3 = 0x1202
# INPUT: ()
# OUTPUT: CQI_DATA_BOOL
ASK_FEATURE_CQP_2_3 = 0x1203
# INPUT: ()
# OUTPUT: CQI_DATA_BOOL

# 3.3 CQI_CORPUS_*
CORPUS = 0x13
CORPUS_LIST_CORPORA = 0x1301
# INPUT: ()
# OUTPUT: CQI_DATA_STRING_LIST
# NOTE: 0x1302 is not assigned in this file.
CORPUS_CHARSET = 0x1303
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING
CORPUS_PROPERTIES = 0x1304
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_POSITIONAL_ATTRIBUTES = 0x1305
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_STRUCTURAL_ATTRIBUTES = 0x1306
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES = 0x1307
# INPUT: (STRING attribute)
# OUTPUT: CQI_DATA_BOOL
CORPUS_ALIGNMENT_ATTRIBUTES = 0x1308
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_FULL_NAME = 0x1309
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING
# the full name of `corpus` as specified in its registry entry
CORPUS_INFO = 0x130A
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
# returns the contents of the .info file of `corpus` as a list of lines
CORPUS_DROP_CORPUS = 0x130B
# INPUT: (STRING corpus)
# OUTPUT: CQI_STATUS_OK
# try to unload a corpus and all its attributes from memory

# 3.4 CQI_CL_*
CL = 0x14
# low-level corpus access (CL functions)
CL_ATTRIBUTE_SIZE = 0x1401
# INPUT: (STRING attribute)
# OUTPUT: CQI_DATA_INT
# returns the size of `attribute`:
#   - number of tokens (positional)
#   - number of regions (structural)
#   - number of alignments (alignment)
CL_LEXICON_SIZE = 0x1402
# INPUT: (STRING attribute)
# OUTPUT: CQI_DATA_INT
# returns the number of entries in the lexicon of a positional attribute;
# valid lexicon IDs range from 0 .. (lexicon_size - 1)
CL_DROP_ATTRIBUTE = 0x1403
# INPUT: (STRING attribute)
# OUTPUT: CQI_STATUS_OK
# unload attribute from memory

# NOTE: simple (scalar) mappings are applied to lists (the returned list has
#       exactly the same length as the list passed as an argument)
CL_STR2ID = 0x1404
# INPUT: (STRING attribute, STRING_LIST strings)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every string in `strings` that is not found in the lexicon
CL_ID2STR = 0x1405
# INPUT: (STRING attribute, INT_LIST id)
# OUTPUT: CQI_DATA_STRING_LIST
# returns "" for every ID in `id` that is out of range
CL_ID2FREQ = 0x1406
# INPUT: (STRING attribute, INT_LIST id)
# OUTPUT: CQI_DATA_INT_LIST
# returns 0 for every ID in `id` that is out of range
CL_CPOS2ID = 0x1407
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every corpus position in `cpos` that is out of range
CL_CPOS2STR = 0x1408
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_STRING_LIST
# returns "" for every corpus position in `cpos` that is out of range
CL_CPOS2STRUC = 0x1409
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every corpus position not inside a structure region

# NOTE: temporary addition for the Euralex2000 tutorial, but should probably be
#       included in CQi specs
CL_CPOS2LBOUND = 0x1420
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns left boundary of s-attribute region enclosing cpos, -1 if not in
# region
CL_CPOS2RBOUND = 0x1421
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns right boundary of s-attribute region enclosing cpos, -1 if not in
# region
CL_CPOS2ALG = 0x140A
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every corpus position not inside an alignment
CL_STRUC2STR = 0x140B
# INPUT: (STRING attribute, INT_LIST strucs)
# OUTPUT: CQI_DATA_STRING_LIST
# returns annotated string values of structure regions in `strucs`; "" if out
# of range
# check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES() first

# NOTE: the following mappings take a single argument and return multiple
#       values, including lists of arbitrary size
CL_ID2CPOS = 0x140C
# INPUT: (STRING attribute, INT id)
# OUTPUT: CQI_DATA_INT_LIST
# returns all corpus positions where the given token occurs
CL_IDLIST2CPOS = 0x140D
# INPUT: (STRING attribute, INT_LIST id_list)
# OUTPUT: CQI_DATA_INT_LIST
# returns all corpus positions where one of the tokens in `id_list`
# occurs; the returned list is sorted as a whole, not per token id
CL_REGEX2ID = 0x140E
# INPUT: (STRING attribute, STRING regex)
# OUTPUT: CQI_DATA_INT_LIST
# returns lexicon IDs of all tokens that match `regex`; the returned
# list may be empty (size 0);
CL_STRUC2CPOS = 0x140F
# INPUT: (STRING attribute, INT struc)
# OUTPUT: CQI_DATA_INT_INT
# returns start and end corpus positions of structure region `struc`
CL_ALG2CPOS = 0x1410
# INPUT: (STRING attribute, INT alg)
# OUTPUT: CQI_DATA_INT_INT_INT_INT
# returns (src_start, src_end, target_start, target_end)

# 3.5 CQI_CQP_*
CQP = 0x15
CQP_QUERY = 0x1501
# INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query)
# OUTPUT: CQI_STATUS_OK
# `query` must include the ';' character terminating the query.
CQP_LIST_SUBCORPORA = 0x1502
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CQP_SUBCORPUS_SIZE = 0x1503
# INPUT: (STRING subcorpus)
# OUTPUT: CQI_DATA_INT
CQP_SUBCORPUS_HAS_FIELD = 0x1504
# INPUT: (STRING subcorpus, BYTE field)
# OUTPUT: CQI_DATA_BOOL
CQP_DUMP_SUBCORPUS = 0x1505
# INPUT: (STRING subcorpus, BYTE field, INT first, INT last)
# OUTPUT: CQI_DATA_INT_LIST
# Dump the values of `field` for match ranges `first` .. `last` in
# `subcorpus`. `field` is one of the CQI_CONST_FIELD_* constants.
CQP_DROP_SUBCORPUS = 0x1509
# INPUT: (STRING subcorpus)
# OUTPUT: CQI_STATUS_OK
# delete a subcorpus from memory

# NOTE: The following two functions are temporarily included for the Euralex
#       2000 tutorial demo

# NOTE: frequency distribution of single tokens
CQP_FDIST_1 = 0x1510
# INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute)
# OUTPUT: CQI_DATA_INT_LIST
# returns n (id, frequency) pairs flattened into a list of size 2*n
# field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET,
# CQI_CONST_FIELD_KEYWORD
# NB: pairs are sorted by frequency desc.

# NOTE: frequency distribution of pairs of tokens
CQP_FDIST_2 = 0x1511
# INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1,
#         BYTE field2, STRING attribute2)
# OUTPUT: CQI_DATA_INT_LIST
# returns n (id1, id2, frequency) triples flattened into a list of size 3*n
# NB: triples are sorted by frequency desc.


# ########################################################################### #
# 4. Constant Definitions                                                     #
# ########################################################################### #
CONST_FALSE = 0x00
CONST_NO = 0x00
CONST_TRUE = 0x01
CONST_YES = 0x01

# NOTE: The following constants specify which field will be returned by
#       CQI_CQP_DUMP_SUBCORPUS and some other subcorpus commands.
CONST_FIELD_MATCH = 0x10
CONST_FIELD_MATCHEND = 0x11

# NOTE: The constants specifying target0 .. target9 are guaranteed to have the
#       numerical values 0 .. 9, so clients do not need to look up the constant
#       values if they're handling arbitrary targets.
CONST_FIELD_TARGET_0 = 0x00
CONST_FIELD_TARGET_1 = 0x01
CONST_FIELD_TARGET_2 = 0x02
CONST_FIELD_TARGET_3 = 0x03
CONST_FIELD_TARGET_4 = 0x04
CONST_FIELD_TARGET_5 = 0x05
CONST_FIELD_TARGET_6 = 0x06
CONST_FIELD_TARGET_7 = 0x07
CONST_FIELD_TARGET_8 = 0x08
CONST_FIELD_TARGET_9 = 0x09

# NOTE: The following constants are provided for backward compatibility with
#       traditional CQP field names & while the generalised target concept
#       isn't yet implemented in the CQPserver.
CONST_FIELD_TARGET = 0x00
CONST_FIELD_KEYWORD = 0x09

# NOTE: CQi version is CQI_MAJOR_VERSION.CQI_MINOR_VERSION
MAJOR_VERSION = 0x00
MINOR_VERSION = 0x01


# ########################################################################### #
# 5. CQi lookup dictionary                                                    #
# ########################################################################### #

# Maps every two-byte message code to its specification name, e.g.
# 0x0101 -> 'CQI_STATUS_OK'.  The table is derived from the constants above
# instead of repeating each number, so the two cannot drift apart.  Message
# codes are exactly the public integer constants above 0xFF; group codes,
# CONST_*, PAD and the version numbers all fit in a single byte and are
# therefore left out.  Entries are inserted in ascending code order.
# The globals are snapshotted with tuple() before the generator runs so the
# module namespace is never iterated while it is being modified.
lookup = dict(sorted(
    (value, 'CQI_' + name)
    for name, value in tuple(globals().items())
    if not name.startswith('_')
    and name.isupper()
    and isinstance(value, int)
    and value > 0xFF
))
class CQiClient:
    """Low-level client for a CQi server.

    Each public method sends one CQi command word followed by its
    arguments and, with the exception of ``ctrl_user_abort``, reads the
    server's response.  Data responses are returned as Python values;
    error responses (CQI_ERROR_*, CQI_CL_ERROR_*, CQI_CQP_ERROR_*) are
    raised as ``Exception`` carrying the specification name of the code.

    Wire format as implemented by the private helpers: big-endian; WORD is
    an unsigned 16-bit integer, INT a signed 32-bit integer, STRING a WORD
    byte length followed by the encoded bytes, and lists an INT element
    count followed by the elements.
    """

    def __init__(self, host='127.0.0.1', port=4877):
        """Open a TCP connection to the CQi server at ``host``:``port``."""
        self.host = host
        self.port = port
        self.connection = socket.socket()
        self.connection.connect((self.host, self.port))

    # --------------------------------------------------------------------- #
    # CQI_CTRL_*                                                             #
    # --------------------------------------------------------------------- #

    def ctrl_connect(self, username, password):
        """Send CQI_CTRL_CONNECT.

        INPUT: (STRING username, STRING password)
        OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED
        """
        self.__send_WORD(CQi.CTRL_CONNECT)
        self.__send_STRING(username)
        self.__send_STRING(password)
        self.__recv_response()

    def ctrl_bye(self):
        """Send CQI_CTRL_BYE.

        INPUT: ()
        OUTPUT: CQI_STATUS_BYE_OK
        """
        self.__send_WORD(CQi.CTRL_BYE)
        self.__recv_response()

    def ctrl_user_abort(self):
        """Send CQI_CTRL_USER_ABORT.

        INPUT: ()
        OUTPUT: none listed in the specification, so no response is read.
        """
        self.__send_WORD(CQi.CTRL_USER_ABORT)

    def ctrl_ping(self):
        """Send CQI_CTRL_PING.

        INPUT: ()
        OUTPUT: CQI_STATUS_PING_OK
        """
        self.__send_WORD(CQi.CTRL_PING)
        self.__recv_response()

    def ctrl_last_general_error(self):
        """Send CQI_CTRL_LAST_GENERAL_ERROR.

        INPUT: ()
        OUTPUT: CQI_DATA_STRING
        Full-text error message for the last general error reported by the
        CQi server.
        """
        self.__send_WORD(CQi.CTRL_LAST_GENERAL_ERROR)
        return self.__recv_response()

    # --------------------------------------------------------------------- #
    # CQI_ASK_FEATURE_*                                                      #
    # --------------------------------------------------------------------- #

    def ask_feature_cqi_1_0(self):
        """Send CQI_ASK_FEATURE_CQI_1_0.

        INPUT: ()
        OUTPUT: CQI_DATA_BOOL
        """
        self.__send_WORD(CQi.ASK_FEATURE_CQI_1_0)
        return self.__recv_response()

    def ask_feature_cl_2_3(self):
        """Send CQI_ASK_FEATURE_CL_2_3.

        INPUT: ()
        OUTPUT: CQI_DATA_BOOL
        """
        self.__send_WORD(CQi.ASK_FEATURE_CL_2_3)
        return self.__recv_response()

    def ask_feature_cqp_2_3(self):
        """Send CQI_ASK_FEATURE_CQP_2_3.

        INPUT: ()
        OUTPUT: CQI_DATA_BOOL
        """
        # Fixed: this used to send ASK_FEATURE_CL_2_3.
        self.__send_WORD(CQi.ASK_FEATURE_CQP_2_3)
        return self.__recv_response()

    # --------------------------------------------------------------------- #
    # CQI_CORPUS_*                                                           #
    # --------------------------------------------------------------------- #

    def corpus_list_corpora(self):
        """Send CQI_CORPUS_LIST_CORPORA.

        INPUT: ()
        OUTPUT: CQI_DATA_STRING_LIST
        """
        self.__send_WORD(CQi.CORPUS_LIST_CORPORA)
        return self.__recv_response()

    # Misspelled original name, kept so existing callers keep working.
    corpus_list_coprora = corpus_list_corpora

    def corpus_charset(self, corpus):
        """Send CQI_CORPUS_CHARSET.

        INPUT: (STRING corpus)
        OUTPUT: CQI_DATA_STRING
        """
        self.__send_WORD(CQi.CORPUS_CHARSET)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_properties(self, corpus):
        """Send CQI_CORPUS_PROPERTIES.

        INPUT: (STRING corpus)
        OUTPUT: CQI_DATA_STRING_LIST
        """
        self.__send_WORD(CQi.CORPUS_PROPERTIES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_positional_attributes(self, corpus):
        """Send CQI_CORPUS_POSITIONAL_ATTRIBUTES.

        INPUT: (STRING corpus)
        OUTPUT: CQI_DATA_STRING_LIST
        """
        self.__send_WORD(CQi.CORPUS_POSITIONAL_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_structural_attributes(self, corpus):
        """Send CQI_CORPUS_STRUCTURAL_ATTRIBUTES.

        INPUT: (STRING corpus)
        OUTPUT: CQI_DATA_STRING_LIST
        """
        self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_structural_attribute_has_values(self, attribute):
        """Send CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES.

        INPUT: (STRING attribute)
        OUTPUT: CQI_DATA_BOOL
        """
        self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def corpus_alignment_attributes(self, corpus):
        """Send CQI_CORPUS_ALIGNMENT_ATTRIBUTES.

        INPUT: (STRING corpus)
        OUTPUT: CQI_DATA_STRING_LIST
        """
        self.__send_WORD(CQi.CORPUS_ALIGNMENT_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_full_name(self, corpus):
        """Send CQI_CORPUS_FULL_NAME.

        INPUT: (STRING corpus)
        OUTPUT: CQI_DATA_STRING
        The full name of `corpus` as specified in its registry entry.
        """
        self.__send_WORD(CQi.CORPUS_FULL_NAME)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_info(self, corpus):
        """Send CQI_CORPUS_INFO.

        INPUT: (STRING corpus)
        OUTPUT: CQI_DATA_STRING_LIST
        Returns the contents of the .info file of `corpus` as a list of
        lines.
        """
        self.__send_WORD(CQi.CORPUS_INFO)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_drop_corpus(self, corpus):
        """Send CQI_CORPUS_DROP_CORPUS.

        INPUT: (STRING corpus)
        OUTPUT: CQI_STATUS_OK
        Try to unload a corpus and all its attributes from memory.

        Marked "Broken" by the original author.
        TODO: Check what type of return value is provided by the server.
        """
        self.__send_WORD(CQi.CORPUS_DROP_CORPUS)
        self.__send_STRING(corpus)
        self.__recv_response()

    # --------------------------------------------------------------------- #
    # CQI_CL_* (low-level corpus access)                                     #
    # --------------------------------------------------------------------- #

    def cl_attribute_size(self, attribute):
        """Send CQI_CL_ATTRIBUTE_SIZE.

        INPUT: (STRING attribute)
        OUTPUT: CQI_DATA_INT
        Returns the size of `attribute`: number of tokens (positional),
        number of regions (structural), number of alignments (alignment).
        """
        self.__send_WORD(CQi.CL_ATTRIBUTE_SIZE)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cl_lexicon_size(self, attribute):
        """Send CQI_CL_LEXICON_SIZE.

        INPUT: (STRING attribute)
        OUTPUT: CQI_DATA_INT
        Returns the number of entries in the lexicon of a positional
        attribute; valid lexicon IDs range from 0 .. (lexicon_size - 1).
        """
        self.__send_WORD(CQi.CL_LEXICON_SIZE)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cl_drop_attribute(self, attribute):
        """Send CQI_CL_DROP_ATTRIBUTE.

        INPUT: (STRING attribute)
        OUTPUT: CQI_STATUS_OK
        Unload attribute from memory.
        """
        # Fixed: this used to send CL_LEXICON_SIZE.
        self.__send_WORD(CQi.CL_DROP_ATTRIBUTE)
        self.__send_STRING(attribute)
        self.__recv_response()

    # NOTE: simple (scalar) mappings are applied to lists (the returned list
    #       has exactly the same length as the list passed as an argument)

    def cl_str2id(self, attribute, strings):
        """Send CQI_CL_STR2ID.

        INPUT: (STRING attribute, STRING_LIST strings)
        OUTPUT: CQI_DATA_INT_LIST
        Returns -1 for every string in `strings` that is not found in the
        lexicon.
        """
        # Fixed: this used to send CL_LEXICON_SIZE.
        self.__send_WORD(CQi.CL_STR2ID)
        self.__send_STRING(attribute)
        self.__send_STRING_LIST(strings)
        return self.__recv_response()

    def cl_id2str(self, attribute, id):
        """Send CQI_CL_ID2STR.

        INPUT: (STRING attribute, INT_LIST id)
        OUTPUT: CQI_DATA_STRING_LIST
        Returns "" for every ID in `id` that is out of range.
        """
        self.__send_WORD(CQi.CL_ID2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id)
        return self.__recv_response()

    def cl_id2freq(self, attribute, id):
        """Send CQI_CL_ID2FREQ.

        INPUT: (STRING attribute, INT_LIST id)
        OUTPUT: CQI_DATA_INT_LIST
        Returns 0 for every ID in `id` that is out of range.
        """
        self.__send_WORD(CQi.CL_ID2FREQ)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id)
        return self.__recv_response()

    def cl_cpos2id(self, attribute, cpos):
        """Send CQI_CL_CPOS2ID.

        INPUT: (STRING attribute, INT_LIST cpos)
        OUTPUT: CQI_DATA_INT_LIST
        Returns -1 for every corpus position in `cpos` that is out of
        range.
        """
        # Fixed: this used to send CL_ID2FREQ.
        self.__send_WORD(CQi.CL_CPOS2ID)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2str(self, attribute, cpos):
        """Send CQI_CL_CPOS2STR.

        INPUT: (STRING attribute, INT_LIST cpos)
        OUTPUT: CQI_DATA_STRING_LIST
        Returns "" for every corpus position in `cpos` that is out of
        range.
        """
        self.__send_WORD(CQi.CL_CPOS2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2struc(self, attribute, cpos):
        """Send CQI_CL_CPOS2STRUC.

        INPUT: (STRING attribute, INT_LIST cpos)
        OUTPUT: CQI_DATA_INT_LIST
        Returns -1 for every corpus position not inside a structure region.
        """
        self.__send_WORD(CQi.CL_CPOS2STRUC)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    # NOTE: temporary addition for the Euralex2000 tutorial, but should
    #       probably be included in CQi specs

    def cl_cpos2lbound(self, attribute, cpos):
        """Send CQI_CL_CPOS2LBOUND.

        INPUT: (STRING attribute, INT_LIST cpos)
        OUTPUT: CQI_DATA_INT_LIST
        Returns left boundary of s-attribute region enclosing cpos, -1 if
        not in region.
        """
        self.__send_WORD(CQi.CL_CPOS2LBOUND)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2rbound(self, attribute, cpos):
        """Send CQI_CL_CPOS2RBOUND.

        INPUT: (STRING attribute, INT_LIST cpos)
        OUTPUT: CQI_DATA_INT_LIST
        Returns right boundary of s-attribute region enclosing cpos, -1 if
        not in region.
        """
        self.__send_WORD(CQi.CL_CPOS2RBOUND)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2alg(self, attribute, cpos):
        """Send CQI_CL_CPOS2ALG.

        INPUT: (STRING attribute, INT_LIST cpos)
        OUTPUT: CQI_DATA_INT_LIST
        Returns -1 for every corpus position not inside an alignment.
        """
        self.__send_WORD(CQi.CL_CPOS2ALG)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_struc2str(self, attribute, strucs):
        """Send CQI_CL_STRUC2STR.

        INPUT: (STRING attribute, INT_LIST strucs)
        OUTPUT: CQI_DATA_STRING_LIST
        Returns annotated string values of structure regions in `strucs`;
        "" if out of range.
        Check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES() first.
        """
        self.__send_WORD(CQi.CL_STRUC2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(strucs)
        return self.__recv_response()

    # NOTE: the following mappings take a single argument and return multiple
    #       values, including lists of arbitrary size

    def cl_id2cpos(self, attribute, id):
        """Send CQI_CL_ID2CPOS.

        INPUT: (STRING attribute, INT id)
        OUTPUT: CQI_DATA_INT_LIST
        Returns all corpus positions where the given token occurs.
        """
        self.__send_WORD(CQi.CL_ID2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(id)
        return self.__recv_response()

    def cl_idlist2cpos(self, attribute, id_list):
        """Send CQI_CL_IDLIST2CPOS.

        INPUT: (STRING attribute, INT_LIST id_list)
        OUTPUT: CQI_DATA_INT_LIST
        Returns all corpus positions where one of the tokens in `id_list`
        occurs; the returned list is sorted as a whole, not per token id.
        """
        self.__send_WORD(CQi.CL_IDLIST2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id_list)
        return self.__recv_response()

    def cl_regex2id(self, attribute, regex):
        """Send CQI_CL_REGEX2ID.

        INPUT: (STRING attribute, STRING regex)
        OUTPUT: CQI_DATA_INT_LIST
        Returns lexicon IDs of all tokens that match `regex`; the returned
        list may be empty (size 0).
        """
        self.__send_WORD(CQi.CL_REGEX2ID)
        self.__send_STRING(attribute)
        self.__send_STRING(regex)
        return self.__recv_response()

    def cl_struc2cpos(self, attribute, struc):
        """Send CQI_CL_STRUC2CPOS.

        INPUT: (STRING attribute, INT struc)
        OUTPUT: CQI_DATA_INT_INT
        Returns start and end corpus positions of structure region `struc`.
        """
        self.__send_WORD(CQi.CL_STRUC2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(struc)
        return self.__recv_response()

    def cl_alg2cpos(self, attribute, alg):
        """Send CQI_CL_ALG2CPOS.

        INPUT: (STRING attribute, INT alg)
        OUTPUT: CQI_DATA_INT_INT_INT_INT
        Returns (src_start, src_end, target_start, target_end).
        """
        self.__send_WORD(CQi.CL_ALG2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(alg)
        return self.__recv_response()

    # --------------------------------------------------------------------- #
    # CQI_CQP_*                                                              #
    # --------------------------------------------------------------------- #

    def cqp_query(self, mother_corpus, subcorpus_name, query):
        """Send CQI_CQP_QUERY.

        INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query)
        OUTPUT: CQI_STATUS_OK
        `query` must include the ';' character terminating the query.
        """
        self.__send_WORD(CQi.CQP_QUERY)
        self.__send_STRING(mother_corpus)
        self.__send_STRING(subcorpus_name)
        self.__send_STRING(query)
        # Fixed: the response word used to be read and thrown away, so an
        # error reported by the server for the query went unnoticed.
        self.__recv_response()

    def cqp_list_subcorpora(self, corpus):
        """Send CQI_CQP_LIST_SUBCORPORA.

        INPUT: (STRING corpus)
        OUTPUT: CQI_DATA_STRING_LIST
        """
        self.__send_WORD(CQi.CQP_LIST_SUBCORPORA)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def cqp_subcorpus_size(self, subcorpus):
        """Send CQI_CQP_SUBCORPUS_SIZE.

        INPUT: (STRING subcorpus)
        OUTPUT: CQI_DATA_INT
        """
        self.__send_WORD(CQi.CQP_SUBCORPUS_SIZE)
        self.__send_STRING(subcorpus)
        return self.__recv_response()

    def cqp_subcorpus_has_field(self, subcorpus, field):
        """Send CQI_CQP_SUBCORPUS_HAS_FIELD.

        INPUT: (STRING subcorpus, BYTE field)
        OUTPUT: CQI_DATA_BOOL
        """
        self.__send_WORD(CQi.CQP_SUBCORPUS_HAS_FIELD)
        self.__send_STRING(subcorpus)
        self.__send_BYTE(field)
        return self.__recv_response()

    def cqp_dump_subcorpus(self, subcorpus, field, first, last):
        """Send CQI_CQP_DUMP_SUBCORPUS.

        INPUT: (STRING subcorpus, BYTE field, INT first, INT last)
        OUTPUT: CQI_DATA_INT_LIST
        Dump the values of `field` for match ranges `first` .. `last` in
        `subcorpus`. `field` is one of the CQI_CONST_FIELD_* constants.
        """
        self.__send_WORD(CQi.CQP_DUMP_SUBCORPUS)
        self.__send_STRING(subcorpus)
        self.__send_BYTE(field)
        self.__send_INT(first)
        self.__send_INT(last)
        return self.__recv_response()

    def cqp_drop_subcorpus(self, subcorpus):
        """Send CQI_CQP_DROP_SUBCORPUS.

        INPUT: (STRING subcorpus)
        OUTPUT: CQI_STATUS_OK
        Delete a subcorpus from memory.
        """
        self.__send_WORD(CQi.CQP_DROP_SUBCORPUS)
        self.__send_STRING(subcorpus)
        self.__recv_response()

    # NOTE: The following two functions are temporarily included for the
    #       Euralex 2000 tutorial demo

    def cqp_fdist_1(self, subcorpus, cutoff, field, attribute):
        """Send CQI_CQP_FDIST_1 (frequency distribution of single tokens).

        INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute)
        OUTPUT: CQI_DATA_INT_LIST
        Returns n (id, frequency) pairs flattened into a list of size 2*n.
        field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET,
        CQI_CONST_FIELD_KEYWORD.
        NB: pairs are sorted by frequency desc.
        """
        self.__send_WORD(CQi.CQP_FDIST_1)
        self.__send_STRING(subcorpus)
        self.__send_INT(cutoff)
        self.__send_BYTE(field)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cqp_fdist_2(self, subcorpus, cutoff, field1, attribute1, field2,
                    attribute2):
        """Send CQI_CQP_FDIST_2 (frequency distribution of pairs of tokens).

        INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1,
                BYTE field2, STRING attribute2)
        OUTPUT: CQI_DATA_INT_LIST
        Returns n (id1, id2, frequency) triples flattened into a list of
        size 3*n.
        NB: triples are sorted by frequency desc.
        """
        self.__send_WORD(CQi.CQP_FDIST_2)
        self.__send_STRING(subcorpus)
        self.__send_INT(cutoff)
        self.__send_BYTE(field1)
        self.__send_STRING(attribute1)
        self.__send_BYTE(field2)
        self.__send_STRING(attribute2)
        return self.__recv_response()

    # --------------------------------------------------------------------- #
    # Receiving                                                              #
    # --------------------------------------------------------------------- #

    def __recv_response(self):
        """Read one response and return its Python value.

        The high byte of the response word selects the group: a status
        word is returned as is, a data word is followed by a payload that
        is decoded and returned, and the three error groups are raised.

        Raises:
            Exception: for CQI_ERROR_*, CQI_CL_ERROR_*, CQI_CQP_ERROR_*
                responses (message is the specification name of the code)
                and for an unknown response group.
        """
        word = self.__recv_WORD()
        response_type = word >> 8
        if response_type == CQi.STATUS:
            return word
        if response_type == CQi.DATA:
            return self.__recv_DATA(word)
        if response_type in (CQi.ERROR, CQi.CL_ERROR, CQi.CQP_ERROR):
            # .get(): an error code missing from the table must still be
            # reported as a CQi error, not as a KeyError.
            raise Exception(
                CQi.lookup.get(
                    word, 'Unknown error code: {}'.format(hex(word))
                )
            )
        raise Exception(
            'Unknown response type: {}'.format(hex(response_type))
        )

    def __recv_DATA(self, data_type):
        """Decode the payload announced by the CQI_DATA_* word `data_type`.

        Raises:
            Exception: if `data_type` is not a known CQI_DATA_* code.
        """
        receivers = {
            CQi.DATA_BYTE: self.__recv_DATA_BYTE,
            CQi.DATA_BOOL: self.__recv_DATA_BOOL,
            CQi.DATA_INT: self.__recv_DATA_INT,
            CQi.DATA_STRING: self.__recv_DATA_STRING,
            CQi.DATA_BYTE_LIST: self.__recv_DATA_BYTE_LIST,
            CQi.DATA_BOOL_LIST: self.__recv_DATA_BOOL_LIST,
            CQi.DATA_INT_LIST: self.__recv_DATA_INT_LIST,
            CQi.DATA_STRING_LIST: self.__recv_DATA_STRING_LIST,
            CQi.DATA_INT_INT: self.__recv_DATA_INT_INT,
            CQi.DATA_INT_INT_INT_INT: self.__recv_DATA_INT_INT_INT_INT,
            CQi.DATA_INT_TABLE: self.__recv_DATA_INT_TABLE,
        }
        try:
            receive = receivers[data_type]
        except KeyError:
            raise Exception('Unknown data type: {}'.format(hex(data_type)))
        return receive()

    def __recv_exact(self, n):
        """Return exactly `n` bytes read from the connection.

        ``socket.recv`` may hand back fewer bytes than requested, so keep
        reading until the full amount has arrived.

        Raises:
            ConnectionError: if the peer closes the connection first.
        """
        chunks = []
        remaining = n
        while remaining > 0:
            chunk = self.connection.recv(remaining)
            if not chunk:
                raise ConnectionError(
                    'Connection closed with {} of {} bytes missing'.format(
                        remaining, n
                    )
                )
            chunks.append(chunk)
            remaining -= len(chunk)
        return b''.join(chunks)

    def __recv_DATA_BYTE(self):
        """Read one unsigned byte."""
        return struct.unpack('!B', self.__recv_exact(1))[0]

    def __recv_DATA_BOOL(self):
        """Read one byte as a bool."""
        return struct.unpack('!?', self.__recv_exact(1))[0]

    def __recv_DATA_INT(self):
        """Read one big-endian signed 32-bit integer."""
        return struct.unpack('!i', self.__recv_exact(4))[0]

    def __recv_DATA_STRING(self):
        """Read a WORD length followed by that many bytes and decode them."""
        n = self.__recv_WORD()
        return self.__recv_exact(n).decode()

    def __recv_DATA_BYTE_LIST(self):
        """Read an INT count followed by that many bytes."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_BYTE() for _ in range(n)]

    def __recv_DATA_BOOL_LIST(self):
        """Read an INT count followed by that many bools."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_BOOL() for _ in range(n)]

    def __recv_DATA_INT_LIST(self):
        """Read an INT count followed by that many INTs."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_INT() for _ in range(n)]

    def __recv_DATA_STRING_LIST(self):
        """Read an INT count followed by that many STRINGs."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_STRING() for _ in range(n)]

    def __recv_DATA_INT_INT(self):
        """Read two INTs and return them as a tuple."""
        # Fixed: this used to call the non-existent __recv_INT.
        # Separate statements keep the read order explicit.
        first = self.__recv_DATA_INT()
        second = self.__recv_DATA_INT()
        return (first, second)

    def __recv_DATA_INT_INT_INT_INT(self):
        """Read four INTs and return them as a tuple."""
        # Fixed: this used to call the non-existent __recv_INT.
        first = self.__recv_DATA_INT()
        second = self.__recv_DATA_INT()
        third = self.__recv_DATA_INT()
        fourth = self.__recv_DATA_INT()
        return (first, second, third, fourth)

    def __recv_DATA_INT_TABLE(self):
        """Read INT rows, INT columns, then rows*columns INTs row by row."""
        rows = self.__recv_DATA_INT()
        columns = self.__recv_DATA_INT()
        return [
            [self.__recv_DATA_INT() for _ in range(columns)]
            for _ in range(rows)
        ]

    def __recv_WORD(self):
        """Read one big-endian unsigned 16-bit integer."""
        return struct.unpack('!H', self.__recv_exact(2))[0]

    # --------------------------------------------------------------------- #
    # Sending                                                                #
    # --------------------------------------------------------------------- #

    def __send_BYTE(self, byte_data):
        """Send one unsigned byte."""
        self.connection.sendall(struct.pack('!B', byte_data))

    def __send_BOOL(self, bool_data):
        """Send a bool as one byte."""
        self.connection.sendall(struct.pack('!?', bool_data))

    def __send_INT(self, int_data):
        """Send one big-endian signed 32-bit integer."""
        self.connection.sendall(struct.pack('!i', int_data))

    def __send_STRING(self, string_data):
        """Send a string as a WORD byte length followed by its UTF-8 bytes."""
        encoded_string_data = string_data.encode('utf-8')
        n = len(encoded_string_data)
        data = struct.pack('!H{}s'.format(n), n, encoded_string_data)
        self.connection.sendall(data)

    def __send_INT_LIST(self, int_list_data):
        """Send an INT count followed by each INT of the list."""
        self.__send_INT(len(int_list_data))
        for int_data in int_list_data:
            self.__send_INT(int_data)

    def __send_STRING_LIST(self, string_list_data):
        """Send an INT count followed by each STRING of the list."""
        self.__send_INT(len(string_list_data))
        for string_data in string_list_data:
            self.__send_STRING(string_data)

    def __send_WORD(self, word_data):
        """Send one big-endian unsigned 16-bit integer."""
        self.connection.sendall(struct.pack('!H', word_data))
+ """ + self.ctrl_connect(self.username, self.password) + + def create_attribute_strings(self, corpus_name): + self.word_str = corpus_name + '.word' + self.lemma_str = corpus_name + '.lemma' + self.pos_str = corpus_name + '.pos' + self.sem_str = corpus_name + '.sem' + self.entry_str = corpus_name + '.entry' + self.entry_author_str = self.entry_str + '_author' + self.entry_title_str = self.entry_str + '_title' + self.attributes = [self.word_str, + self.lemma_str, + self.pos_str, + self.sem_str, + self.entry_str, + self.entry_author_str, + self.entry_title_str] + + def disconnect(self): + """ + Disconnect from CQP server + + Disconnects from the CQP server. Closes used socket after disconnect. + """ + self.ctrl_bye() + self.connection.close() + + def query_subcorpus(self, corpus_name, result_subcorpus_name, query): + """ + Create subcorpus + + Input query will be used to create a subcorpus holding all cpos match + positions for that query. + + Keyword arguments: + corpus_name -- name of the corpus the query will be used on + result_subcorpus_name -- user set name of the subcorpus which holds all + cpos match positions, produced by the query + query -- query written in cqp query language + """ + self.cqp_query(corpus_name, result_subcorpus_name, query) + self.result_subcorpus_ns = (corpus_name + + ':' + + result_subcorpus_name) + self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns) + self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns) + print('Nr of all matches is:', self.nr_matches) + + def show_subcorpora(self): + print('Known subcorpora:', self.SUBCORPUS_NAMES) + return self.SUBCORPUS_NAMES + + def show_results(self, + corpus_name, + result_start_count=0, + result_max_count=50, + context_len=10,): + """ + Show query results + + Shows the actual matched strings produce by the query. Uses the cpos + match indexes to grab those strings. saves them into an orderd + dictionary. 
Also saves coresponding tags, lemmas and context: + OrderedDict([ + (0, + { + 'tokens': ['Big', 'Brother', 'himself'], + 'lemmas': ['big', 'brother', 'himself'], + 'pos_tags': ['JJ', 'NN1', 'PPX1'], + 'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|', + '|Z8m|'], + 'context_before': ['figures', 'of', 'the', 'Party', ',', + 'almost', 'on', 'a', 'level', 'with'], + 'context_after': [',', 'and', 'then', 'had', 'engaged', + 'in', 'counter-revolu-', 'tionary', + 'activities', ','], + 'entry_title': '1984', 'entry_author': + 'george_orwell', + 'cpos_start': 110490, + 'cpos_end': 110492 + } + ) + ]) + + Keyword arguments: + corpus_name -- name of the parent corpus the subcorpus is part of + result_start_count -- start position of the dumped subcorpus. + (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50 + matches 50 to 100 will be shown. + result_max_count -- defines how many matches at once will be shown. + (default 50) + context_len -- defines how many words before and after a match will be + shown (default 10) + """ + self.context_len = context_len + word_str = corpus_name + '.word' + self.corpus_max_len = self.cl_attribute_size(word_str) + if self.nr_matches == 0: + print('Query resulted in 0 matches.') + else: + if self.nr_matches <= 50: + matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns, + 0x10, + 0, + self.nr_matches - 1) + matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns, + 0x11, + 0, self.nr_matches - 1) + else: + matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns, + 0x10, + result_start_count, + result_max_count - 1) + matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns, + 0x11, + result_start_count, + result_max_count - 1) + match_indexes = zip(matches_start, matches_end) + + matches = [] + manager = multiprocessing.Manager() + return_dict = manager.dict() + for i, index_pair in enumerate(match_indexes): + match = multiprocessing.Process(target=self.__get_matches, + args=(i, + 
index_pair, + corpus_name, + return_dict)) + matches.append(match) + match.start() + for match in matches: + match.join() + # sort matches into ordered dict + ordered_results = collections.OrderedDict() + for key in sorted(return_dict.keys()): + ordered_results[key] = return_dict[key] + print('ORDERED_RESULTS', ordered_results) + + def __get_matches(self, i, index_pair, corpus_name, return_dict): + """ + Get matches as readable output + + Gets the actual match strings of cpos match indexes. Private helper + method used in show_results. + + Keyword arguments: + i -- serial number for match at given cpos + index_pair -- match start and match end cpos + corpus_name -- name of the parent corpus + return_dict -- dictionary created with manager.dict() that holds the + extracted strings tags etc. + """ + print('START:', index_pair[0]) + print('END:', index_pair[1]) + print('=============================') + tmp_session = CQiWrapper(username=self.username, password=self.password, + host=self.host, port=self.port) + tmp_session.connect() + tokens = tmp_session.cl_cpos2str(self.word_str, + range(index_pair[0], + index_pair[1] + 1)) + lemmas = tmp_session.cl_cpos2str(self.lemma_str, + range(index_pair[0], + index_pair[1] + 1)) + pos_tags = tmp_session.cl_cpos2str(self.pos_str, + range(index_pair[0], + index_pair[1] + 1)) + sem_tags = tmp_session.cl_cpos2str(self.sem_str, + range(index_pair[0], + index_pair[1] + 1)) + struc_entry = tmp_session.cl_cpos2struc(self.entry_str, + range(index_pair[0], + index_pair[1] + 1)) + before_index = max([0, index_pair[0] - self.context_len]) + after_index = min([self.corpus_max_len, + index_pair[1] + self.context_len]) + context_before = tmp_session.cl_cpos2str(self.word_str, + range(before_index, + index_pair[0])) + context_after = tmp_session.cl_cpos2str(self.word_str, + range(index_pair[1] + 1, + after_index + 1)) + entry_titles = tmp_session.cl_struc2str(self.entry_title_str, + struc_entry) + entry_authors = 
tmp_session.cl_struc2str(self.entry_author_str, + struc_entry) + return_dict[i] = {'tokens': tokens, + 'lemmas': lemmas, + 'pos_tags': pos_tags, + 'sem_tags': sem_tags, + 'context_before': context_before, + 'context_after': context_after, + 'entry_title': entry_titles[0], + 'entry_author': entry_authors[0], + 'cpos_start': index_pair[0], + 'cpos_end': index_pair[1]} + tmp_session.disconnect() + + def get_cpos_info(self, cpos): + match_dict = collections.OrderedDict() + for attribute in self.attributes: + if '.entry' not in attribute: + match_str = self.cl_cpos2str(attribute, range(cpos[0], cpos[1])) + match_dict[attribute] = match_str + else: + continue + print(match_dict)