diff --git a/app/corpora/CQiWrapper/CQi.py b/app/corpora/CQiWrapper/CQi.py
new file mode 100644
index 00000000..5d39395a
--- /dev/null
+++ b/app/corpora/CQiWrapper/CQi.py
@@ -0,0 +1,406 @@
+# ########################################################################### #
+# IMS CQi specification #
+# #
+# Version: 0.1a ;o) #
+# Author: Stefan Evert (evert@ims.uni-stuttgart.de) #
+# Modified by: Patrick Jentsch #
+# ########################################################################### #
+
+
+# CQi protocol constants: padding byte, response codes, command opcodes and
+# field/boolean constants. Two-byte codes are <group byte><code byte>; the
+# one-byte group values (STATUS, ERROR, DATA, ...) equal the high byte of the
+# codes defined beneath them.
+""" 1. padding """
+PAD = 0x00
+
+
+""" 2. CQi responses """
+""" 2.1 CQI_STATUS_* """
+STATUS = 0x01
+STATUS_OK = 0x0101
+STATUS_CONNECT_OK = 0x0102
+STATUS_BYE_OK = 0x0103
+STATUS_PING_OK = 0x0104
+
+""" 2.2 CQI_ERROR_* """
+ERROR = 0x02
+ERROR_GENERAL_ERROR = 0x0201
+ERROR_CONNECT_REFUSED = 0x0202
+ERROR_USER_ABORT = 0x0203
+ERROR_SYNTAX_ERROR = 0x0204
+# includes corpus/attribute/subcorpus specifier syntax
+
+""" 2.3 CQI_DATA_* """
+DATA = 0x03
+DATA_BYTE = 0x0301
+DATA_BOOL = 0x0302
+DATA_INT = 0x0303
+DATA_STRING = 0x0304
+DATA_BYTE_LIST = 0x0305
+DATA_BOOL_LIST = 0x0306
+DATA_INT_LIST = 0x0307
+DATA_STRING_LIST = 0x0308
+DATA_INT_INT = 0x0309
+DATA_INT_INT_INT_INT = 0x030A
+DATA_INT_TABLE = 0x030B
+
+""" 2.4 CQI_CL_ERROR_* """
+"""
+" NOTE: some CL error codes are not represented in the CQi specs
+" - usually because they're not used in the CL any more
+" - CDA_ENOSTRING is not considered an error (returns -1)
+" - CDA_EARGS: dynamic attribute calls not yet supported
+"""
+CL_ERROR = 0x04
+CL_ERROR_NO_SUCH_ATTRIBUTE = 0x0401
+# returned if CQi server couldn't open attribute
+CL_ERROR_WRONG_ATTRIBUTE_TYPE = 0x0402
+# CDA_EATTTYPE
+CL_ERROR_OUT_OF_RANGE = 0x0403
+# CDA_EIDORNG, CDA_EIDXORNG, CDA_EPOSORNG
+CL_ERROR_REGEX = 0x0404
+# CDA_EPATTERN (not used), CDA_EBADREGEX
+CL_ERROR_CORPUS_ACCESS = 0x0405
+# CDA_ENODATA
+CL_ERROR_OUT_OF_MEMORY = 0x0406
+# CDA_ENOMEM
+# this means the CQi server has run out of memory;
+# try discarding some other corpora and/or subcorpora
+CL_ERROR_INTERNAL = 0x0407
+# CDA_EOTHER, CDA_ENYI
+# this is the classical 'please contact technical support' error
+
+""" 2.5 CQI_CQP_ERROR_* """
+CQP_ERROR = 0x05
+# CQP error messages yet to be defined
+CQP_ERROR_GENERAL = 0x0501
+CQP_ERROR_NO_SUCH_CORPUS = 0x0502
+CQP_ERROR_INVALID_FIELD = 0x0503
+CQP_ERROR_OUT_OF_RANGE = 0x0504
+# various cases where a number is out of range
+
+
+""" 3. CQi commands """
+""" 3.1 CQI_CTRL_* """
+CTRL = 0x11
+CTRL_CONNECT = 0x1101
+# INPUT: (STRING username, STRING password)
+# OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED
+CTRL_BYE = 0x1102
+# INPUT: ()
+# OUTPUT: CQI_STATUS_BYE_OK
+CTRL_USER_ABORT = 0x1103
+# INPUT: ()
+# OUTPUT:
+CTRL_PING = 0x1104
+# INPUT: ()
+# OUTPUT: CQI_STATUS_PING_OK
+CTRL_LAST_GENERAL_ERROR = 0x1105
+# INPUT: ()
+# OUTPUT: CQI_DATA_STRING
+# full-text error message for the last general error reported by the CQi server
+
+""" 3.2 CQI_ASK_FEATURE_* """
+ASK_FEATURE = 0x12
+ASK_FEATURE_CQI_1_0 = 0x1201
+# INPUT: ()
+# OUTPUT: CQI_DATA_BOOL
+ASK_FEATURE_CL_2_3 = 0x1202
+# INPUT: ()
+# OUTPUT: CQI_DATA_BOOL
+ASK_FEATURE_CQP_2_3 = 0x1203
+# INPUT: ()
+# OUTPUT: CQI_DATA_BOOL
+
+""" 3.3 CQI_CORPUS_* """
+CORPUS = 0x13
+CORPUS_LIST_CORPORA = 0x1301
+# INPUT: ()
+# OUTPUT: CQI_DATA_STRING_LIST
+CORPUS_CHARSET = 0x1303
+# INPUT: (STRING corpus)
+# OUTPUT: CQI_DATA_STRING
+CORPUS_PROPERTIES = 0x1304
+# INPUT: (STRING corpus)
+# OUTPUT: CQI_DATA_STRING_LIST
+CORPUS_POSITIONAL_ATTRIBUTES = 0x1305
+# INPUT: (STRING corpus)
+# OUTPUT: CQI_DATA_STRING_LIST
+CORPUS_STRUCTURAL_ATTRIBUTES = 0x1306
+# INPUT: (STRING corpus)
+# OUTPUT: CQI_DATA_STRING_LIST
+CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES = 0x1307
+# INPUT: (STRING attribute)
+# OUTPUT: CQI_DATA_BOOL
+CORPUS_ALIGNMENT_ATTRIBUTES = 0x1308
+# INPUT: (STRING corpus)
+# OUTPUT: CQI_DATA_STRING_LIST
+CORPUS_FULL_NAME = 0x1309
+# INPUT: (STRING corpus)
+# OUTPUT: CQI_DATA_STRING
+# the full name of <corpus> as specified in its registry entry
+CORPUS_INFO = 0x130A
+# INPUT: (STRING corpus)
+# OUTPUT: CQI_DATA_STRING_LIST
+# returns the contents of the .info file of <corpus> as a list of lines
+CORPUS_DROP_CORPUS = 0x130B
+# INPUT: (STRING corpus)
+# OUTPUT: CQI_STATUS_OK
+# try to unload a corpus and all its attributes from memory
+
+""" 3.4 CQI_CL_* """
+CL = 0x14
+# low-level corpus access (CL functions)
+CL_ATTRIBUTE_SIZE = 0x1401
+# INPUT: (STRING attribute)
+# OUTPUT: CQI_DATA_INT
+# returns the size of <attribute>:
+# - number of tokens (positional)
+# - number of regions (structural)
+# - number of alignments (alignment)
+CL_LEXICON_SIZE = 0x1402
+# INPUT: (STRING attribute)
+# OUTPUT: CQI_DATA_INT
+# returns the number of entries in the lexicon of a positional attribute;
+# valid lexicon IDs range from 0 .. (lexicon_size - 1)
+CL_DROP_ATTRIBUTE = 0x1403
+# INPUT: (STRING attribute)
+# OUTPUT: CQI_STATUS_OK
+# unload attribute from memory
+"""
+" NOTE: simple (scalar) mappings are applied to lists (the returned list has
+" exactly the same length as the list passed as an argument)
+"""
+CL_STR2ID = 0x1404
+# INPUT: (STRING attribute, STRING_LIST strings)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns -1 for every string in <strings> that is not found in the lexicon
+CL_ID2STR = 0x1405
+# INPUT: (STRING attribute, INT_LIST id)
+# OUTPUT: CQI_DATA_STRING_LIST
+# returns "" for every ID in <id> that is out of range
+CL_ID2FREQ = 0x1406
+# INPUT: (STRING attribute, INT_LIST id)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns 0 for every ID in <id> that is out of range
+CL_CPOS2ID = 0x1407
+# INPUT: (STRING attribute, INT_LIST cpos)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns -1 for every corpus position in <cpos> that is out of range
+CL_CPOS2STR = 0x1408
+# INPUT: (STRING attribute, INT_LIST cpos)
+# OUTPUT: CQI_DATA_STRING_LIST
+# returns "" for every corpus position in <cpos> that is out of range
+CL_CPOS2STRUC = 0x1409
+# INPUT: (STRING attribute, INT_LIST cpos)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns -1 for every corpus position not inside a structure region
+"""
+" NOTE: temporary addition for the Euralex2000 tutorial, but should probably be
+" included in CQi specs
+"""
+CL_CPOS2LBOUND = 0x1420
+# INPUT: (STRING attribute, INT_LIST cpos)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns left boundary of s-attribute region enclosing cpos, -1 if not in
+# region
+CL_CPOS2RBOUND = 0x1421
+# INPUT: (STRING attribute, INT_LIST cpos)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns right boundary of s-attribute region enclosing cpos, -1 if not in
+# region
+CL_CPOS2ALG = 0x140A
+# INPUT: (STRING attribute, INT_LIST cpos)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns -1 for every corpus position not inside an alignment
+CL_STRUC2STR = 0x140B
+# INPUT: (STRING attribute, INT_LIST strucs)
+# OUTPUT: CQI_DATA_STRING_LIST
+# returns annotated string values of structure regions in <strucs>; "" if out
+# of range
+# check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES(<attribute>) first
+"""
+" NOTE: the following mappings take a single argument and return multiple
+" values, including lists of arbitrary size
+"""
+CL_ID2CPOS = 0x140C
+# INPUT: (STRING attribute, INT id)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns all corpus positions where the given token occurs
+CL_IDLIST2CPOS = 0x140D
+# INPUT: (STRING attribute, INT_LIST id_list)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns all corpus positions where one of the tokens in <id_list>
+# occurs; the returned list is sorted as a whole, not per token id
+CL_REGEX2ID = 0x140E
+# INPUT: (STRING attribute, STRING regex)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns lexicon IDs of all tokens that match <regex>; the returned
+# list may be empty (size 0);
+CL_STRUC2CPOS = 0x140F
+# INPUT: (STRING attribute, INT struc)
+# OUTPUT: CQI_DATA_INT_INT
+# returns start and end corpus positions of structure region <struc>
+CL_ALG2CPOS = 0x1410
+# INPUT: (STRING attribute, INT alg)
+# OUTPUT: CQI_DATA_INT_INT_INT_INT
+# returns (src_start, src_end, target_start, target_end)
+
+""" 3.5 CQI_CQP_* """
+CQP = 0x15
+CQP_QUERY = 0x1501
+# INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query)
+# OUTPUT: CQI_STATUS_OK
+# <query> must include the ';' character terminating the query.
+CQP_LIST_SUBCORPORA = 0x1502
+# INPUT: (STRING corpus)
+# OUTPUT: CQI_DATA_STRING_LIST
+CQP_SUBCORPUS_SIZE = 0x1503
+# INPUT: (STRING subcorpus)
+# OUTPUT: CQI_DATA_INT
+CQP_SUBCORPUS_HAS_FIELD = 0x1504
+# INPUT: (STRING subcorpus, BYTE field)
+# OUTPUT: CQI_DATA_BOOL
+CQP_DUMP_SUBCORPUS = 0x1505
+# INPUT: (STRING subcorpus, BYTE field, INT first, INT last)
+# OUTPUT: CQI_DATA_INT_LIST
+# Dump the values of <field> for match ranges <first> .. <last> in
+# <subcorpus>. <field> is one of the CQI_CONST_FIELD_* constants.
+CQP_DROP_SUBCORPUS = 0x1509
+# INPUT: (STRING subcorpus)
+# OUTPUT: CQI_STATUS_OK
+# delete a subcorpus from memory
+"""
+" NOTE: The following two functions are temporarily included for the Euralex
+" 2000 tutorial demo
+"""
+""" NOTE: frequency distribution of single tokens """
+CQP_FDIST_1 = 0x1510
+# INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns <n> (id, frequency) pairs flattened into a list of size 2*<n>
+# field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET,
+# CQI_CONST_FIELD_KEYWORD
+# NB: pairs are sorted by frequency desc.
+""" NOTE: frequency distribution of pairs of tokens """
+CQP_FDIST_2 = 0x1511
+# INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1,
+# BYTE field2, STRING attribute2)
+# OUTPUT: CQI_DATA_INT_LIST
+# returns <n> (id1, id2, frequency) triples flattened into a list of size
+# 3*<n>
+# NB: triples are sorted by frequency desc.
+
+
+""" 4. Constant Definitions """
+CONST_FALSE = 0x00
+CONST_NO = 0x00
+CONST_TRUE = 0x01
+CONST_YES = 0x01
+"""
+" NOTE: The following constants specify which field will be returned by
+" CQI_CQP_DUMP_SUBCORPUS and some other subcorpus commands.
+"""
+CONST_FIELD_MATCH = 0x10
+CONST_FIELD_MATCHEND = 0x11
+"""
+" NOTE: The constants specifiying target0 .. target9 are guaranteed to have the
+" numerical values 0 .. 9, so clients do not need to look up the constant
+" values if they're handling arbitrary targets.
+"""
+CONST_FIELD_TARGET_0 = 0x00
+CONST_FIELD_TARGET_1 = 0x01
+CONST_FIELD_TARGET_2 = 0x02
+CONST_FIELD_TARGET_3 = 0x03
+CONST_FIELD_TARGET_4 = 0x04
+CONST_FIELD_TARGET_5 = 0x05
+CONST_FIELD_TARGET_6 = 0x06
+CONST_FIELD_TARGET_7 = 0x07
+CONST_FIELD_TARGET_8 = 0x08
+CONST_FIELD_TARGET_9 = 0x09
+"""
+" NOTE: The following constants are provided for backward compatibility with
+" traditional CQP field names & while the generalised target concept
+" isn't yet implemented in the CQPserver.
+"""
+# CONST_FIELD_TARGET shares its value with CONST_FIELD_TARGET_0 and
+# CONST_FIELD_KEYWORD with CONST_FIELD_TARGET_9.
+CONST_FIELD_TARGET = 0x00
+CONST_FIELD_KEYWORD = 0x09
+""" NOTE: CQi version is CQI_MAJOR_VERSION.CQI_MINOR_VERSION """
+MAJOR_VERSION = 0x00
+MINOR_VERSION = 0x01
+
+""" 5. CQi lookup dictionary. """
+# Maps every two-byte CQi response/command code to its symbolic CQI_* name.
+# Keys are the named constants defined above rather than decimal literals,
+# so the table cannot drift away from the constant definitions.
+lookup = {
+    STATUS_OK: 'CQI_STATUS_OK',
+    STATUS_CONNECT_OK: 'CQI_STATUS_CONNECT_OK',
+    STATUS_BYE_OK: 'CQI_STATUS_BYE_OK',
+    STATUS_PING_OK: 'CQI_STATUS_PING_OK',
+    ERROR_GENERAL_ERROR: 'CQI_ERROR_GENERAL_ERROR',
+    ERROR_CONNECT_REFUSED: 'CQI_ERROR_CONNECT_REFUSED',
+    ERROR_USER_ABORT: 'CQI_ERROR_USER_ABORT',
+    ERROR_SYNTAX_ERROR: 'CQI_ERROR_SYNTAX_ERROR',
+    DATA_BYTE: 'CQI_DATA_BYTE',
+    DATA_BOOL: 'CQI_DATA_BOOL',
+    DATA_INT: 'CQI_DATA_INT',
+    DATA_STRING: 'CQI_DATA_STRING',
+    DATA_BYTE_LIST: 'CQI_DATA_BYTE_LIST',
+    DATA_BOOL_LIST: 'CQI_DATA_BOOL_LIST',
+    DATA_INT_LIST: 'CQI_DATA_INT_LIST',
+    DATA_STRING_LIST: 'CQI_DATA_STRING_LIST',
+    DATA_INT_INT: 'CQI_DATA_INT_INT',
+    DATA_INT_INT_INT_INT: 'CQI_DATA_INT_INT_INT_INT',
+    DATA_INT_TABLE: 'CQI_DATA_INT_TABLE',
+    CL_ERROR_NO_SUCH_ATTRIBUTE: 'CQI_CL_ERROR_NO_SUCH_ATTRIBUTE',
+    CL_ERROR_WRONG_ATTRIBUTE_TYPE: 'CQI_CL_ERROR_WRONG_ATTRIBUTE_TYPE',
+    CL_ERROR_OUT_OF_RANGE: 'CQI_CL_ERROR_OUT_OF_RANGE',
+    CL_ERROR_REGEX: 'CQI_CL_ERROR_REGEX',
+    CL_ERROR_CORPUS_ACCESS: 'CQI_CL_ERROR_CORPUS_ACCESS',
+    CL_ERROR_OUT_OF_MEMORY: 'CQI_CL_ERROR_OUT_OF_MEMORY',
+    CL_ERROR_INTERNAL: 'CQI_CL_ERROR_INTERNAL',
+    CQP_ERROR_GENERAL: 'CQI_CQP_ERROR_GENERAL',
+    CQP_ERROR_NO_SUCH_CORPUS: 'CQI_CQP_ERROR_NO_SUCH_CORPUS',
+    CQP_ERROR_INVALID_FIELD: 'CQI_CQP_ERROR_INVALID_FIELD',
+    CQP_ERROR_OUT_OF_RANGE: 'CQI_CQP_ERROR_OUT_OF_RANGE',
+    CTRL_CONNECT: 'CQI_CTRL_CONNECT',
+    CTRL_BYE: 'CQI_CTRL_BYE',
+    CTRL_USER_ABORT: 'CQI_CTRL_USER_ABORT',
+    CTRL_PING: 'CQI_CTRL_PING',
+    CTRL_LAST_GENERAL_ERROR: 'CQI_CTRL_LAST_GENERAL_ERROR',
+    ASK_FEATURE_CQI_1_0: 'CQI_ASK_FEATURE_CQI_1_0',
+    ASK_FEATURE_CL_2_3: 'CQI_ASK_FEATURE_CL_2_3',
+    ASK_FEATURE_CQP_2_3: 'CQI_ASK_FEATURE_CQP_2_3',
+    CORPUS_LIST_CORPORA: 'CQI_CORPUS_LIST_CORPORA',
+    CORPUS_CHARSET: 'CQI_CORPUS_CHARSET',
+    CORPUS_PROPERTIES: 'CQI_CORPUS_PROPERTIES',
+    CORPUS_POSITIONAL_ATTRIBUTES: 'CQI_CORPUS_POSITIONAL_ATTRIBUTES',
+    CORPUS_STRUCTURAL_ATTRIBUTES: 'CQI_CORPUS_STRUCTURAL_ATTRIBUTES',
+    CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES:
+        'CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES',
+    CORPUS_ALIGNMENT_ATTRIBUTES: 'CQI_CORPUS_ALIGNMENT_ATTRIBUTES',
+    CORPUS_FULL_NAME: 'CQI_CORPUS_FULL_NAME',
+    CORPUS_INFO: 'CQI_CORPUS_INFO',
+    CORPUS_DROP_CORPUS: 'CQI_CORPUS_DROP_CORPUS',
+    CL_ATTRIBUTE_SIZE: 'CQI_CL_ATTRIBUTE_SIZE',
+    CL_LEXICON_SIZE: 'CQI_CL_LEXICON_SIZE',
+    CL_DROP_ATTRIBUTE: 'CQI_CL_DROP_ATTRIBUTE',
+    CL_STR2ID: 'CQI_CL_STR2ID',
+    CL_ID2STR: 'CQI_CL_ID2STR',
+    CL_ID2FREQ: 'CQI_CL_ID2FREQ',
+    CL_CPOS2ID: 'CQI_CL_CPOS2ID',
+    CL_CPOS2STR: 'CQI_CL_CPOS2STR',
+    CL_CPOS2STRUC: 'CQI_CL_CPOS2STRUC',
+    CL_CPOS2ALG: 'CQI_CL_CPOS2ALG',
+    CL_STRUC2STR: 'CQI_CL_STRUC2STR',
+    CL_ID2CPOS: 'CQI_CL_ID2CPOS',
+    CL_IDLIST2CPOS: 'CQI_CL_IDLIST2CPOS',
+    CL_REGEX2ID: 'CQI_CL_REGEX2ID',
+    CL_STRUC2CPOS: 'CQI_CL_STRUC2CPOS',
+    CL_ALG2CPOS: 'CQI_CL_ALG2CPOS',
+    CL_CPOS2LBOUND: 'CQI_CL_CPOS2LBOUND',
+    CL_CPOS2RBOUND: 'CQI_CL_CPOS2RBOUND',
+    CQP_QUERY: 'CQI_CQP_QUERY',
+    CQP_LIST_SUBCORPORA: 'CQI_CQP_LIST_SUBCORPORA',
+    CQP_SUBCORPUS_SIZE: 'CQI_CQP_SUBCORPUS_SIZE',
+    CQP_SUBCORPUS_HAS_FIELD: 'CQI_CQP_SUBCORPUS_HAS_FIELD',
+    CQP_DUMP_SUBCORPUS: 'CQI_CQP_DUMP_SUBCORPUS',
+    CQP_DROP_SUBCORPUS: 'CQI_CQP_DROP_SUBCORPUS',
+    CQP_FDIST_1: 'CQI_CQP_FDIST_1',
+    CQP_FDIST_2: 'CQI_CQP_FDIST_2'
+}
diff --git a/app/corpora/CQiWrapper/CQiClient.py b/app/corpora/CQiWrapper/CQiClient.py
new file mode 100644
index 00000000..39a24c4c
--- /dev/null
+++ b/app/corpora/CQiWrapper/CQiClient.py
@@ -0,0 +1,611 @@
+from . import CQi
+import socket
+import struct
+
+
+class CQiClient:
+    """
+    Low-level CQi client.
+
+    Opens a TCP connection to a CQi server on construction and exposes one
+    method per CQi command. Each method sends the two-byte command word
+    followed by its arguments and, where the protocol defines a reply, reads
+    it with __recv_response(). That helper returns the status word or the
+    decoded data payload and raises Exception for CQI_ERROR_*,
+    CQI_CL_ERROR_* and CQI_CQP_ERROR_* replies.
+    """
+
+    def __init__(self, host='127.0.0.1', port=4877):
+        """Open a TCP connection to the CQi server at host:port."""
+        self.host = host
+        self.port = port
+        self.connection = socket.socket()
+        try:
+            self.connection.connect((self.host, self.port))
+        except OSError:
+            # Do not leak the socket when the server cannot be reached.
+            self.connection.close()
+            raise
+
+    def ctrl_connect(self, username, password):
+        """
+        INPUT: (STRING username, STRING password)
+        OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED
+        """
+        self.__send_WORD(CQi.CTRL_CONNECT)
+        self.__send_STRING(username)
+        self.__send_STRING(password)
+        self.__recv_response()
+
+    def ctrl_bye(self):
+        """
+        INPUT: ()
+        OUTPUT: CQI_STATUS_BYE_OK
+        """
+        self.__send_WORD(CQi.CTRL_BYE)
+        self.__recv_response()
+
+    def ctrl_user_abort(self):
+        """
+        INPUT: ()
+        OUTPUT: (none; no reply is read for this command)
+        """
+        self.__send_WORD(CQi.CTRL_USER_ABORT)
+
+    def ctrl_ping(self):
+        """
+        INPUT: ()
+        OUTPUT: CQI_STATUS_PING_OK
+        """
+        self.__send_WORD(CQi.CTRL_PING)
+        self.__recv_response()
+
+    def ctrl_last_general_error(self):
+        """
+        INPUT: ()
+        OUTPUT: CQI_DATA_STRING
+        full-text error message for the last general error reported by the
+        CQi server
+        """
+        self.__send_WORD(CQi.CTRL_LAST_GENERAL_ERROR)
+        return self.__recv_response()
+
+    def ask_feature_cqi_1_0(self):
+        """
+        INPUT: ()
+        OUTPUT: CQI_DATA_BOOL
+        """
+        self.__send_WORD(CQi.ASK_FEATURE_CQI_1_0)
+        return self.__recv_response()
+
+    def ask_feature_cl_2_3(self):
+        """
+        INPUT: ()
+        OUTPUT: CQI_DATA_BOOL
+        """
+        self.__send_WORD(CQi.ASK_FEATURE_CL_2_3)
+        return self.__recv_response()
+
+    def ask_feature_cqp_2_3(self):
+        """
+        INPUT: ()
+        OUTPUT: CQI_DATA_BOOL
+        """
+        # Previously sent ASK_FEATURE_CL_2_3 and so reported the wrong
+        # feature.
+        self.__send_WORD(CQi.ASK_FEATURE_CQP_2_3)
+        return self.__recv_response()
+
+    def corpus_list_corpora(self):
+        """
+        INPUT: ()
+        OUTPUT: CQI_DATA_STRING_LIST
+        """
+        self.__send_WORD(CQi.CORPUS_LIST_CORPORA)
+        return self.__recv_response()
+
+    # Misspelled original name, kept so existing callers continue to work.
+    corpus_list_coprora = corpus_list_corpora
+
+    def corpus_charset(self, corpus):
+        """
+        INPUT: (STRING corpus)
+        OUTPUT: CQI_DATA_STRING
+        """
+        self.__send_WORD(CQi.CORPUS_CHARSET)
+        self.__send_STRING(corpus)
+        return self.__recv_response()
+
+    def corpus_properties(self, corpus):
+        """
+        INPUT: (STRING corpus)
+        OUTPUT: CQI_DATA_STRING_LIST
+        """
+        self.__send_WORD(CQi.CORPUS_PROPERTIES)
+        self.__send_STRING(corpus)
+        return self.__recv_response()
+
+    def corpus_positional_attributes(self, corpus):
+        """
+        INPUT: (STRING corpus)
+        OUTPUT: CQI_DATA_STRING_LIST
+        """
+        self.__send_WORD(CQi.CORPUS_POSITIONAL_ATTRIBUTES)
+        self.__send_STRING(corpus)
+        return self.__recv_response()
+
+    def corpus_structural_attributes(self, corpus):
+        """
+        INPUT: (STRING corpus)
+        OUTPUT: CQI_DATA_STRING_LIST
+        """
+        self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTES)
+        self.__send_STRING(corpus)
+        return self.__recv_response()
+
+    def corpus_structural_attribute_has_values(self, attribute):
+        """
+        INPUT: (STRING attribute)
+        OUTPUT: CQI_DATA_BOOL
+        """
+        self.__send_WORD(CQi.CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES)
+        self.__send_STRING(attribute)
+        return self.__recv_response()
+
+    def corpus_alignment_attributes(self, corpus):
+        """
+        INPUT: (STRING corpus)
+        OUTPUT: CQI_DATA_STRING_LIST
+        """
+        self.__send_WORD(CQi.CORPUS_ALIGNMENT_ATTRIBUTES)
+        self.__send_STRING(corpus)
+        return self.__recv_response()
+
+    def corpus_full_name(self, corpus):
+        """
+        INPUT: (STRING corpus)
+        OUTPUT: CQI_DATA_STRING
+        the full name of <corpus> as specified in its registry entry
+        """
+        self.__send_WORD(CQi.CORPUS_FULL_NAME)
+        self.__send_STRING(corpus)
+        return self.__recv_response()
+
+    def corpus_info(self, corpus):
+        """
+        INPUT: (STRING corpus)
+        OUTPUT: CQI_DATA_STRING_LIST
+        returns the contents of the .info file of <corpus> as a list of
+        lines
+        """
+        self.__send_WORD(CQi.CORPUS_INFO)
+        self.__send_STRING(corpus)
+        return self.__recv_response()
+
+    def corpus_drop_corpus(self, corpus):
+        """
+        INPUT: (STRING corpus)
+        OUTPUT: CQI_STATUS_OK
+        try to unload a corpus and all its attributes from memory
+
+        Marked as broken by the original author.
+        TODO: Check what type of return value is provided by the server.
+        """
+        self.__send_WORD(CQi.CORPUS_DROP_CORPUS)
+        self.__send_STRING(corpus)
+        self.__recv_response()
+
+    def cl_attribute_size(self, attribute):
+        """
+        INPUT: (STRING attribute)
+        OUTPUT: CQI_DATA_INT
+        returns the size of <attribute>:
+            number of tokens (positional)
+            number of regions (structural)
+            number of alignments (alignment)
+        """
+        self.__send_WORD(CQi.CL_ATTRIBUTE_SIZE)
+        self.__send_STRING(attribute)
+        return self.__recv_response()
+
+    def cl_lexicon_size(self, attribute):
+        """
+        INPUT: (STRING attribute)
+        OUTPUT: CQI_DATA_INT
+        returns the number of entries in the lexicon of a positional
+        attribute; valid lexicon IDs range from 0 .. (lexicon_size - 1)
+        """
+        self.__send_WORD(CQi.CL_LEXICON_SIZE)
+        self.__send_STRING(attribute)
+        return self.__recv_response()
+
+    def cl_drop_attribute(self, attribute):
+        """
+        INPUT: (STRING attribute)
+        OUTPUT: CQI_STATUS_OK
+        unload attribute from memory
+        """
+        # Previously sent CL_LEXICON_SIZE, so nothing was ever dropped.
+        self.__send_WORD(CQi.CL_DROP_ATTRIBUTE)
+        self.__send_STRING(attribute)
+        self.__recv_response()
+
+    # NOTE: simple (scalar) mappings are applied to lists (the returned list
+    # has exactly the same length as the list passed as an argument)
+
+    def cl_str2id(self, attribute, strings):
+        """
+        INPUT: (STRING attribute, STRING_LIST strings)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns -1 for every string in <strings> that is not found in the
+        lexicon
+        """
+        # Previously sent CL_LEXICON_SIZE followed by a string list the
+        # server did not expect.
+        self.__send_WORD(CQi.CL_STR2ID)
+        self.__send_STRING(attribute)
+        self.__send_STRING_LIST(strings)
+        return self.__recv_response()
+
+    def cl_id2str(self, attribute, id):
+        """
+        INPUT: (STRING attribute, INT_LIST id)
+        OUTPUT: CQI_DATA_STRING_LIST
+        returns "" for every ID in <id> that is out of range
+        """
+        self.__send_WORD(CQi.CL_ID2STR)
+        self.__send_STRING(attribute)
+        self.__send_INT_LIST(id)
+        return self.__recv_response()
+
+    def cl_id2freq(self, attribute, id):
+        """
+        INPUT: (STRING attribute, INT_LIST id)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns 0 for every ID in <id> that is out of range
+        """
+        self.__send_WORD(CQi.CL_ID2FREQ)
+        self.__send_STRING(attribute)
+        self.__send_INT_LIST(id)
+        return self.__recv_response()
+
+    def cl_cpos2id(self, attribute, cpos):
+        """
+        INPUT: (STRING attribute, INT_LIST cpos)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns -1 for every corpus position in <cpos> that is out of range
+        """
+        # Previously sent CL_ID2FREQ, which treated cpos as lexicon IDs.
+        self.__send_WORD(CQi.CL_CPOS2ID)
+        self.__send_STRING(attribute)
+        self.__send_INT_LIST(cpos)
+        return self.__recv_response()
+
+    def cl_cpos2str(self, attribute, cpos):
+        """
+        INPUT: (STRING attribute, INT_LIST cpos)
+        OUTPUT: CQI_DATA_STRING_LIST
+        returns "" for every corpus position in <cpos> that is out of range
+        """
+        self.__send_WORD(CQi.CL_CPOS2STR)
+        self.__send_STRING(attribute)
+        self.__send_INT_LIST(cpos)
+        return self.__recv_response()
+
+    def cl_cpos2struc(self, attribute, cpos):
+        """
+        INPUT: (STRING attribute, INT_LIST cpos)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns -1 for every corpus position not inside a structure region
+        """
+        self.__send_WORD(CQi.CL_CPOS2STRUC)
+        self.__send_STRING(attribute)
+        self.__send_INT_LIST(cpos)
+        return self.__recv_response()
+
+    # NOTE: temporary addition for the Euralex2000 tutorial, but should
+    # probably be included in CQi specs
+
+    def cl_cpos2lbound(self, attribute, cpos):
+        """
+        INPUT: (STRING attribute, INT_LIST cpos)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns left boundary of s-attribute region enclosing cpos, -1 if
+        not in region
+        """
+        self.__send_WORD(CQi.CL_CPOS2LBOUND)
+        self.__send_STRING(attribute)
+        self.__send_INT_LIST(cpos)
+        return self.__recv_response()
+
+    def cl_cpos2rbound(self, attribute, cpos):
+        """
+        INPUT: (STRING attribute, INT_LIST cpos)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns right boundary of s-attribute region enclosing cpos, -1 if
+        not in region
+        """
+        self.__send_WORD(CQi.CL_CPOS2RBOUND)
+        self.__send_STRING(attribute)
+        self.__send_INT_LIST(cpos)
+        return self.__recv_response()
+
+    def cl_cpos2alg(self, attribute, cpos):
+        """
+        INPUT: (STRING attribute, INT_LIST cpos)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns -1 for every corpus position not inside an alignment
+        """
+        self.__send_WORD(CQi.CL_CPOS2ALG)
+        self.__send_STRING(attribute)
+        self.__send_INT_LIST(cpos)
+        return self.__recv_response()
+
+    def cl_struc2str(self, attribute, strucs):
+        """
+        INPUT: (STRING attribute, INT_LIST strucs)
+        OUTPUT: CQI_DATA_STRING_LIST
+        returns annotated string values of structure regions in <strucs>;
+        "" if out of range
+        check corpus_structural_attribute_has_values(<attribute>) first
+        """
+        self.__send_WORD(CQi.CL_STRUC2STR)
+        self.__send_STRING(attribute)
+        self.__send_INT_LIST(strucs)
+        return self.__recv_response()
+
+    # NOTE: the following mappings take a single argument and return
+    # multiple values, including lists of arbitrary size
+
+    def cl_id2cpos(self, attribute, id):
+        """
+        INPUT: (STRING attribute, INT id)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns all corpus positions where the given token occurs
+        """
+        self.__send_WORD(CQi.CL_ID2CPOS)
+        self.__send_STRING(attribute)
+        self.__send_INT(id)
+        return self.__recv_response()
+
+    def cl_idlist2cpos(self, attribute, id_list):
+        """
+        INPUT: (STRING attribute, INT_LIST id_list)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns all corpus positions where one of the tokens in <id_list>
+        occurs; the returned list is sorted as a whole, not per token id
+        """
+        self.__send_WORD(CQi.CL_IDLIST2CPOS)
+        self.__send_STRING(attribute)
+        self.__send_INT_LIST(id_list)
+        return self.__recv_response()
+
+    def cl_regex2id(self, attribute, regex):
+        """
+        INPUT: (STRING attribute, STRING regex)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns lexicon IDs of all tokens that match <regex>; the returned
+        list may be empty (size 0)
+        """
+        self.__send_WORD(CQi.CL_REGEX2ID)
+        self.__send_STRING(attribute)
+        self.__send_STRING(regex)
+        return self.__recv_response()
+
+    def cl_struc2cpos(self, attribute, struc):
+        """
+        INPUT: (STRING attribute, INT struc)
+        OUTPUT: CQI_DATA_INT_INT
+        returns start and end corpus positions of structure region <struc>
+        """
+        self.__send_WORD(CQi.CL_STRUC2CPOS)
+        self.__send_STRING(attribute)
+        self.__send_INT(struc)
+        return self.__recv_response()
+
+    def cl_alg2cpos(self, attribute, alg):
+        """
+        INPUT: (STRING attribute, INT alg)
+        OUTPUT: CQI_DATA_INT_INT_INT_INT
+        returns (src_start, src_end, target_start, target_end)
+        """
+        self.__send_WORD(CQi.CL_ALG2CPOS)
+        self.__send_STRING(attribute)
+        self.__send_INT(alg)
+        return self.__recv_response()
+
+    def cqp_query(self, mother_corpus, subcorpus_name, query):
+        """
+        INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query)
+        OUTPUT: CQI_STATUS_OK
+        <query> must include the ';' character terminating the query.
+        """
+        self.__send_WORD(CQi.CQP_QUERY)
+        self.__send_STRING(mother_corpus)
+        self.__send_STRING(subcorpus_name)
+        self.__send_STRING(query)
+        # Read the reply through __recv_response() so that an error reply
+        # raises, as for every other command, instead of being read as a
+        # bare word and dropped.
+        self.__recv_response()
+
+    def cqp_list_subcorpora(self, corpus):
+        """
+        INPUT: (STRING corpus)
+        OUTPUT: CQI_DATA_STRING_LIST
+        """
+        self.__send_WORD(CQi.CQP_LIST_SUBCORPORA)
+        self.__send_STRING(corpus)
+        return self.__recv_response()
+
+    def cqp_subcorpus_size(self, subcorpus):
+        """
+        INPUT: (STRING subcorpus)
+        OUTPUT: CQI_DATA_INT
+        """
+        self.__send_WORD(CQi.CQP_SUBCORPUS_SIZE)
+        self.__send_STRING(subcorpus)
+        return self.__recv_response()
+
+    def cqp_subcorpus_has_field(self, subcorpus, field):
+        """
+        INPUT: (STRING subcorpus, BYTE field)
+        OUTPUT: CQI_DATA_BOOL
+        """
+        self.__send_WORD(CQi.CQP_SUBCORPUS_HAS_FIELD)
+        self.__send_STRING(subcorpus)
+        self.__send_BYTE(field)
+        return self.__recv_response()
+
+    def cqp_dump_subcorpus(self, subcorpus, field, first, last):
+        """
+        INPUT: (STRING subcorpus, BYTE field, INT first, INT last)
+        OUTPUT: CQI_DATA_INT_LIST
+        Dump the values of <field> for match ranges <first> .. <last> in
+        <subcorpus>. <field> is one of the CQI_CONST_FIELD_* constants.
+        """
+        self.__send_WORD(CQi.CQP_DUMP_SUBCORPUS)
+        self.__send_STRING(subcorpus)
+        self.__send_BYTE(field)
+        self.__send_INT(first)
+        self.__send_INT(last)
+        return self.__recv_response()
+
+    def cqp_drop_subcorpus(self, subcorpus):
+        """
+        INPUT: (STRING subcorpus)
+        OUTPUT: CQI_STATUS_OK
+        delete a subcorpus from memory
+        """
+        self.__send_WORD(CQi.CQP_DROP_SUBCORPUS)
+        self.__send_STRING(subcorpus)
+        self.__recv_response()
+
+    # NOTE: The following two functions are temporarily included for the
+    # Euralex 2000 tutorial demo
+
+    def cqp_fdist_1(self, subcorpus, cutoff, field, attribute):
+        """
+        NOTE: frequency distribution of single tokens
+
+        INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns <n> (id, frequency) pairs flattened into a list of size
+        2*<n>
+        field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET,
+        CQI_CONST_FIELD_KEYWORD
+        NB: pairs are sorted by frequency desc.
+        """
+        self.__send_WORD(CQi.CQP_FDIST_1)
+        self.__send_STRING(subcorpus)
+        self.__send_INT(cutoff)
+        self.__send_BYTE(field)
+        self.__send_STRING(attribute)
+        return self.__recv_response()
+
+    def cqp_fdist_2(self, subcorpus, cutoff, field1, attribute1, field2,
+                    attribute2):
+        """
+        NOTE: frequency distribution of pairs of tokens
+
+        INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1,
+                BYTE field2, STRING attribute2)
+        OUTPUT: CQI_DATA_INT_LIST
+        returns <n> (id1, id2, frequency) triples flattened into a list of
+        size 3*<n>
+        NB: triples are sorted by frequency desc.
+        """
+        self.__send_WORD(CQi.CQP_FDIST_2)
+        self.__send_STRING(subcorpus)
+        self.__send_INT(cutoff)
+        self.__send_BYTE(field1)
+        self.__send_STRING(attribute1)
+        self.__send_BYTE(field2)
+        self.__send_STRING(attribute2)
+        return self.__recv_response()
+
+    def __recv_response(self):
+        """
+        Read one reply from the server.
+
+        Returns the status word for CQI_STATUS_* replies and the decoded
+        payload for CQI_DATA_* replies. Raises Exception carrying the
+        symbolic name of the code for CQI_ERROR_*, CQI_CL_ERROR_* and
+        CQI_CQP_ERROR_* replies, and for unknown response types.
+        """
+        byte_data = self.__recv_WORD()
+        # The high byte of the reply word selects the response group.
+        response_type = byte_data >> 8
+        if response_type == CQi.STATUS:
+            return byte_data
+        if response_type == CQi.DATA:
+            return self.__recv_DATA(byte_data)
+        if response_type in (CQi.ERROR, CQi.CL_ERROR, CQi.CQP_ERROR):
+            # Fall back to the raw code so that an error code missing from
+            # the lookup table is still reported, not turned into a KeyError.
+            raise Exception(CQi.lookup.get(byte_data, hex(byte_data)))
+        raise Exception(
+            'Unknown response type: {}'.format(hex(response_type))
+        )
+
+    def __recv_DATA(self, data_type):
+        """Decode the payload that follows a CQI_DATA_* reply word."""
+        readers = {
+            CQi.DATA_BYTE: self.__recv_DATA_BYTE,
+            CQi.DATA_BOOL: self.__recv_DATA_BOOL,
+            CQi.DATA_INT: self.__recv_DATA_INT,
+            CQi.DATA_STRING: self.__recv_DATA_STRING,
+            CQi.DATA_BYTE_LIST: self.__recv_DATA_BYTE_LIST,
+            CQi.DATA_BOOL_LIST: self.__recv_DATA_BOOL_LIST,
+            CQi.DATA_INT_LIST: self.__recv_DATA_INT_LIST,
+            CQi.DATA_STRING_LIST: self.__recv_DATA_STRING_LIST,
+            CQi.DATA_INT_INT: self.__recv_DATA_INT_INT,
+            CQi.DATA_INT_INT_INT_INT: self.__recv_DATA_INT_INT_INT_INT,
+            CQi.DATA_INT_TABLE: self.__recv_DATA_INT_TABLE,
+        }
+        reader = readers.get(data_type)
+        if reader is None:
+            raise Exception('Unknown data type: {}'.format(hex(data_type)))
+        return reader()
+
+    def __recv_exact(self, n):
+        """
+        Read exactly n bytes from the socket.
+
+        socket.recv() may return fewer bytes than requested, so keep reading
+        until the full amount has arrived. Raises ConnectionError if the
+        peer closes the connection before n bytes were received.
+        """
+        chunks = []
+        remaining = n
+        while remaining > 0:
+            chunk = self.connection.recv(remaining)
+            if not chunk:
+                raise ConnectionError('Connection closed by CQi server')
+            chunks.append(chunk)
+            remaining -= len(chunk)
+        return b''.join(chunks)
+
+    def __recv_DATA_BYTE(self):
+        """Read one unsigned byte."""
+        return struct.unpack('!B', self.__recv_exact(1))[0]
+
+    def __recv_DATA_BOOL(self):
+        """Read one byte as a bool."""
+        return struct.unpack('!?', self.__recv_exact(1))[0]
+
+    def __recv_DATA_INT(self):
+        """Read one signed 32-bit big-endian integer."""
+        return struct.unpack('!i', self.__recv_exact(4))[0]
+
+    def __recv_DATA_STRING(self):
+        """Read a string: a 16-bit length word followed by that many bytes."""
+        n = self.__recv_WORD()
+        return self.__recv_exact(n).decode()
+
+    def __recv_DATA_BYTE_LIST(self):
+        """Read a 32-bit element count followed by that many bytes."""
+        n = self.__recv_DATA_INT()
+        return [self.__recv_DATA_BYTE() for _ in range(n)]
+
+    def __recv_DATA_BOOL_LIST(self):
+        """Read a 32-bit element count followed by that many bools."""
+        n = self.__recv_DATA_INT()
+        return [self.__recv_DATA_BOOL() for _ in range(n)]
+
+    def __recv_DATA_INT_LIST(self):
+        """Read a 32-bit element count followed by that many integers."""
+        n = self.__recv_DATA_INT()
+        return [self.__recv_DATA_INT() for _ in range(n)]
+
+    def __recv_DATA_STRING_LIST(self):
+        """Read a 32-bit element count followed by that many strings."""
+        n = self.__recv_DATA_INT()
+        return [self.__recv_DATA_STRING() for _ in range(n)]
+
+    def __recv_DATA_INT_INT(self):
+        """Read two integers and return them as a tuple."""
+        # Previously called a nonexistent __recv_INT and raised
+        # AttributeError on every CQI_DATA_INT_INT reply.
+        first = self.__recv_DATA_INT()
+        second = self.__recv_DATA_INT()
+        return (first, second)
+
+    def __recv_DATA_INT_INT_INT_INT(self):
+        """Read four integers and return them as a tuple."""
+        # Previously called a nonexistent __recv_INT (AttributeError).
+        first = self.__recv_DATA_INT()
+        second = self.__recv_DATA_INT()
+        third = self.__recv_DATA_INT()
+        fourth = self.__recv_DATA_INT()
+        return (first, second, third, fourth)
+
+    def __recv_DATA_INT_TABLE(self):
+        """Read row and column counts, then the table as a list of rows."""
+        rows = self.__recv_DATA_INT()
+        columns = self.__recv_DATA_INT()
+        return [
+            [self.__recv_DATA_INT() for _ in range(columns)]
+            for _ in range(rows)
+        ]
+
+    def __recv_WORD(self):
+        """Read one unsigned 16-bit big-endian word."""
+        return struct.unpack('!H', self.__recv_exact(2))[0]
+
+    def __send_BYTE(self, byte_data):
+        """Send one unsigned byte."""
+        self.connection.sendall(struct.pack('!B', byte_data))
+
+    def __send_BOOL(self, bool_data):
+        """Send one bool as a single byte."""
+        self.connection.sendall(struct.pack('!?', bool_data))
+
+    def __send_INT(self, int_data):
+        """Send one signed 32-bit big-endian integer."""
+        self.connection.sendall(struct.pack('!i', int_data))
+
+    def __send_STRING(self, string_data):
+        """Send a string: 16-bit byte length, then the UTF-8 encoded bytes."""
+        encoded_string_data = string_data.encode('utf-8')
+        n = len(encoded_string_data)
+        data = struct.pack('!H{}s'.format(n), n, encoded_string_data)
+        self.connection.sendall(data)
+
+    def __send_INT_LIST(self, int_list_data):
+        """Send a 32-bit element count followed by each integer."""
+        self.__send_INT(len(int_list_data))
+        for int_data in int_list_data:
+            self.__send_INT(int_data)
+
+    def __send_STRING_LIST(self, string_list_data):
+        """Send a 32-bit element count followed by each string."""
+        self.__send_INT(len(string_list_data))
+        for string_data in string_list_data:
+            self.__send_STRING(string_data)
+
+    def __send_WORD(self, word_data):
+        """Send one unsigned 16-bit big-endian word."""
+        self.connection.sendall(struct.pack('!H', word_data))
diff --git a/app/corpora/CQiWrapper/CQiWrapper.py b/app/corpora/CQiWrapper/CQiWrapper.py
new file mode 100644
index 00000000..4739a655
--- /dev/null
+++ b/app/corpora/CQiWrapper/CQiWrapper.py
@@ -0,0 +1,240 @@
+from .CQiClient import CQiClient
+import multiprocessing
+import collections
+import socket
+
+
+ class CQiWrapper(CQiClient):
+     """
+     CQiWrapper object
+
+     High level wrapper that groups and renames some functions of CQiClient
+     for ease of use. Also structures received data into python dictionaries.
+
+     Keyword arguments:
+     host -- host of the cqp server (default '127.0.0.1')
+     port -- port of the cqp server (default 4877)
+     username -- username used to connect to the cqp server
+     password -- password of the user to connect to the cqp server
+     """
+
+     # Kept as a class attribute for backward compatibility only. Every
+     # instance gets its own list in __init__, so the names of one session
+     # do not leak into other sessions.
+     SUBCORPUS_NAMES = []
+
+     def __init__(self, host='127.0.0.1', port=4877, username='opaque',
+                  password='opaque'):
+         super(CQiWrapper, self).__init__(host=host, port=port)
+         self.username = username
+         self.password = password
+         self.SUBCORPUS_NAMES = []
+
+     def connect(self):
+         """
+         Connect with CQP server
+
+         Connects via socket to the CQP server using the given username and
+         password from class initiation.
+         """
+         self.ctrl_connect(self.username, self.password)
+
+     def create_attribute_strings(self, corpus_name):
+         """
+         Create attribute names
+
+         Builds the fully qualified attribute names ('<corpus>.<attribute>')
+         used by the other methods and stores them on the instance.
+
+         Keyword arguments:
+         corpus_name -- name of the corpus the attributes belong to
+         """
+         self.word_str = corpus_name + '.word'
+         self.lemma_str = corpus_name + '.lemma'
+         self.pos_str = corpus_name + '.pos'
+         self.sem_str = corpus_name + '.sem'
+         self.entry_str = corpus_name + '.entry'
+         self.entry_author_str = self.entry_str + '_author'
+         self.entry_title_str = self.entry_str + '_title'
+         self.attributes = [self.word_str,
+                            self.lemma_str,
+                            self.pos_str,
+                            self.sem_str,
+                            self.entry_str,
+                            self.entry_author_str,
+                            self.entry_title_str]
+
+     def disconnect(self):
+         """
+         Disconnect from CQP server
+
+         Disconnects from the CQP server. Closes used socket after disconnect,
+         even if saying goodbye to the server fails.
+         """
+         try:
+             self.ctrl_bye()
+         finally:
+             self.connection.close()
+
+     def query_subcorpus(self, corpus_name, result_subcorpus_name, query):
+         """
+         Create subcorpus
+
+         Input query will be used to create a subcorpus holding all cpos match
+         positions for that query.
+
+         Keyword arguments:
+         corpus_name -- name of the corpus the query will be used on
+         result_subcorpus_name -- user set name of the subcorpus which holds all
+         cpos match positions, produced by the query
+         query -- query written in cqp query language
+         """
+         self.cqp_query(corpus_name, result_subcorpus_name, query)
+         self.result_subcorpus_ns = corpus_name + ':' + result_subcorpus_name
+         self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
+         self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus_ns)
+         print('Nr of all matches is:', self.nr_matches)
+
+     def show_subcorpora(self):
+         """
+         Show known subcorpora
+
+         Prints and returns the names of all subcorpora created with
+         query_subcorpus on this object.
+         """
+         print('Known subcorpora:', self.SUBCORPUS_NAMES)
+         return self.SUBCORPUS_NAMES
+
+     def show_results(self,
+                      corpus_name,
+                      result_start_count=0,
+                      result_max_count=50,
+                      context_len=10,):
+         """
+         Show query results
+
+         Shows the actual matched strings produced by the query. Uses the cpos
+         match indexes to grab those strings. Saves them into an ordered
+         dictionary. Also saves corresponding tags, lemmas and context:
+         OrderedDict([
+             (0,
+              {
+                 'tokens': ['Big', 'Brother', 'himself'],
+                 'lemmas': ['big', 'brother', 'himself'],
+                 'pos_tags': ['JJ', 'NN1', 'PPX1'],
+                 'sem_tags': ['|A11.1+|N3.2+|N5+|', '|S2.2m|S4m|S9/S2.2m|',
+                              '|Z8m|'],
+                 'context_before': ['figures', 'of', 'the', 'Party', ',',
+                                    'almost', 'on', 'a', 'level', 'with'],
+                 'context_after': [',', 'and', 'then', 'had', 'engaged',
+                                   'in', 'counter-revolu-', 'tionary',
+                                   'activities', ','],
+                 'entry_title': '1984', 'entry_author':
+                 'george_orwell',
+                 'cpos_start': 110490,
+                 'cpos_end': 110492
+              }
+              )
+         ])
+
+         Has to be called after query_subcorpus, which sets the subcorpus name
+         and the number of matches used here.
+
+         Keyword arguments:
+         corpus_name -- name of the parent corpus the subcorpus is part of
+         result_start_count -- start position of the dumped subcorpus.
+         (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
+         matches 50 to 100 will be shown.
+         result_max_count -- defines how many matches at once will be shown.
+         (default 50)
+         context_len -- defines how many words before and after a match will be
+         shown (default 10)
+
+         Returns the ordered dictionary described above. It is empty if there
+         are no matches in the requested window.
+         """
+         # Anchor fields understood by cqp_dump_subcorpus; the values are
+         # presumably CQi's CONST_FIELD_MATCH / CONST_FIELD_MATCHEND.
+         field_match = 0x10
+         field_matchend = 0x11
+
+         self.context_len = context_len
+         # __get_matches reads the attribute names from the instance, so make
+         # sure they exist and belong to the corpus that is displayed.
+         self.create_attribute_strings(corpus_name)
+         self.corpus_max_len = self.cl_attribute_size(self.word_str)
+
+         ordered_results = collections.OrderedDict()
+         if self.nr_matches == 0:
+             print('Query resulted in 0 matches.')
+             return ordered_results
+
+         # Inclusive window of match numbers, clamped to the existing matches.
+         first_match = max(0, result_start_count)
+         last_match = min(first_match + result_max_count, self.nr_matches) - 1
+         if last_match < first_match:
+             # Requested window lies completely behind the last match.
+             return ordered_results
+
+         matches_start = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
+                                                 field_match,
+                                                 first_match,
+                                                 last_match)
+         matches_end = self.cqp_dump_subcorpus(self.result_subcorpus_ns,
+                                               field_matchend,
+                                               first_match,
+                                               last_match)
+         match_indexes = zip(matches_start, matches_end)
+
+         # One worker process (with its own server session) per match; the
+         # workers write their result into the shared manager dictionary.
+         workers = []
+         manager = multiprocessing.Manager()
+         return_dict = manager.dict()
+         for i, index_pair in enumerate(match_indexes):
+             worker = multiprocessing.Process(target=self.__get_matches,
+                                              args=(i,
+                                                    index_pair,
+                                                    corpus_name,
+                                                    return_dict))
+             workers.append(worker)
+             worker.start()
+         for worker in workers:
+             worker.join()
+
+         # sort matches into ordered dict
+         for key in sorted(return_dict.keys()):
+             ordered_results[key] = return_dict[key]
+         print('ORDERED_RESULTS', ordered_results)
+         return ordered_results
+
+     def __get_matches(self, i, index_pair, corpus_name, return_dict):
+         """
+         Get matches as readable output
+
+         Gets the actual match strings of cpos match indexes. Private helper
+         method used in show_results. Opens its own session to the server and
+         closes it again, even if one of the requests fails.
+
+         Keyword arguments:
+         i -- serial number for match at given cpos
+         index_pair -- match start and match end cpos
+         corpus_name -- name of the parent corpus
+         return_dict -- dictionary created with manager.dict() that holds the
+         extracted strings tags etc.
+         """
+         match_start = index_pair[0]
+         match_end = index_pair[1]
+         print('START:', match_start)
+         print('END:', match_end)
+         print('=============================')
+
+         match_cpos = range(match_start, match_end + 1)
+         # Context windows are clamped to the corpus. The last valid corpus
+         # position is corpus_max_len - 1.
+         before_index = max(0, match_start - self.context_len)
+         after_index = min(self.corpus_max_len - 1,
+                           match_end + self.context_len)
+
+         tmp_session = CQiWrapper(username=self.username,
+                                  password=self.password,
+                                  host=self.host,
+                                  port=self.port)
+         tmp_session.connect()
+         try:
+             tokens = tmp_session.cl_cpos2str(self.word_str, match_cpos)
+             lemmas = tmp_session.cl_cpos2str(self.lemma_str, match_cpos)
+             pos_tags = tmp_session.cl_cpos2str(self.pos_str, match_cpos)
+             sem_tags = tmp_session.cl_cpos2str(self.sem_str, match_cpos)
+             struc_entry = tmp_session.cl_cpos2struc(self.entry_str,
+                                                     match_cpos)
+             context_before = tmp_session.cl_cpos2str(
+                 self.word_str, range(before_index, match_start))
+             context_after = tmp_session.cl_cpos2str(
+                 self.word_str, range(match_end + 1, after_index + 1))
+             entry_titles = tmp_session.cl_struc2str(self.entry_title_str,
+                                                     struc_entry)
+             entry_authors = tmp_session.cl_struc2str(self.entry_author_str,
+                                                      struc_entry)
+             return_dict[i] = {'tokens': tokens,
+                               'lemmas': lemmas,
+                               'pos_tags': pos_tags,
+                               'sem_tags': sem_tags,
+                               'context_before': context_before,
+                               'context_after': context_after,
+                               'entry_title': entry_titles[0],
+                               'entry_author': entry_authors[0],
+                               'cpos_start': match_start,
+                               'cpos_end': match_end}
+         finally:
+             tmp_session.disconnect()
+
+     def get_cpos_info(self, cpos):
+         """
+         Get attribute values for a cpos range
+
+         Looks up the values of all non 'entry' attributes for the given
+         corpus positions. Needs create_attribute_strings to be called first.
+
+         Keyword arguments:
+         cpos -- pair of (start cpos, end cpos)
+
+         Prints and returns an ordered dictionary mapping the attribute name
+         to the list of values.
+         """
+         # NOTE(review): the end cpos is excluded here, while __get_matches
+         # includes the match end -- confirm which one is intended.
+         cpos_range = range(cpos[0], cpos[1])
+         match_dict = collections.OrderedDict()
+         for attribute in self.attributes:
+             if '.entry' in attribute:
+                 continue
+             match_dict[attribute] = self.cl_cpos2str(attribute, cpos_range)
+         print(match_dict)
+         return match_dict