Add package implementation of cqi

This commit is contained in:
Patrick Jentsch
2020-03-23 09:10:35 +01:00
parent 7752e7fb57
commit acfcc0321b
13 changed files with 353 additions and 33 deletions

View File

@ -0,0 +1,9 @@
# flake8: noqa
from .api import APIClient
from .client import CQiClient
from .wrapper import CQiWrapper
from .version import version, version_info
# Package metadata: __version__ mirrors the `version` value imported from
# the sibling .version module above.
__title__ = 'CQi'
__version__ = version

View File

@ -0,0 +1,2 @@
# flake8: noqa
from .client import APIClient

View File

@ -0,0 +1,999 @@
from time import sleep
import socket
import struct
# ########################################################################### #
# IMS CQi specification #
# #
# Version: 0.1a ;o) #
# Author: Stefan Evert (evert@ims.uni-stuttgart.de) #
# Modified by (codestyle): Patrick Jentsch (p.jentsch@uni-bielefeld.de) #
# Modified date: Thurs Oct 10 #
# ########################################################################### #
""" 1. padding """
PAD = 0x00
""" 2. CQi responses """
""" 2.1 CQI_STATUS_* """
STATUS = 0x01
STATUS_OK = 0x0101
STATUS_CONNECT_OK = 0x0102
STATUS_BYE_OK = 0x0103
STATUS_PING_OK = 0x0104
""" 2.2 CQI_ERROR_* """
ERROR = 0x02
ERROR_GENERAL_ERROR = 0x0201
ERROR_CONNECT_REFUSED = 0x0202
ERROR_USER_ABORT = 0x0203
ERROR_SYNTAX_ERROR = 0x0204
# includes corpus/attribute/subcorpus specifier syntax
""" 2.3 CQI_DATA_* """
DATA = 0x03
DATA_BYTE = 0x0301
DATA_BOOL = 0x0302
DATA_INT = 0x0303
DATA_STRING = 0x0304
DATA_BYTE_LIST = 0x0305
DATA_BOOL_LIST = 0x0306
DATA_INT_LIST = 0x0307
DATA_STRING_LIST = 0x0308
DATA_INT_INT = 0x0309
DATA_INT_INT_INT_INT = 0x030A
DATA_INT_TABLE = 0x030B
""" 2.4 CQI_CL_ERROR_* """
"""
" NOTE: some CL error codes are not represented in the CQi specs
" - usually because they're not used in the CL any more
" - CDA_ENOSTRING is not considered an error (returns -1)
" - CDA_EARGS: dynamic attribute calls not yet supported
"""
CL_ERROR = 0x04
CL_ERROR_NO_SUCH_ATTRIBUTE = 0x0401
# returned if CQi server couldn't open attribute
CL_ERROR_WRONG_ATTRIBUTE_TYPE = 0x0402
# CDA_EATTTYPE
CL_ERROR_OUT_OF_RANGE = 0x0403
# CDA_EIDORNG, CDA_EIDXORNG, CDA_EPOSORNG
CL_ERROR_REGEX = 0x0404
# CDA_EPATTERN (not used), CDA_EBADREGEX
CL_ERROR_CORPUS_ACCESS = 0x0405
# CDA_ENODATA
CL_ERROR_OUT_OF_MEMORY = 0x0406
# CDA_ENOMEM
# this means the CQi server has run out of memory;
# try discarding some other corpora and/or subcorpora
CL_ERROR_INTERNAL = 0x0407
# CDA_EOTHER, CDA_ENYI
# this is the classical 'please contact technical support' error
""" 2.5 CQI_CQP_ERROR_* """
CQP_ERROR = 0x05
# CQP error messages yet to be defined
CQP_ERROR_GENERAL = 0x0501
CQP_ERROR_NO_SUCH_CORPUS = 0x0502
CQP_ERROR_INVALID_FIELD = 0x0503
CQP_ERROR_OUT_OF_RANGE = 0x0504
# various cases where a number is out of range
""" 3. CQi commands """
""" 3.1 CQI_CTRL_* """
CTRL = 0x11
CTRL_CONNECT = 0x1101
# INPUT: (STRING username, STRING password)
# OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED
CTRL_BYE = 0x1102
# INPUT: ()
# OUTPUT: CQI_STATUS_BYE_OK
CTRL_USER_ABORT = 0x1103
# INPUT: ()
# OUTPUT:
CTRL_PING = 0x1104
# INPUT: ()
# OUTPUT: CQI_STATUS_PING_OK
CTRL_LAST_GENERAL_ERROR = 0x1105
# INPUT: ()
# OUTPUT: CQI_DATA_STRING
# full-text error message for the last general error reported by the CQi server
""" 3.2 CQI_ASK_FEATURE_* """
ASK_FEATURE = 0x12
ASK_FEATURE_CQI_1_0 = 0x1201
# INPUT: ()
# OUTPUT: CQI_DATA_BOOL
ASK_FEATURE_CL_2_3 = 0x1202
# INPUT: ()
# OUTPUT: CQI_DATA_BOOL
ASK_FEATURE_CQP_2_3 = 0x1203
# INPUT: ()
# OUTPUT: CQI_DATA_BOOL
""" 3.3 CQI_CORPUS_* """
CORPUS = 0x13
CORPUS_LIST_CORPORA = 0x1301
# INPUT: ()
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_CHARSET = 0x1303
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING
CORPUS_PROPERTIES = 0x1304
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_POSITIONAL_ATTRIBUTES = 0x1305
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_STRUCTURAL_ATTRIBUTES = 0x1306
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES = 0x1307
# INPUT: (STRING attribute)
# OUTPUT: CQI_DATA_BOOL
CORPUS_ALIGNMENT_ATTRIBUTES = 0x1308
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CORPUS_FULL_NAME = 0x1309
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING
# the full name of <corpus> as specified in its registry entry
CORPUS_INFO = 0x130A
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
# returns the contents of the .info file of <corpus> as a list of lines
CORPUS_DROP_CORPUS = 0x130B
# INPUT: (STRING corpus)
# OUTPUT: CQI_STATUS_OK
# try to unload a corpus and all its attributes from memory
""" 3.4 CQI_CL_* """
CL = 0x14
# low-level corpus access (CL functions)
CL_ATTRIBUTE_SIZE = 0x1401
# INPUT: (STRING attribute)
# OUTPUT: CQI_DATA_INT
# returns the size of <attribute>:
# - number of tokens (positional)
# - number of regions (structural)
# - number of alignments (alignment)
CL_LEXICON_SIZE = 0x1402
# INPUT: (STRING attribute)
# OUTPUT: CQI_DATA_INT
# returns the number of entries in the lexicon of a positional attribute;
# valid lexicon IDs range from 0 .. (lexicon_size - 1)
CL_DROP_ATTRIBUTE = 0x1403
# INPUT: (STRING attribute)
# OUTPUT: CQI_STATUS_OK
# unload attribute from memory
"""
" NOTE: simple (scalar) mappings are applied to lists (the returned list has
" exactly the same length as the list passed as an argument)
"""
CL_STR2ID = 0x1404
# INPUT: (STRING attribute, STRING_LIST strings)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every string in <strings> that is not found in the lexicon
CL_ID2STR = 0x1405
# INPUT: (STRING attribute, INT_LIST id)
# OUTPUT: CQI_DATA_STRING_LIST
# returns "" for every ID in <id> that is out of range
CL_ID2FREQ = 0x1406
# INPUT: (STRING attribute, INT_LIST id)
# OUTPUT: CQI_DATA_INT_LIST
# returns 0 for every ID in <id> that is out of range
CL_CPOS2ID = 0x1407
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every corpus position in <cpos> that is out of range
CL_CPOS2STR = 0x1408
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_STRING_LIST
# returns "" for every corpus position in <cpos> that is out of range
CL_CPOS2STRUC = 0x1409
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every corpus position not inside a structure region
"""
" NOTE: temporary addition for the Euralex2000 tutorial, but should probably be
" included in CQi specs
"""
CL_CPOS2LBOUND = 0x1420
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns left boundary of s-attribute region enclosing cpos, -1 if not in
# region
CL_CPOS2RBOUND = 0x1421
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns right boundary of s-attribute region enclosing cpos, -1 if not in
# region
CL_CPOS2ALG = 0x140A
# INPUT: (STRING attribute, INT_LIST cpos)
# OUTPUT: CQI_DATA_INT_LIST
# returns -1 for every corpus position not inside an alignment
CL_STRUC2STR = 0x140B
# INPUT: (STRING attribute, INT_LIST strucs)
# OUTPUT: CQI_DATA_STRING_LIST
# returns annotated string values of structure regions in <strucs>; "" if out
# of range
# check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES(<attribute>) first
"""
" NOTE: the following mappings take a single argument and return multiple
" values, including lists of arbitrary size
"""
CL_ID2CPOS = 0x140C
# INPUT: (STRING attribute, INT id)
# OUTPUT: CQI_DATA_INT_LIST
# returns all corpus positions where the given token occurs
CL_IDLIST2CPOS = 0x140D
# INPUT: (STRING attribute, INT_LIST id_list)
# OUTPUT: CQI_DATA_INT_LIST
# returns all corpus positions where one of the tokens in <id_list>
# occurs; the returned list is sorted as a whole, not per token id
CL_REGEX2ID = 0x140E
# INPUT: (STRING attribute, STRING regex)
# OUTPUT: CQI_DATA_INT_LIST
# returns lexicon IDs of all tokens that match <regex>; the returned
# list may be empty (size 0);
CL_STRUC2CPOS = 0x140F
# INPUT: (STRING attribute, INT struc)
# OUTPUT: CQI_DATA_INT_INT
# returns start and end corpus positions of structure region <struc>
CL_ALG2CPOS = 0x1410
# INPUT: (STRING attribute, INT alg)
# OUTPUT: CQI_DATA_INT_INT_INT_INT
# returns (src_start, src_end, target_start, target_end)
""" 3.5 CQI_CQP_* """
CQP = 0x15
CQP_QUERY = 0x1501
# INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query)
# OUTPUT: CQI_STATUS_OK
# <query> must include the ';' character terminating the query.
CQP_LIST_SUBCORPORA = 0x1502
# INPUT: (STRING corpus)
# OUTPUT: CQI_DATA_STRING_LIST
CQP_SUBCORPUS_SIZE = 0x1503
# INPUT: (STRING subcorpus)
# OUTPUT: CQI_DATA_INT
CQP_SUBCORPUS_HAS_FIELD = 0x1504
# INPUT: (STRING subcorpus, BYTE field)
# OUTPUT: CQI_DATA_BOOL
CQP_DUMP_SUBCORPUS = 0x1505
# INPUT: (STRING subcorpus, BYTE field, INT first, INT last)
# OUTPUT: CQI_DATA_INT_LIST
# Dump the values of <field> for match ranges <first> .. <last> in <subcorpus>.
# <field> is one of the CQI_CONST_FIELD_* constants.
CQP_DROP_SUBCORPUS = 0x1509
# INPUT: (STRING subcorpus)
# OUTPUT: CQI_STATUS_OK
# delete a subcorpus from memory
"""
" NOTE: The following two functions are temporarily included for the Euralex
" 2000 tutorial demo
"""
""" NOTE: frequency distribution of single tokens """
CQP_FDIST_1 = 0x1510
# INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute)
# OUTPUT: CQI_DATA_INT_LIST
# returns <n> (id, frequency) pairs flattened into a list of size 2*<n>
# field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET,
# CQI_CONST_FIELD_KEYWORD
# NB: pairs are sorted by frequency desc.
""" NOTE: frequency distribution of pairs of tokens """
CQP_FDIST_2 = 0x1511
# INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1,
# BYTE field2, STRING attribute2)
# OUTPUT: CQI_DATA_INT_LIST
# returns <n> (id1, id2, frequency) pairs flattened into a list of size 3*<n>
# NB: triples are sorted by frequency desc.
""" 4. Constant Definitions """
CONST_FALSE = 0x00
CONST_NO = 0x00
CONST_TRUE = 0x01
CONST_YES = 0x01
"""
" NOTE: The following constants specify which field will be returned by
" CQI_CQP_DUMP_SUBCORPUS and some other subcorpus commands.
"""
CONST_FIELD_MATCH = 0x10
CONST_FIELD_MATCHEND = 0x11
"""
" NOTE: The constants specifying target0 .. target9 are guaranteed to have the
" numerical values 0 .. 9, so clients do not need to look up the constant
" values if they're handling arbitrary targets.
"""
CONST_FIELD_TARGET_0 = 0x00
CONST_FIELD_TARGET_1 = 0x01
CONST_FIELD_TARGET_2 = 0x02
CONST_FIELD_TARGET_3 = 0x03
CONST_FIELD_TARGET_4 = 0x04
CONST_FIELD_TARGET_5 = 0x05
CONST_FIELD_TARGET_6 = 0x06
CONST_FIELD_TARGET_7 = 0x07
CONST_FIELD_TARGET_8 = 0x08
CONST_FIELD_TARGET_9 = 0x09
"""
" NOTE: The following constants are provided for backward compatibility with
" traditional CQP field names & while the generalised target concept
" isn't yet implemented in the CQPserver.
"""
CONST_FIELD_TARGET = 0x00
CONST_FIELD_KEYWORD = 0x09
""" NOTE: CQi version is CQI_MAJOR_VERSION.CQI_MINOR_VERSION """
MAJOR_VERSION = 0x00
MINOR_VERSION = 0x01
""" 5. CQi lookup dictionary. """
# Maps every CQi response/command code to its symbolic CQI_* name. Keys are
# written in hex so they can be compared directly with the constants of the
# specification (high byte = group, low byte = member).
lookup = {
    # CQI_STATUS_*
    0x0101: 'CQI_STATUS_OK',
    0x0102: 'CQI_STATUS_CONNECT_OK',
    0x0103: 'CQI_STATUS_BYE_OK',
    0x0104: 'CQI_STATUS_PING_OK',
    # CQI_ERROR_*
    0x0201: 'CQI_ERROR_GENERAL_ERROR',
    0x0202: 'CQI_ERROR_CONNECT_REFUSED',
    0x0203: 'CQI_ERROR_USER_ABORT',
    0x0204: 'CQI_ERROR_SYNTAX_ERROR',
    # CQI_DATA_*
    0x0301: 'CQI_DATA_BYTE',
    0x0302: 'CQI_DATA_BOOL',
    0x0303: 'CQI_DATA_INT',
    0x0304: 'CQI_DATA_STRING',
    0x0305: 'CQI_DATA_BYTE_LIST',
    0x0306: 'CQI_DATA_BOOL_LIST',
    0x0307: 'CQI_DATA_INT_LIST',
    0x0308: 'CQI_DATA_STRING_LIST',
    0x0309: 'CQI_DATA_INT_INT',
    0x030A: 'CQI_DATA_INT_INT_INT_INT',
    0x030B: 'CQI_DATA_INT_TABLE',
    # CQI_CL_ERROR_*
    0x0401: 'CQI_CL_ERROR_NO_SUCH_ATTRIBUTE',
    0x0402: 'CQI_CL_ERROR_WRONG_ATTRIBUTE_TYPE',
    0x0403: 'CQI_CL_ERROR_OUT_OF_RANGE',
    0x0404: 'CQI_CL_ERROR_REGEX',
    0x0405: 'CQI_CL_ERROR_CORPUS_ACCESS',
    0x0406: 'CQI_CL_ERROR_OUT_OF_MEMORY',
    0x0407: 'CQI_CL_ERROR_INTERNAL',
    # CQI_CQP_ERROR_*
    0x0501: 'CQI_CQP_ERROR_GENERAL',
    0x0502: 'CQI_CQP_ERROR_NO_SUCH_CORPUS',
    0x0503: 'CQI_CQP_ERROR_INVALID_FIELD',
    0x0504: 'CQI_CQP_ERROR_OUT_OF_RANGE',
    # CQI_CTRL_*
    0x1101: 'CQI_CTRL_CONNECT',
    0x1102: 'CQI_CTRL_BYE',
    0x1103: 'CQI_CTRL_USER_ABORT',
    0x1104: 'CQI_CTRL_PING',
    0x1105: 'CQI_CTRL_LAST_GENERAL_ERROR',
    # CQI_ASK_FEATURE_*
    0x1201: 'CQI_ASK_FEATURE_CQI_1_0',
    0x1202: 'CQI_ASK_FEATURE_CL_2_3',
    0x1203: 'CQI_ASK_FEATURE_CQP_2_3',
    # CQI_CORPUS_*
    0x1301: 'CQI_CORPUS_LIST_CORPORA',
    0x1303: 'CQI_CORPUS_CHARSET',
    0x1304: 'CQI_CORPUS_PROPERTIES',
    0x1305: 'CQI_CORPUS_POSITIONAL_ATTRIBUTES',
    0x1306: 'CQI_CORPUS_STRUCTURAL_ATTRIBUTES',
    0x1307: 'CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES',
    0x1308: 'CQI_CORPUS_ALIGNMENT_ATTRIBUTES',
    0x1309: 'CQI_CORPUS_FULL_NAME',
    0x130A: 'CQI_CORPUS_INFO',
    0x130B: 'CQI_CORPUS_DROP_CORPUS',
    # CQI_CL_*
    0x1401: 'CQI_CL_ATTRIBUTE_SIZE',
    0x1402: 'CQI_CL_LEXICON_SIZE',
    0x1403: 'CQI_CL_DROP_ATTRIBUTE',
    0x1404: 'CQI_CL_STR2ID',
    0x1405: 'CQI_CL_ID2STR',
    0x1406: 'CQI_CL_ID2FREQ',
    0x1407: 'CQI_CL_CPOS2ID',
    0x1408: 'CQI_CL_CPOS2STR',
    0x1409: 'CQI_CL_CPOS2STRUC',
    0x140A: 'CQI_CL_CPOS2ALG',
    0x140B: 'CQI_CL_STRUC2STR',
    0x140C: 'CQI_CL_ID2CPOS',
    0x140D: 'CQI_CL_IDLIST2CPOS',
    0x140E: 'CQI_CL_REGEX2ID',
    0x140F: 'CQI_CL_STRUC2CPOS',
    0x1410: 'CQI_CL_ALG2CPOS',
    0x1420: 'CQI_CL_CPOS2LBOUND',
    0x1421: 'CQI_CL_CPOS2RBOUND',
    # CQI_CQP_*
    0x1501: 'CQI_CQP_QUERY',
    0x1502: 'CQI_CQP_LIST_SUBCORPORA',
    0x1503: 'CQI_CQP_SUBCORPUS_SIZE',
    0x1504: 'CQI_CQP_SUBCORPUS_HAS_FIELD',
    0x1505: 'CQI_CQP_DUMP_SUBCORPUS',
    0x1509: 'CQI_CQP_DROP_SUBCORPUS',
    0x1510: 'CQI_CQP_FDIST_1',
    0x1511: 'CQI_CQP_FDIST_2',
}
# ########################################################################### #
# IMS CQi client #
# #
# Version: 0.1a #
# Author: Patrick Jentsch (p.jentsch@uni-bielefeld.de) #
# ########################################################################### #
class APIClient:
    """Low-level CQi client speaking the binary protocol over a TCP socket.

    Each public command method sends one CQi command word followed by its
    arguments and (except ``ctrl_user_abort``) returns the decoded server
    response: the status word for CQI_STATUS_* answers or the decoded payload
    for CQI_DATA_* answers. CQI_ERROR_*, CQI_CL_ERROR_* and CQI_CQP_ERROR_*
    answers are raised as ``Exception`` carrying the symbolic error name.
    """

    def __init__(self, host, port=4877):
        """Remember the server address and create the (unconnected) socket."""
        self.host = host
        self.port = port
        self.socket = socket.socket()

    def setup(self):
        """Open the TCP connection to ``(host, port)``."""
        self.socket.connect((self.host, self.port))

    def teardown(self):
        """Close the socket."""
        self.socket.close()

    def ctrl_connect(self, username, password):
        # INPUT: (STRING username, STRING password)
        # OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED
        self.__send_WORD(CTRL_CONNECT)
        self.__send_STRING(username)
        self.__send_STRING(password)
        return self.__recv_response()

    def ctrl_bye(self):
        # INPUT: ()
        # OUTPUT: CQI_STATUS_BYE_OK
        self.__send_WORD(CTRL_BYE)
        return self.__recv_response()

    def ctrl_user_abort(self):
        # INPUT: ()
        # OUTPUT:
        # No response is read for this command.
        self.__send_WORD(CTRL_USER_ABORT)

    def ctrl_ping(self):
        # INPUT: ()
        # OUTPUT: CQI_STATUS_PING_OK
        self.__send_WORD(CTRL_PING)
        return self.__recv_response()

    def ctrl_last_general_error(self):
        # INPUT: ()
        # OUTPUT: CQI_DATA_STRING
        # full-text error message for the last general error reported by the
        # CQi server
        self.__send_WORD(CTRL_LAST_GENERAL_ERROR)
        return self.__recv_response()

    def ask_feature_cqi_1_0(self):
        # INPUT: ()
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(ASK_FEATURE_CQI_1_0)
        return self.__recv_response()

    def ask_feature_cl_2_3(self):
        # INPUT: ()
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(ASK_FEATURE_CL_2_3)
        return self.__recv_response()

    def ask_feature_cqp_2_3(self):
        # INPUT: ()
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(ASK_FEATURE_CQP_2_3)
        return self.__recv_response()

    def corpus_list_corpora(self):
        # INPUT: ()
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CORPUS_LIST_CORPORA)
        return self.__recv_response()

    # Misspelled name kept so existing callers continue to work.
    corpus_list_coprora = corpus_list_corpora

    def corpus_charset(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING
        self.__send_WORD(CORPUS_CHARSET)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_properties(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CORPUS_PROPERTIES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_positional_attributes(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CORPUS_POSITIONAL_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_structural_attributes(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CORPUS_STRUCTURAL_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_structural_attribute_has_values(self, attribute):
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def corpus_alignment_attributes(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CORPUS_ALIGNMENT_ATTRIBUTES)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_full_name(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING
        # the full name of <corpus> as specified in its registry entry
        self.__send_WORD(CORPUS_FULL_NAME)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_info(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        # returns the contents of the .info file of <corpus> as a list of lines
        self.__send_WORD(CORPUS_INFO)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def corpus_drop_corpus(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_STATUS_OK
        # try to unload a corpus and all its attributes from memory
        self.__send_WORD(CORPUS_DROP_CORPUS)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def cl_attribute_size(self, attribute):
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_DATA_INT
        # returns the size of <attribute>:
        #     number of tokens (positional)
        #     number of regions (structural)
        #     number of alignments (alignment)
        self.__send_WORD(CL_ATTRIBUTE_SIZE)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cl_lexicon_size(self, attribute):
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_DATA_INT
        # returns the number of entries in the lexicon of a positional
        # attribute;
        # valid lexicon IDs range from 0 .. (lexicon_size - 1)
        self.__send_WORD(CL_LEXICON_SIZE)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cl_drop_attribute(self, attribute):
        # INPUT: (STRING attribute)
        # OUTPUT: CQI_STATUS_OK
        # unload attribute from memory
        self.__send_WORD(CL_DROP_ATTRIBUTE)
        self.__send_STRING(attribute)
        return self.__recv_response()

    # NOTE: simple (scalar) mappings are applied to lists (the returned list
    #       has exactly the same length as the list passed as an argument)

    def cl_str2id(self, attribute, strings):
        # INPUT: (STRING attribute, STRING_LIST strings)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns -1 for every string in <strings> that is not found in the
        # lexicon
        self.__send_WORD(CL_STR2ID)
        self.__send_STRING(attribute)
        self.__send_STRING_LIST(strings)
        return self.__recv_response()

    def cl_id2str(self, attribute, id):
        # INPUT: (STRING attribute, INT_LIST id)
        # OUTPUT: CQI_DATA_STRING_LIST
        # returns "" for every ID in <id> that is out of range
        self.__send_WORD(CL_ID2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id)
        return self.__recv_response()

    def cl_id2freq(self, attribute, id):
        # INPUT: (STRING attribute, INT_LIST id)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns 0 for every ID in <id> that is out of range
        self.__send_WORD(CL_ID2FREQ)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id)
        return self.__recv_response()

    def cl_cpos2id(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns -1 for every corpus position in <cpos> that is out of range
        self.__send_WORD(CL_CPOS2ID)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2str(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_STRING_LIST
        # returns "" for every corpus position in <cpos> that is out of range
        self.__send_WORD(CL_CPOS2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2struc(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns -1 for every corpus position not inside a structure region
        self.__send_WORD(CL_CPOS2STRUC)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    # NOTE: temporary addition for the Euralex2000 tutorial, but should
    #       probably be included in CQi specs

    def cl_cpos2lbound(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns left boundary of s-attribute region enclosing cpos, -1 if not
        # in region
        self.__send_WORD(CL_CPOS2LBOUND)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2rbound(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns right boundary of s-attribute region enclosing cpos, -1 if
        # not in region
        self.__send_WORD(CL_CPOS2RBOUND)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_cpos2alg(self, attribute, cpos):
        # INPUT: (STRING attribute, INT_LIST cpos)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns -1 for every corpus position not inside an alignment
        self.__send_WORD(CL_CPOS2ALG)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(cpos)
        return self.__recv_response()

    def cl_struc2str(self, attribute, strucs):
        # INPUT: (STRING attribute, INT_LIST strucs)
        # OUTPUT: CQI_DATA_STRING_LIST
        # returns annotated string values of structure regions in <strucs>; ""
        # if out of range
        # check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES(<attribute>) first
        self.__send_WORD(CL_STRUC2STR)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(strucs)
        return self.__recv_response()

    # NOTE: the following mappings take a single argument and return multiple
    #       values, including lists of arbitrary size

    def cl_id2cpos(self, attribute, id):
        # INPUT: (STRING attribute, INT id)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns all corpus positions where the given token occurs
        self.__send_WORD(CL_ID2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(id)
        return self.__recv_response()

    def cl_idlist2cpos(self, attribute, id_list):
        # INPUT: (STRING attribute, INT_LIST id_list)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns all corpus positions where one of the tokens in <id_list>
        # occurs; the returned list is sorted as a whole, not per token id
        self.__send_WORD(CL_IDLIST2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT_LIST(id_list)
        return self.__recv_response()

    def cl_regex2id(self, attribute, regex):
        # INPUT: (STRING attribute, STRING regex)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns lexicon IDs of all tokens that match <regex>; the returned
        # list may be empty (size 0);
        self.__send_WORD(CL_REGEX2ID)
        self.__send_STRING(attribute)
        self.__send_STRING(regex)
        return self.__recv_response()

    def cl_struc2cpos(self, attribute, struc):
        # INPUT: (STRING attribute, INT struc)
        # OUTPUT: CQI_DATA_INT_INT
        # returns start and end corpus positions of structure region <struc>
        self.__send_WORD(CL_STRUC2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(struc)
        return self.__recv_response()

    def cl_alg2cpos(self, attribute, alg):
        # INPUT: (STRING attribute, INT alg)
        # OUTPUT: CQI_DATA_INT_INT_INT_INT
        # returns (src_start, src_end, target_start, target_end)
        self.__send_WORD(CL_ALG2CPOS)
        self.__send_STRING(attribute)
        self.__send_INT(alg)
        return self.__recv_response()

    def cqp_query(self, mother_corpus, subcorpus_name, query):
        # INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query)
        # OUTPUT: CQI_STATUS_OK
        # <query> must include the ';' character terminating the query.
        self.__send_WORD(CQP_QUERY)
        self.__send_STRING(mother_corpus)
        self.__send_STRING(subcorpus_name)
        self.__send_STRING(query)
        return self.__recv_response()

    def cqp_list_subcorpora(self, corpus):
        # INPUT: (STRING corpus)
        # OUTPUT: CQI_DATA_STRING_LIST
        self.__send_WORD(CQP_LIST_SUBCORPORA)
        self.__send_STRING(corpus)
        return self.__recv_response()

    def cqp_subcorpus_size(self, subcorpus):
        # INPUT: (STRING subcorpus)
        # OUTPUT: CQI_DATA_INT
        self.__send_WORD(CQP_SUBCORPUS_SIZE)
        self.__send_STRING(subcorpus)
        return self.__recv_response()

    def cqp_subcorpus_has_field(self, subcorpus, field):
        # INPUT: (STRING subcorpus, BYTE field)
        # OUTPUT: CQI_DATA_BOOL
        self.__send_WORD(CQP_SUBCORPUS_HAS_FIELD)
        self.__send_STRING(subcorpus)
        self.__send_BYTE(field)
        return self.__recv_response()

    def cqp_dump_subcorpus(self, subcorpus, field, first, last):
        # INPUT: (STRING subcorpus, BYTE field, INT first, INT last)
        # OUTPUT: CQI_DATA_INT_LIST
        # Dump the values of <field> for match ranges <first> .. <last>
        # in <subcorpus>. <field> is one of the CQI_CONST_FIELD_* constants.
        self.__send_WORD(CQP_DUMP_SUBCORPUS)
        self.__send_STRING(subcorpus)
        self.__send_BYTE(field)
        self.__send_INT(first)
        self.__send_INT(last)
        return self.__recv_response()

    def cqp_drop_subcorpus(self, subcorpus):
        # INPUT: (STRING subcorpus)
        # OUTPUT: CQI_STATUS_OK
        # delete a subcorpus from memory
        self.__send_WORD(CQP_DROP_SUBCORPUS)
        self.__send_STRING(subcorpus)
        return self.__recv_response()

    # NOTE: The following two functions are temporarily included for the
    #       Euralex 2000 tutorial demo

    def cqp_fdist_1(self, subcorpus, cutoff, field, attribute):
        """ NOTE: frequency distribution of single tokens """
        # INPUT: (STRING subcorpus, INT cutoff, BYTE field, STRING attribute)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns <n> (id, frequency) pairs flattened into a list of size 2*<n>
        # field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET,
        # CQI_CONST_FIELD_KEYWORD
        # NB: pairs are sorted by frequency desc.
        self.__send_WORD(CQP_FDIST_1)
        self.__send_STRING(subcorpus)
        self.__send_INT(cutoff)
        self.__send_BYTE(field)
        self.__send_STRING(attribute)
        return self.__recv_response()

    def cqp_fdist_2(self, subcorpus, cutoff, field1, attribute1, field2,
                    attribute2):
        """ NOTE: frequency distribution of pairs of tokens """
        # INPUT: (STRING subcorpus, INT cutoff, BYTE field1, STRING attribute1,
        #         BYTE field2, STRING attribute2)
        # OUTPUT: CQI_DATA_INT_LIST
        # returns <n> (id1, id2, frequency) pairs flattened into a list of size
        # 3*<n>
        # NB: triples are sorted by frequency desc.
        self.__send_WORD(CQP_FDIST_2)
        self.__send_STRING(subcorpus)
        self.__send_INT(cutoff)
        self.__send_BYTE(field1)
        self.__send_STRING(attribute1)
        self.__send_BYTE(field2)
        self.__send_STRING(attribute2)
        return self.__recv_response()

    def __recv_response(self):
        """Read one response word and decode or raise according to its group.

        Returns the status word for STATUS responses and the decoded payload
        for DATA responses. Raises Exception for the three error groups and
        for an unknown response group.
        """
        response = self.__recv_WORD()
        # The high byte of the response word selects the response group.
        response_type = response >> 8
        if response_type == STATUS:
            return response
        if response_type == DATA:
            return self.__recv_DATA(response)
        if response_type in (ERROR, CL_ERROR, CQP_ERROR):
            # Fall back to the numeric code when the server sends an error
            # code that is not listed in the lookup table.
            raise Exception(
                lookup.get(response,
                           'Unknown error code: {}'.format(hex(response)))
            )
        raise Exception(
            'Unknown response type: {}'.format(hex(response_type))
        )

    def __recv_DATA(self, data_type):
        """Decode the payload that follows a CQI_DATA_* response word."""
        readers = {
            DATA_BYTE: self.__recv_DATA_BYTE,
            DATA_BOOL: self.__recv_DATA_BOOL,
            DATA_INT: self.__recv_DATA_INT,
            DATA_STRING: self.__recv_DATA_STRING,
            DATA_BYTE_LIST: self.__recv_DATA_BYTE_LIST,
            DATA_BOOL_LIST: self.__recv_DATA_BOOL_LIST,
            DATA_INT_LIST: self.__recv_DATA_INT_LIST,
            DATA_STRING_LIST: self.__recv_DATA_STRING_LIST,
            DATA_INT_INT: self.__recv_DATA_INT_INT,
            DATA_INT_INT_INT_INT: self.__recv_DATA_INT_INT_INT_INT,
            DATA_INT_TABLE: self.__recv_DATA_INT_TABLE,
        }
        reader = readers.get(data_type)
        if reader is None:
            raise Exception('Unknown data type: {}'.format(hex(data_type)))
        return reader()

    def __recv_exact(self, n):
        """Read exactly ``n`` bytes from the socket and return them.

        Raises ConnectionError if the peer closes the connection before all
        ``n`` bytes have arrived (recv() returning b'' signals end of stream).
        """
        chunks = []
        remaining = n
        while remaining > 0:
            chunk = self.socket.recv(remaining)
            if not chunk:
                raise ConnectionError(
                    'Connection closed by CQi server ({} of {} bytes '
                    'received)'.format(n - remaining, n)
                )
            chunks.append(chunk)
            remaining -= len(chunk)
        return b''.join(chunks)

    def __recv_DATA_BYTE(self):
        """Read one unsigned byte."""
        return struct.unpack('!B', self.__recv_exact(1))[0]

    def __recv_DATA_BOOL(self):
        """Read one byte and interpret it as a boolean."""
        return struct.unpack('!?', self.__recv_exact(1))[0]

    def __recv_DATA_INT(self):
        """Read one signed 32-bit integer in network byte order."""
        return struct.unpack('!i', self.__recv_exact(4))[0]

    def __recv_DATA_STRING(self):
        """Read a string: a 16-bit length followed by that many bytes."""
        n = self.__recv_WORD()
        return self.__recv_exact(n).decode()

    def __recv_DATA_BYTE_LIST(self):
        """Read an INT count followed by that many bytes."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_BYTE() for _ in range(n)]

    def __recv_DATA_BOOL_LIST(self):
        """Read an INT count followed by that many booleans."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_BOOL() for _ in range(n)]

    def __recv_DATA_INT_LIST(self):
        """Read an INT count followed by that many integers."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_INT() for _ in range(n)]

    def __recv_DATA_STRING_LIST(self):
        """Read an INT count followed by that many strings."""
        n = self.__recv_DATA_INT()
        return [self.__recv_DATA_STRING() for _ in range(n)]

    def __recv_DATA_INT_INT(self):
        """Read two integers and return them as a tuple."""
        first = self.__recv_DATA_INT()
        second = self.__recv_DATA_INT()
        return (first, second)

    def __recv_DATA_INT_INT_INT_INT(self):
        """Read four integers and return them as a tuple."""
        first = self.__recv_DATA_INT()
        second = self.__recv_DATA_INT()
        third = self.__recv_DATA_INT()
        fourth = self.__recv_DATA_INT()
        return (first, second, third, fourth)

    def __recv_DATA_INT_TABLE(self):
        """Read row and column counts, then the integers row by row."""
        rows = self.__recv_DATA_INT()
        columns = self.__recv_DATA_INT()
        return [[self.__recv_DATA_INT() for _ in range(columns)]
                for _ in range(rows)]

    def __recv_WORD(self):
        """Read one unsigned 16-bit word in network byte order."""
        return struct.unpack('!H', self.__recv_exact(2))[0]

    def __send_BYTE(self, byte_data):
        """Send one unsigned byte."""
        self.socket.sendall(struct.pack('!B', byte_data))

    def __send_BOOL(self, bool_data):
        """Send one boolean as a single byte."""
        self.socket.sendall(struct.pack('!?', bool_data))

    def __send_INT(self, int_data):
        """Send one signed 32-bit integer in network byte order."""
        self.socket.sendall(struct.pack('!i', int_data))

    def __send_STRING(self, string_data):
        """Send a string as a 16-bit byte length plus its UTF-8 bytes."""
        encoded_string_data = string_data.encode('utf-8')
        n = len(encoded_string_data)
        data = struct.pack('!H{}s'.format(n), n, encoded_string_data)
        self.socket.sendall(data)

    def __send_INT_LIST(self, int_list_data):
        """Send an INT count followed by the integers, in a single write."""
        n = len(int_list_data)
        # Pack the count and all values together instead of one send per item.
        data = struct.pack('!{}i'.format(n + 1), n, *int_list_data)
        self.socket.sendall(data)

    def __send_STRING_LIST(self, string_list_data):
        """Send an INT count followed by each string."""
        self.__send_INT(len(string_list_data))
        for string_data in string_list_data:
            self.__send_STRING(string_data)

    def __send_WORD(self, word_data):
        """Send one unsigned 16-bit word in network byte order."""
        self.socket.sendall(struct.pack('!H', word_data))

31
app/corpora/cqi/client.py Normal file
View File

@ -0,0 +1,31 @@
from .api import APIClient
from .constants import MAJOR_VERSION, MINOR_VERSION
from .models.corpora import CorpusCollection
class CQiClient(APIClient):
    """CQi client adding connect/disconnect handling on top of APIClient.

    After a successful ``connect`` the instance exposes ``corpora`` (a
    CorpusCollection bound to this client) and ``info`` (a dict holding the
    CQi version string built from MAJOR_VERSION and MINOR_VERSION).
    """

    def __init__(self, host, port=4877):
        super(CQiClient, self).__init__(host, port=port)

    def connect(self, username='anonymous', password=''):
        """Open the socket, perform the CQi login and load client state.

        If opening the connection or the login fails, the socket is closed
        before the exception is re-raised so it is not leaked.
        """
        try:
            super(CQiClient, self).setup()
            self.ctrl_connect(username, password)
        except Exception:
            super(CQiClient, self).teardown()
            raise
        self.__load()

    def disconnect(self):
        """Send CTRL_BYE and close the socket.

        The socket is closed even when the goodbye exchange raises.
        """
        try:
            self.ctrl_bye()
        finally:
            super(CQiClient, self).teardown()

    def __load(self):
        """Create the attributes that are available once connected."""
        self.corpora = CorpusCollection(self)
        self.info = {'version': '{}.{}'.format(MAJOR_VERSION, MINOR_VERSION)}

    def features(self):
        """Return the names of the features the server reports as supported.

        The server is asked about 'cqi_1_0', 'cl_2_3' and 'cqp_2_3', in that
        order.
        """
        probes = (
            ('cqi_1_0', self.ask_feature_cqi_1_0),
            ('cl_2_3', self.ask_feature_cl_2_3),
            ('cqp_2_3', self.ask_feature_cqp_2_3),
        )
        features = []
        for feature_name, ask in probes:
            if ask():
                features.append(feature_name)
        return features

View File

@ -0,0 +1,36 @@
""" 4. Constant Definitions """
CONST_FALSE = 0x00
CONST_NO = 0x00
CONST_TRUE = 0x01
CONST_YES = 0x01
"""
" NOTE: The following constants specify which field will be returned by
" CQI_CQP_DUMP_SUBCORPUS and some other subcorpus commands.
"""
CONST_FIELD_MATCH = 0x10
CONST_FIELD_MATCHEND = 0x11
"""
" NOTE: The constants specifying target0 .. target9 are guaranteed to have the
" numerical values 0 .. 9, so clients do not need to look up the constant
" values if they're handling arbitrary targets.
"""
CONST_FIELD_TARGET_0 = 0x00
CONST_FIELD_TARGET_1 = 0x01
CONST_FIELD_TARGET_2 = 0x02
CONST_FIELD_TARGET_3 = 0x03
CONST_FIELD_TARGET_4 = 0x04
CONST_FIELD_TARGET_5 = 0x05
CONST_FIELD_TARGET_6 = 0x06
CONST_FIELD_TARGET_7 = 0x07
CONST_FIELD_TARGET_8 = 0x08
CONST_FIELD_TARGET_9 = 0x09
"""
" NOTE: The following constants are provided for backward compatibility with
" traditional CQP field names & while the generalised target concept
" isn't yet implemented in the CQPserver.
"""
CONST_FIELD_TARGET = 0x00
CONST_FIELD_KEYWORD = 0x09
""" NOTE: CQi version is CQI_MAJOR_VERSION.CQI_MINOR_VERSION """
MAJOR_VERSION = 0x00
MINOR_VERSION = 0x01

View File

View File

@ -0,0 +1,102 @@
from .subcorpora import SubcorpusCollection
class CorpusCollection:
    """Factory for ``Corpus`` objects bound to one client."""

    def __init__(self, client):
        self.client = client

    def get(self, name):
        """Return a ``Corpus`` for the corpus called ``name``."""
        return Corpus(self.client, name)

    def list(self):
        """Return one ``Corpus`` per name reported by the client."""
        corpora = []
        for corpus_name in self.client.corpus_list_coprora():
            corpora.append(Corpus(self.client, corpus_name))
        return corpora
class Corpus:
    """Handle for a single corpus reachable through a CQi client.

    On construction the corpus metadata is fetched through ``client`` and
    stored on the instance: ``size`` (size of the ``word`` attribute),
    ``charset``, ``properties``, ``alignment_attributes``,
    ``structural_attributes``, ``positional_attributes`` and ``subcorpora``
    (a ``SubcorpusCollection`` for this corpus).

    Every lookup method takes a bare attribute name, qualifies it as
    ``<corpus name>.<attribute>`` and forwards the call to the client method
    of the same name (prefixed with ``cl_`` or ``corpus_``).
    """

    def __init__(self, client, name):
        self.client = client
        self.name = name
        self.__load()

    def __load(self):
        """Fetch the corpus metadata from the server."""
        self.size = self.client.cl_attribute_size(self._qualified('word'))
        # self.info = client.corpus_info(self.name)
        self.charset = self.client.corpus_charset(self.name)
        # self.full_name = self.client.corpus_full_name(self.name)
        self.properties = self.client.corpus_properties(self.name)
        self.alignment_attributes = \
            self.client.corpus_alignment_attributes(self.name)
        self.structural_attributes = \
            self.client.corpus_structural_attributes(self.name)
        self.positional_attributes = \
            self.client.corpus_positional_attributes(self.name)
        self.subcorpora = SubcorpusCollection(self.client, self)

    def _qualified(self, attribute):
        """Return ``attribute`` prefixed with the corpus name and a dot."""
        return '{}.{}'.format(self.name, attribute)

    def alg2cpos(self, attribute, alg):
        """Forward to ``client.cl_alg2cpos`` for the qualified attribute."""
        return self.client.cl_alg2cpos(self._qualified(attribute), alg)

    def cpos2alg(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2alg`` for the qualified attribute."""
        return self.client.cl_cpos2alg(self._qualified(attribute), cpos_list)

    def cpos2id(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2id`` for the qualified attribute."""
        return self.client.cl_cpos2id(self._qualified(attribute), cpos_list)

    def cpos2lbound(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2lbound`` for the qualified attribute."""
        return self.client.cl_cpos2lbound(self._qualified(attribute),
                                          cpos_list)

    def cpos2rbound(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2rbound`` for the qualified attribute."""
        return self.client.cl_cpos2rbound(self._qualified(attribute),
                                          cpos_list)

    def cpos2str(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2str`` for the qualified attribute."""
        return self.client.cl_cpos2str(self._qualified(attribute), cpos_list)

    def cpos2struc(self, attribute, cpos_list):
        """Forward to ``client.cl_cpos2struc`` for the qualified attribute."""
        return self.client.cl_cpos2struc(self._qualified(attribute),
                                         cpos_list)

    def id2cpos(self, attribute, id):
        """Forward to ``client.cl_id2cpos`` for the qualified attribute."""
        return self.client.cl_id2cpos(self._qualified(attribute), id)

    def idlist2cpos(self, attribute, ids):
        """Forward to ``client.cl_idlist2cpos`` for the qualified attribute."""
        return self.client.cl_idlist2cpos(self._qualified(attribute), ids)

    def id2freq(self, attribute, ids):
        """Forward to ``client.cl_id2freq`` for the qualified attribute."""
        return self.client.cl_id2freq(self._qualified(attribute), ids)

    def id2str(self, attribute, ids):
        """Forward to ``client.cl_id2str`` for the qualified attribute."""
        return self.client.cl_id2str(self._qualified(attribute), ids)

    def query(self, query, subcorpus_name='Results'):
        """Run ``query`` on this corpus and return the resulting subcorpus.

        The result is stored on the server under ``subcorpus_name`` and the
        subcorpus of that same name is returned.
        """
        self.client.cqp_query(self.name, subcorpus_name, query)
        # Return the subcorpus that was just created; the previous code always
        # looked up 'Results', which is wrong for any non-default name.
        return self.subcorpora.get(subcorpus_name)

    def regex2id(self, attribute, regex):
        """Forward to ``client.cl_regex2id`` for the qualified attribute."""
        return self.client.cl_regex2id(self._qualified(attribute), regex)

    def structural_attribute_has_values(self, attribute):
        """Forward to ``client.corpus_structural_attribute_has_values``."""
        return self.client.corpus_structural_attribute_has_values(
            self._qualified(attribute)
        )

    def str2id(self, attribute, strings):
        """Forward to ``client.cl_str2id`` for the qualified attribute."""
        return self.client.cl_str2id(self._qualified(attribute), strings)

    def struc2cpos(self, attribute, struc):
        """Forward to ``client.cl_struc2cpos`` for the qualified attribute."""
        return self.client.cl_struc2cpos(self._qualified(attribute), struc)

    def struc2str(self, attribute, strucs):
        """Forward to ``client.cl_struc2str`` for the qualified attribute."""
        return self.client.cl_struc2str(self._qualified(attribute), strucs)

View File

@ -0,0 +1,112 @@
from ..constants import (CONST_FIELD_KEYWORD, CONST_FIELD_MATCH,
CONST_FIELD_MATCHEND, CONST_FIELD_TARGET)
class SubcorpusCollection:
    """Factory for ``Subcorpus`` objects belonging to one parent corpus."""

    def __init__(self, client, parent_corpus):
        self.client = client
        self.parent_corpus = parent_corpus

    def get(self, name):
        """Return the ``Subcorpus`` called ``name`` of the parent corpus."""
        return Subcorpus(self.client, self.parent_corpus, name)

    def list(self):
        """Return one ``Subcorpus`` per name the client lists for the parent."""
        names = self.client.cqp_list_subcorpora(self.parent_corpus.name)
        subcorpora = []
        for subcorpus_name in names:
            subcorpora.append(
                Subcorpus(self.client, self.parent_corpus, subcorpus_name)
            )
        return subcorpora
class Subcorpus:
    """Handle for a (query result) subcorpus of a parent corpus.

    On construction the instance asks the server which of the fields
    ``match``, ``matchend``, ``target`` and ``keyword`` the subcorpus has and
    stores the available ones in ``fields`` (name -> field constant). The
    number of matches is stored in ``size``.
    """

    def __init__(self, client, parent_corpus, name):
        self.client = client
        self.parent_corpus = parent_corpus
        self.name = name
        # Fully qualified "<corpus>:<subcorpus>" specifier used for all calls.
        self.__name = '{}:{}'.format(parent_corpus.name, name)
        self.__load()

    def __load(self):
        """Fetch the available fields and the size from the server."""
        candidates = (('match', CONST_FIELD_MATCH),
                      ('matchend', CONST_FIELD_MATCHEND),
                      ('target', CONST_FIELD_TARGET),
                      ('keyword', CONST_FIELD_KEYWORD))
        self.fields = {}
        for label, field in candidates:
            if self.client.cqp_subcorpus_has_field(self.__name, field):
                self.fields[label] = field
        self.size = self.client.cqp_subcorpus_size(self.__name)

    def drop(self):
        """Forward to ``client.cqp_drop_subcorpus`` for this subcorpus."""
        return self.client.cqp_drop_subcorpus(self.__name)

    def dump(self, field, first, last):
        """Forward to ``client.cqp_dump_subcorpus`` for this subcorpus."""
        return self.client.cqp_dump_subcorpus(self.__name, field, first, last)

    def dump_values(self, context=25, first_result=0,
                    num_results=float('inf')):
        """Return matches with context plus lookup tables for their tokens.

        Arguments:
        context -- number of corpus positions kept left and right of a match
        first_result -- index of the first match to return (clamped to >= 0)
        num_results -- maximum number of matches to return

        Returns a dict with the key ``'matches'`` (list of dicts with the
        corpus position lists ``'left'``, ``'hit'`` and ``'right'``). If there
        is at least one match it also contains ``'cpos_lookup'`` (cpos ->
        attribute values / structure ids) and one ``'<attr>_lookup'`` per
        value-less structural attribute that has sub-attributes named
        ``<attr>_<subattr>`` (structure id -> sub-attribute values).

        Raises KeyError if the subcorpus has no ``match``/``matchend`` field.
        """
        first_result = max(0, first_result)
        # Match indices are inclusive on both ends, hence the "- 1": asking
        # for n results starting at i means the index range i .. i + n - 1.
        last_result = min(first_result + num_results - 1, self.size - 1)
        if last_result < first_result:
            # Empty subcorpus, page behind the last match or num_results <= 0:
            # there is nothing to request from the server.
            return {'matches': []}
        # last_result is finite here; normalise floats such as 9.0.
        last_result = int(last_result)

        match_starts = self.dump(self.fields['match'], first_result,
                                 last_result)
        match_ends = self.dump(self.fields['matchend'], first_result,
                               last_result)
        matches = []
        for match_start, match_end in zip(match_starts, match_ends):
            left_start = max(0, match_start - context)
            right_end = min(self.parent_corpus.size, match_end + 1 + context)
            matches.append({'left': list(range(left_start, match_start)),
                            'hit': list(range(match_start, match_end + 1)),
                            'right': list(range(match_end + 1, right_end))})
        if not matches:
            return {'matches': matches}

        # Every corpus position is looked up only once, even if the context
        # windows of several matches overlap.
        cpos_list = sorted({cpos
                            for match in matches
                            for part in ('left', 'hit', 'right')
                            for cpos in match[part]})
        cpos_lookup = {cpos: {} for cpos in cpos_list}
        lookups = {'cpos_lookup': cpos_lookup}

        for attr in self.parent_corpus.positional_attributes:
            values = self.parent_corpus.cpos2str(attr, cpos_list)
            for cpos, value in zip(cpos_list, values):
                cpos_lookup[cpos][attr] = value

        structural_attributes = self.parent_corpus.structural_attributes
        for attr in structural_attributes:
            # Attributes with values are handled as sub-attributes of their
            # value-less parent below.
            if self.parent_corpus.structural_attribute_has_values(attr):
                continue
            struc_ids = self.parent_corpus.cpos2struc(attr, cpos_list)
            for cpos, struc_id in zip(cpos_list, struc_ids):
                # -1 entries are skipped, i.e. no structure id is recorded
                # for that corpus position.
                if struc_id != -1:
                    cpos_lookup[cpos][attr] = struc_id
            occurring_ids = sorted(set(struc_ids) - {-1})
            if not occurring_ids:
                continue
            # Strip the exact "<attr>_" prefix; splitting at the first
            # underscore would break for parents that contain one themselves.
            prefix = attr + '_'
            subattrs = [name[len(prefix):] for name in structural_attributes
                        if name.startswith(prefix)]
            if not subattrs:
                continue
            attr_lookup = {struc_id: {} for struc_id in occurring_ids}
            lookups[attr + '_lookup'] = attr_lookup
            for subattr in subattrs:
                values = self.parent_corpus.struc2str(prefix + subattr,
                                                      occurring_ids)
                for struc_id, value in zip(occurring_ids, values):
                    attr_lookup[struc_id][subattr] = value
        return {'matches': matches, **lookups}

    def fdist_1(self, cutoff, field, attribute):
        """Forward to ``client.cqp_fdist_1`` for this subcorpus."""
        return self.client.cqp_fdist_1(self.__name, cutoff, field, attribute)

    def fdist_2(self, cutoff, field_1, attribute_1, field_2, attribute_2):
        """Forward to ``client.cqp_fdist_2`` for this subcorpus."""
        return self.client.cqp_fdist_2(self.__name, cutoff, field_1,
                                       attribute_1, field_2, attribute_2)

View File

@ -0,0 +1,5 @@
from .constants import MAJOR_VERSION, MINOR_VERSION
# The package version is the CQi version MAJOR_VERSION.MINOR_VERSION, exposed
# both as a tuple and as a dotted string.
version_info = (MAJOR_VERSION, MINOR_VERSION)
version = '.'.join(str(part) for part in version_info)

323
app/corpora/cqi/wrapper.py Normal file
View File

@ -0,0 +1,323 @@
from .api import APIClient
from .constants import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
import time
class CQiWrapper(APIClient):
    '''
    CQiWrapper object

    High level wrapper that groups and renames some functions of the CQi API
    client for ease of use. Also structures received data into python
    dictionaries.

    Keyword arguments:
    host -- host IP address or hostname where the cqp server is running
    port -- port of the cqp server
    username -- username used to connect to the cqp server
    password -- password of the user to connect to the cqp server
    '''

    # Kept so that code reading the attribute on the class keeps working.
    # Every instance gets its own list in __init__; appending through self
    # therefore no longer leaks subcorpus names into other instances.
    SUBCORPUS_NAMES = []

    def __init__(self, host='127.0.0.1', port=4877, username='anonymous',
                 password=''):
        super(CQiWrapper, self).__init__(host, port=port)
        self.username = username
        self.password = password
        self.SUBCORPUS_NAMES = []

    def connect(self):
        '''
        Connect with CQP server

        Connects via socket to the CQP server using the given username and
        password from class initiation.
        '''
        super(CQiWrapper, self).setup()
        self.ctrl_connect(self.username, self.password)

    def __create_attribute_strings(self):
        '''
        Creates all needed attribute strings to query for word, lemma etc. in
        the given corpus.
        For example: CORPUS_NAME.word to query words
        Automatically creates strings for all positional and structural
        attributes the server reports for the selected corpus.
        '''
        corpus = self.corpus_name
        p_attrs = self.corpus_positional_attributes(corpus)
        struct_attrs = self.corpus_structural_attributes(corpus)
        self.attr_strings = {
            'positional_attrs': {attr: corpus + '.' + attr
                                 for attr in p_attrs},
            'struct_attrs': {attr: corpus + '.' + attr
                             for attr in struct_attrs},
        }
        print(('All positional and '
               'structural attributes: {}').format(self.attr_strings))

    def select_corpus(self, corpus_name):
        '''
        Checks if given corpus name exists. If it exists set it as the main
        corpus name used to create the needed query attribute strings like
        CORPUS_NAME.word.

        Raises Exception if the corpus is not in the server's corpus list.
        '''
        if corpus_name not in self.corpus_list_coprora():
            print('{} does not exist.'.format(corpus_name))
            raise Exception('Given Corpus Name is not in corpora list.')
        self.corpus_name = corpus_name
        self.__create_attribute_strings()
        print('{} does exist.'.format(corpus_name))

    def disconnect(self):
        '''
        Disconnect from CQP server

        Disconnects from the CQP server. Closes used socket after disconnect.
        '''
        self.ctrl_bye()
        super(CQiWrapper, self).teardown()
        print('Disconnected from cqp server.')

    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
        '''
        Create subcorpus

        Input query will be used to create a subcorpus holding all cpos match
        positions for that query. The qualified subcorpus name is remembered
        in self.result_subcorpus and appended to self.SUBCORPUS_NAMES; the
        number of matches is stored in self.match_count.

        Keyword arguments:
        result_subcorpus_name -- set name of the subcorpus which holds all
        cpos match positions, produced by the query
        query -- query written in cqp query language
        '''
        self.query = query
        self.cqp_query(self.corpus_name, result_subcorpus_name, query)
        self.result_subcorpus = (self.corpus_name
                                 + ':'
                                 + result_subcorpus_name)
        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
        self.match_count = self.cqp_subcorpus_size(self.result_subcorpus)
        print('Nr of all matches is: {}'.format(self.match_count))

    def show_subcorpora(self):
        '''
        Show all subcorpora currently saved by the cqp server.
        '''
        return self.cqp_list_subcorpora(self.corpus_name)

    def _build_results(self, matches, cpos_lookup, text_lookup):
        ''' Assemble the result dictionary returned by show_query_results. '''
        return {'code': 0,
                'result': {'matches': matches,
                           'match_count': self.match_count,
                           'cpos_lookup': cpos_lookup,
                           'text_lookup': text_lookup}
                }

    def show_query_results(self,
                           context_len=10,
                           result_len=1000,
                           result_offset=0):
        '''
        Show query results

        Shows the actual matched strings produced by the query. Uses the cpos
        match indexes to grab those strings and saves them into a dictionary.
        Also saves corresponding tags, lemmas and context. Gets this
        information using the corresponding cpos.

        Keyword arguments:
        context_len -- defines how many words before and after a match will be
        shown (default 10)
        result_len -- defines for how many matches all information like lemma
        and POS is being grabbed
        result_offset -- defines the offset of the matches being requested. If
        the offset is 100, information for the matches 100 to
        100 + result_len - 1 is being grabbed

        Returns the dictionary that is also stored in self.results:
        {'code': 0, 'result': {'matches': [...], 'match_count': int,
        'cpos_lookup': {...}, 'text_lookup': {...}}}. Each match is a dict
        with the cpos lists 'lc', 'hit' and 'rc'.
        '''
        t0 = time.time()
        self.context_len = context_len
        self.corpus_max_len = self.cl_attribute_size(
            self.attr_strings['positional_attrs']['word']
        )
        self.nr_matches = min(result_len, self.match_count)
        if self.match_count == 0:
            print('Query resulted in 0 matches.')
            self.results = self._build_results([], {}, {})
            return self.results

        # Get match cpos boundaries
        # match_boundaries shows the start and end cpos of one match as a
        # pair of cpositions
        # [(1355, 1357), (1477, 1479)] Example for two boundary pairs
        offset_start = result_offset
        print('Offset start is: {}'.format(offset_start))
        offset_end = min(self.nr_matches + result_offset - 1,
                         self.match_count - 1)
        print('Offset end is: {}'.format(offset_end))
        if offset_end < offset_start:
            # The requested page lies behind the last match (or result_len is
            # not positive): answer with an empty page instead of sending an
            # inverted range to the server.
            self.results = self._build_results([], {}, {})
            return self.results
        match_starts = self.cqp_dump_subcorpus(self.result_subcorpus,
                                               CONST_FIELD_MATCH,
                                               offset_start,
                                               offset_end)
        match_ends = self.cqp_dump_subcorpus(self.result_subcorpus,
                                             CONST_FIELD_MATCHEND,
                                             offset_start,
                                             offset_end)

        # Generate all cpos between match boundaries including start and end
        # boundaries.
        # Also generate cpos for left and right context.
        # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
        # Also collect all cpos together for the final request of all cpos
        # information
        all_matches = []
        unique_cpos = set()
        for start, end in zip(match_starts, match_ends):
            end += 1  # make the end exclusive for range()
            lc_cpos = list(range(max(0, start - self.context_len), start))
            match_cpos = list(range(start, end))
            rc_cpos = list(range(end, min(self.corpus_max_len,
                                          end + self.context_len)))
            all_matches.append({'lc': lc_cpos,
                                'hit': match_cpos,
                                'rc': rc_cpos})
            unique_cpos.update(lc_cpos, match_cpos, rc_cpos)
        all_cpos = sorted(unique_cpos)  # every cpos is requested only once
        len_all_cpos = len(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
        print('Time to create all CPOS for query: {}'.format(t_total))
        print('Requesting {} CPOS with one query.'.format(len_all_cpos))

        # Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
        # all cpos entries in all_cpos
        # Also saves this information into self.results dict
        t2 = time.time()
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t3 = time.time()
        t_final = t3 - t2
        print('Got infos for {} CPOS in {} seconds:'.format(len_all_cpos,
                                                            t_final))
        self.results = self._build_results(all_matches, all_cpos_infos,
                                           text_lookup)
        return self.results

    def get_cpos_infos(self, all_cpos):
        '''
        Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
        all cpos entries specified in the parameter all_cpos.

        Returns a tuple (cpos_lookup, text_lookup):
        cpos_lookup -- dict cpos -> {attribute name: value}, containing all
        positional attributes and the ids of all structural attributes
        without values
        text_lookup -- dict structure id -> {sub-attribute name: value} for
        all structural attributes with values
        '''
        # Get all positional attribute information
        cpos_infos = {}
        for p_attr_key, p_attr in self.attr_strings['positional_attrs'].items():
            cpos_infos[p_attr_key] = self.cl_cpos2str(p_attr, all_cpos)

        # Get all structural attribute information
        tmp_info = {}
        structs_to_check = []
        for struct_attr_key, key in self.attr_strings['struct_attrs'].items():
            has_value = self.corpus_structural_attribute_has_values(key)
            # NOTE(review): identity check kept from the original code;
            # confirm that the API client returns a real bool here.
            if has_value is False:
                # Structural elements without values are plain XML tags: keep
                # the id of the element every cpos lies in. Elements with
                # values are XML attributes and resolved to strings below, so
                # no ids are requested for them.
                tmp_info[struct_attr_key] = list(
                    self.cl_cpos2struc(key, all_cpos)
                )
            else:
                structs_to_check.append({key: struct_attr_key})
        print('Structs to check: {}'.format(structs_to_check))
        struct_attr_values = list(tmp_info.values())
        struct_attr_keys = list(tmp_info.keys())

        # Build text lookup dictionary
        # Every CPOS is associated with one text id. A set is built to only
        # gather text lookup information once for every unique text id.
        # NOTE(review): this presumes that the first value-less structural
        # attribute is the text element and that no id is -1 - confirm
        # against the corpora in use.
        if struct_attr_values:
            text_lookup_ids = list(set(struct_attr_values[0]))
        else:
            # Corpus without value-less structural attributes: there are no
            # ids to look up (previously an IndexError).
            text_lookup_ids = []
        text_lookup = {}  # all info of one text identified by its id
        for d in structs_to_check:
            s_key, s_value = zip(*d.items())
            print('dict entries: {}: {}'.format(s_key, s_value))
            # 'text_author' -> 'author'
            s_value = s_value[0].split('_', 1)[-1]
            print('S_VALUE: {}'.format(s_value))
            struct_values = self.cl_struc2str(s_key[0], text_lookup_ids)
            print('Extracted Value with key {}: {}'.format(s_key[0],
                                                           struct_values))
            zipped = dict(zip(text_lookup_ids, struct_values))
            for zip_key, zip_value in zipped.items():
                print('Text id as key is: {}'.format(zip_key))
                print('Value of this text is: {}'.format(zip_value))
                check = text_lookup.get(zip_key)
                print('check: {}'.format(check))
                if check is None:
                    text_lookup[zip_key] = {s_value: zip_value}
                else:
                    text_lookup[zip_key].update({s_value: zip_value})

        # zip keys and values together: one column per attribute, one row per
        # cpos
        attr_keys_list = list(cpos_infos) + struct_attr_keys
        attr_values_list = list(cpos_infos.values()) + struct_attr_values
        dict_cpos_infos = {}
        for cpos, *values in zip(all_cpos, *attr_values_list):
            dict_cpos_infos[cpos] = dict(zip(attr_keys_list, values))
        return dict_cpos_infos, text_lookup

    def get_sentences(self,
                      match_cpos_list,
                      get_surrounding_s=False,
                      l_r_s_context_additional_len=1):
        '''
        Get sentence information for one match. Also set if and how many left
        and right context sentences should be grabbed surrounding the given
        CPOS.

        Keyword arguments:
        match_cpos_list -- cpos list of one match; only the first and the last
        entry are used to find the sentences of the match
        get_surrounding_s -- also grab the sentences around the match
        l_r_s_context_additional_len -- number of sentences grabbed before the
        first and after the last sentence of the match

        Returns a dict with the keys 'context_s_cpos' (sentence id -> cpos
        list), 'cpos_lookup', 'text_lookup' and 'match_cpos_list'.

        Raises IndexError if match_cpos_list is empty.
        '''
        t0 = time.time()
        key = self.corpus_name + '.s'
        first_cpos, last_cpos = match_cpos_list[0], match_cpos_list[-1]
        context_sentences = {}
        s_ids = self.cl_cpos2struc(key, [first_cpos, last_cpos])
        print('s id match: {}'.format(s_ids))
        for s_id in s_ids:
            # First and last cpos usually share a sentence: ask only once.
            if s_id not in context_sentences:
                s_start, s_end = self.cl_struc2cpos(key, s_id)
                context_sentences[s_id] = list(range(s_start, s_end + 1))
        if get_surrounding_s:
            max_s_id = self.cl_attribute_size(key) - 1
            print('max sid: {}'.format(max_s_id))
            # Clamp the window to the valid sentence ids 0 .. max_s_id.
            first_s_id = max(s_ids[0] - l_r_s_context_additional_len, 0)
            last_s_id = min(s_ids[-1] + l_r_s_context_additional_len,
                            max_s_id)
            for s_id in range(first_s_id, last_s_id + 1):
                print('s id additional: {}'.format(s_id))
                if s_id not in context_sentences:
                    s_start, s_end = self.cl_struc2cpos(key, s_id)
                    context_sentences[s_id] = list(range(s_start, s_end + 1))
        all_cpos = []
        for s_cpos in context_sentences.values():
            all_cpos.extend(s_cpos)
        all_cpos = list(set(all_cpos))
        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
        t1 = time.time()
        t_total = t1 - t0
        print('Got all sentences informations in {} seconds'. format(t_total))
        match_context = {'context_s_cpos': context_sentences,
                         'cpos_lookup': all_cpos_infos,
                         'text_lookup': text_lookup,
                         'match_cpos_list': match_cpos_list}
        return match_context