diff --git a/app/corpora/CQiWrapper/CQi.py b/app/corpora/CQiWrapper/CQi.py index 306e6732..7fcc8c4f 100644 --- a/app/corpora/CQiWrapper/CQi.py +++ b/app/corpora/CQiWrapper/CQi.py @@ -6,10 +6,9 @@ # Modified by: Patrick Jentsch # # ########################################################################### # -from app import logger +from time import sleep import socket import struct -import time """ 1. padding """ @@ -420,7 +419,6 @@ class Client: def ctrl_connect(self, username, password): # INPUT: (STRING username, STRING password) # OUTPUT: CQI_STATUS_CONNECT_OK, CQI_ERROR_CONNECT_REFUSED - # print('CTRL_CONNECT') self.__send_WORD(CTRL_CONNECT) self.__send_STRING(username) self.__send_STRING(password) @@ -429,20 +427,17 @@ class Client: def ctrl_bye(self): # INPUT: () # OUTPUT: CQI_STATUS_BYE_OK - # print('CTRL_BYE') self.__send_WORD(CTRL_BYE) return self.__recv_response() def ctrl_user_abort(self): # INPUT: () # OUTPUT: - # print('CTRL_USER_ABORT') self.__send_WORD(CTRL_USER_ABORT) def ctrl_ping(self): # INPUT: () # OUTPUT: CQI_STATUS_PING_OK - # print('CTRL_PING') self.__send_WORD(CTRL_PING) return self.__recv_response() @@ -451,42 +446,36 @@ class Client: # OUTPUT: CQI_DATA_STRING # full-text error message for the last general error reported by the # CQi server - # print('CTRL_LAST_GENERAL_ERROR') self.__send_WORD(CTRL_LAST_GENERAL_ERROR) return self.__recv_response() def ask_feature_cqi_1_0(self): # INPUT: () # OUTPUT: CQI_DATA_BOOL - # print('ASK_FEATURE_CQI_1_0') self.__send_WORD(ASK_FEATURE_CQI_1_0) return self.__recv_response() def ask_feature_cl_2_3(self): # INPUT: () # OUTPUT: CQI_DATA_BOOL - # print('ASK_FEATURE_CL_2_3') self.__send_WORD(ASK_FEATURE_CL_2_3) return self.__recv_response() def ask_feature_cqp_2_3(self): # INPUT: () # OUTPUT: CQI_DATA_BOOL - # print('ASK_FEATURE_CL_2_3') self.__send_WORD(ASK_FEATURE_CL_2_3) return self.__recv_response() def corpus_list_coprora(self): # INPUT: () # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_LIST_CORPORA') self.__send_WORD(CORPUS_LIST_CORPORA) return self.__recv_response() def corpus_charset(self, corpus): # INPUT: (STRING corpus) # OUTPUT: CQI_DATA_STRING - # print('CORPUS_CHARSET') self.__send_WORD(CORPUS_CHARSET) self.__send_STRING(corpus) return self.__recv_response() @@ -494,7 +483,6 @@ class Client: def corpus_properties(self, corpus): # INPUT: (STRING corpus) # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_PROPERTIES') self.__send_WORD(CORPUS_PROPERTIES) self.__send_STRING(corpus) return self.__recv_response() @@ -502,7 +490,6 @@ class Client: def corpus_positional_attributes(self, corpus): # INPUT: (STRING corpus) # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_POSITIONAL_ATTRIBUTES') self.__send_WORD(CORPUS_POSITIONAL_ATTRIBUTES) self.__send_STRING(corpus) return self.__recv_response() @@ -510,7 +497,6 @@ class Client: def corpus_structural_attributes(self, corpus): # INPUT: (STRING corpus) # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_STRUCTURAL_ATTRIBUTES') self.__send_WORD(CORPUS_STRUCTURAL_ATTRIBUTES) self.__send_STRING(corpus) return self.__recv_response() @@ -518,7 +504,6 @@ class Client: def corpus_structural_attribute_has_values(self, attribute): # INPUT: (STRING attribute) # OUTPUT: CQI_DATA_BOOL - # print('CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES') self.__send_WORD(CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES) self.__send_STRING(attribute) return self.__recv_response() @@ -526,7 +511,6 @@ class Client: def corpus_alignment_attributes(self, corpus): # INPUT: (STRING corpus) # OUTPUT: CQI_DATA_STRING_LIST - # print('CORPUS_ALIGNMENT_ATTRIBUTES') self.__send_WORD(CORPUS_ALIGNMENT_ATTRIBUTES) self.__send_STRING(corpus) return self.__recv_response() @@ -535,7 +519,6 @@ class Client: # INPUT: (STRING corpus) # OUTPUT: CQI_DATA_STRING # the full name of as specified in its registry entry - # print('CORPUS_FULL_NAME') self.__send_WORD(CORPUS_FULL_NAME) self.__send_STRING(corpus) return self.__recv_response() @@ -544,20 +527,14 @@ class Client: # INPUT: (STRING corpus) # OUTPUT: CQI_DATA_STRING_LIST # returns the contents of the .info file of as a list of lines - # print('CORPUS_INFO') self.__send_WORD(CORPUS_INFO) self.__send_STRING(corpus) return self.__recv_response() def corpus_drop_corpus(self, corpus): - ''' - ' Broken - ' TODO: Check what type of return value is provided by the server. - ''' # INPUT: (STRING corpus) # OUTPUT: CQI_STATUS_OK # try to unload a corpus and all its attributes from memory - # print('CORPUS_DROP_CORPUS') self.__send_WORD(CORPUS_DROP_CORPUS) self.__send_STRING(corpus) return self.__recv_response() @@ -569,7 +546,6 @@ class Client: # number of tokens (positional) # number of regions (structural) # number of alignments (alignment) - # print('CL_ATTRIBUTE_SIZE') self.__send_WORD(CL_ATTRIBUTE_SIZE) self.__send_STRING(attribute) return self.__recv_response() @@ -580,7 +556,6 @@ class Client: # returns the number of entries in the lexicon of a positional # attribute; # valid lexicon IDs range from 0 .. (lexicon_size - 1) - # print('CL_LEXICON_SIZE') self.__send_WORD(CL_LEXICON_SIZE) self.__send_STRING(attribute) return self.__recv_response() @@ -589,7 +564,6 @@ class Client: # INPUT: (STRING attribute) # OUTPUT: CQI_STATUS_OK # unload attribute from memory - # print('CL_DROP_ATTRIBUTE') self.__send_WORD(CL_LEXICON_SIZE) self.__send_STRING(attribute) return self.__recv_response() @@ -604,7 +578,6 @@ class Client: # OUTPUT: CQI_DATA_INT_LIST # returns -1 for every string in that is not found in the # lexicon - # print('CL_STR2ID') self.__send_WORD(CL_LEXICON_SIZE) self.__send_STRING(attribute) self.__send_STRING_LIST(strings) @@ -614,7 +587,6 @@ class Client: # INPUT: (STRING attribute, INT_LIST id) # OUTPUT: CQI_DATA_STRING_LIST # returns "" for every ID in that is out of range - # print('CL_ID2STR') self.__send_WORD(CL_ID2STR) self.__send_STRING(attribute) self.__send_INT_LIST(id) @@ -624,7 +596,6 @@ class Client: # INPUT: (STRING attribute, INT_LIST id) # OUTPUT: CQI_DATA_INT_LIST # returns 0 for every ID in that is out of range - # print('CL_ID2FREQ') self.__send_WORD(CL_ID2FREQ) self.__send_STRING(attribute) self.__send_INT_LIST(id) @@ -634,7 +605,6 @@ class Client: # INPUT: (STRING attribute, INT_LIST cpos) # OUTPUT: CQI_DATA_INT_LIST # returns -1 for every corpus position in that is out of range - # print('CL_CPOS2ID') self.__send_WORD(CL_ID2FREQ) self.__send_STRING(attribute) self.__send_INT_LIST(cpos) @@ -644,7 +614,6 @@ class Client: # INPUT: (STRING attribute, INT_LIST cpos) # OUTPUT: CQI_DATA_STRING_LIST # returns "" for every corpus position in that is out of range - # print('CL_CPOS2STR') self.__send_WORD(CL_CPOS2STR) self.__send_STRING(attribute) self.__send_INT_LIST(cpos) @@ -654,7 +623,6 @@ class Client: # INPUT: (STRING attribute, INT_LIST cpos) # OUTPUT: CQI_DATA_INT_LIST # returns -1 for every corpus position not inside a structure region - # print('CL_CPOS2STRUC') self.__send_WORD(CL_CPOS2STRUC) self.__send_STRING(attribute) self.__send_INT_LIST(cpos) @@ -670,7 +638,6 @@ class Client: # OUTPUT: CQI_DATA_INT_LIST # returns left boundary of s-attribute region enclosing cpos, -1 if not # in region - # print('CL_CPOS2LBOUND') self.__send_WORD(CL_CPOS2LBOUND) self.__send_STRING(attribute) self.__send_INT_LIST(cpos) @@ -681,7 +648,6 @@ class Client: # OUTPUT: CQI_DATA_INT_LIST # returns right boundary of s-attribute region enclosing cpos, -1 if # not in region - # print('CL_CPOS2RBOUND') self.__send_WORD(CL_CPOS2RBOUND) self.__send_STRING(attribute) self.__send_INT_LIST(cpos) @@ -691,7 +657,6 @@ class Client: # INPUT: (STRING attribute, INT_LIST cpos) # OUTPUT: CQI_DATA_INT_LIST # returns -1 for every corpus position not inside an alignment - # print('CL_CPOS2ALG') self.__send_WORD(CL_CPOS2ALG) self.__send_STRING(attribute) self.__send_INT_LIST(cpos) @@ -703,7 +668,6 @@ class Client: # returns annotated string values of structure regions in ; "" # if out of range # check CQI_CORPUS_STRUCTURAL_ATTRIBUTE_HAS_VALUES() first - # print('CL_STRUC2STR') self.__send_WORD(CL_STRUC2STR) self.__send_STRING(attribute) self.__send_INT_LIST(strucs) @@ -718,7 +682,6 @@ class Client: # INPUT: (STRING attribute, INT id) # OUTPUT: CQI_DATA_INT_LIST # returns all corpus positions where the given token occurs - # print('CL_ID2CPOS') self.__send_WORD(CL_ID2CPOS) self.__send_STRING(attribute) self.__send_INT(id) @@ -729,7 +692,6 @@ class Client: # OUTPUT: CQI_DATA_INT_LIST # returns all corpus positions where one of the tokens in # occurs; the returned list is sorted as a whole, not per token id - # print('CL_IDLIST2CPOS') self.__send_WORD(CL_IDLIST2CPOS) self.__send_STRING(attribute) self.__send_INT_LIST(id_list) @@ -740,7 +702,6 @@ class Client: # OUTPUT: CQI_DATA_INT_LIST # returns lexicon IDs of all tokens that match ; the returned # list may be empty (size 0); - # print('CL_REGEX2ID') self.__send_WORD(CL_REGEX2ID) self.__send_STRING(attribute) self.__send_STRING(regex) @@ -750,7 +711,6 @@ class Client: # INPUT: (STRING attribute, INT struc) # OUTPUT: CQI_DATA_INT_INT # returns start and end corpus positions of structure region - # print('CL_STRUC2CPOS') self.__send_WORD(CL_STRUC2CPOS) self.__send_STRING(attribute) self.__send_INT(struc) @@ -760,7 +720,6 @@ class Client: # INPUT: (STRING attribute, INT alg) # OUTPUT: CQI_DATA_INT_INT_INT_INT # returns (src_start, src_end, target_start, target_end) - # print('CL_ALG2CPOS') self.__send_WORD(CL_ALG2CPOS) self.__send_STRING(attribute) self.__send_INT(alg) @@ -770,7 +729,6 @@ class Client: # INPUT: (STRING mother_corpus, STRING subcorpus_name, STRING query) # OUTPUT: CQI_STATUS_OK # must include the ';' character terminating the query. - # print('CQP_QUERY') self.__send_WORD(CQP_QUERY) self.__send_STRING(mother_corpus) self.__send_STRING(subcorpus_name) @@ -780,7 +738,6 @@ class Client: def cqp_list_subcorpora(self, corpus): # INPUT: (STRING corpus) # OUTPUT: CQI_DATA_STRING_LIST - # print('CQP_LIST_SUBCORPORA') self.__send_WORD(CQP_LIST_SUBCORPORA) self.__send_STRING(corpus) return self.__recv_response() @@ -788,7 +745,6 @@ class Client: def cqp_subcorpus_size(self, subcorpus): # INPUT: (STRING subcorpus) # OUTPUT: CQI_DATA_INT - # print('CQP_SUBCORPUS_SIZE') self.__send_WORD(CQP_SUBCORPUS_SIZE) self.__send_STRING(subcorpus) return self.__recv_response() @@ -796,7 +752,6 @@ class Client: def cqp_subcorpus_has_field(self, subcorpus, field): # INPUT: (STRING subcorpus, BYTE field) # OUTPUT: CQI_DATA_BOOL - # print('CQP_SUBCORPUS_HAS_FIELD') self.__send_WORD(CQP_SUBCORPUS_HAS_FIELD) self.__send_STRING(subcorpus) self.__send_BYTE(field) @@ -807,7 +762,6 @@ class Client: # OUTPUT: CQI_DATA_INT_LIST # Dump the values of for match ranges .. # in . is one of the CQI_CONST_FIELD_* constants. - # print('CQP_DUMP_SUBCORPUS') self.__send_WORD(CQP_DUMP_SUBCORPUS) self.__send_STRING(subcorpus) self.__send_BYTE(field) @@ -819,7 +773,6 @@ class Client: # INPUT: (STRING subcorpus) # OUTPUT: CQI_STATUS_OK # delete a subcorpus from memory - # print('CQP_DROP_SUBCORPUS') self.__send_WORD(CQP_DROP_SUBCORPUS) self.__send_STRING(subcorpus) return self.__recv_response() @@ -837,7 +790,6 @@ class Client: # field is one of CQI_CONST_FIELD_MATCH, CQI_CONST_FIELD_TARGET, # CQI_CONST_FIELD_KEYWORD # NB: pairs are sorted by frequency desc. - # print('CQP_FDIST_1') self.__send_WORD(CQP_FDIST_1) self.__send_STRING(subcorpus) self.__send_INT(cutoff) @@ -854,7 +806,6 @@ class Client: # returns (id1, id2, frequency) pairs flattened into a list of size # 3* # NB: triples are sorted by frequency desc. - # print('CQP_FDIST_2') self.__send_WORD(CQP_FDIST_2) self.__send_STRING(subcorpus) self.__send_INT(cutoff) @@ -914,8 +865,7 @@ class Client: if (len(self.connection.recv(1, socket.MSG_PEEK)) == 1): byte_data = self.connection.recv(1) break - logger.warning('Waiting for data transfer to complete...') - time.sleep(0.1) + sleep(0.1) return struct.unpack('!B', byte_data)[0] def __recv_DATA_BOOL(self): @@ -923,8 +873,7 @@ class Client: if (len(self.connection.recv(1, socket.MSG_PEEK)) == 1): byte_data = self.connection.recv(1) break - logger.warning('Waiting for data transfer to complete...') - time.sleep(0.1) + sleep(0.1) return struct.unpack('!?', byte_data)[0] def __recv_DATA_INT(self): @@ -932,8 +881,7 @@ class Client: if (len(self.connection.recv(4, socket.MSG_PEEK)) == 4): byte_data = self.connection.recv(4) break - logger.warning('Waiting for data transfer to complete...') - time.sleep(0.1) + sleep(0.1) return struct.unpack('!i', byte_data)[0] def __recv_DATA_STRING(self): @@ -942,8 +890,7 @@ class Client: if (len(self.connection.recv(n, socket.MSG_PEEK)) == n): byte_data = self.connection.recv(n) break - logger.warning('Waiting for data transfer to complete...') - time.sleep(0.1) + sleep(0.1) return struct.unpack('!{}s'.format(n), byte_data)[0].decode() def __recv_DATA_BYTE_LIST(self): @@ -1003,8 +950,7 @@ class Client: if (len(self.connection.recv(2, socket.MSG_PEEK)) == 2): byte_data = self.connection.recv(2) break - logger.warning('Waiting for data transfer to complete...') - time.sleep(0.1) + sleep(0.1) return struct.unpack('!H', byte_data)[0] def __send_BYTE(self, byte_data): diff --git a/app/corpora/events.py b/app/corpora/events.py index 190157b1..8449ad08 100644 --- a/app/corpora/events.py +++ b/app/corpora/events.py @@ -26,51 +26,33 @@ def init_corpus_analysis(corpus_id): corpus_id, current_user.id, request.sid) -@socketio.on('corpus_analysis') +@socketio.on('corpus_analysis_query') @login_required -def corpus_analysis(message): +def corpus_analysis_query(query): client = corpus_analysis_clients.get(request.sid) if client is None: socketio.emit('query', '[424]: Failed Dependency', room=request.sid) return - # Prepare and execute a query - corpus_name = 'CORPUS' - query = str(message['query']) - result_len = 200 # int(message['hits_per_page']) - context_len = int(message['context']) - result_offset = 0 - client.select_corpus(corpus_name) + client.select_corpus('CORPUS') try: client.query_subcorpus(query) except Exception as e: logger.warning(e) response = str(e) if response == "CQI_CQP_ERROR_GENERAL": - response = {'code': 1, - 'result': {'matches': [], - 'match_count': 0, - 'cpos_lookup': {}, - 'text_lookup': {}} - } + response = {'code': 1} socketio.emit('corpus_analysis_query', response, room=request.sid) else: - logger.warning('====== Initial query {} ======'.format(query)) - response = client.show_query_results(result_len=result_len, - context_len=context_len, - result_offset=result_offset) - result_offset += result_len # initial offset is plus result len because client.show_query_results has already been executed once - socketio.emit('corpus_analysis_query', response, room=request.sid) - while result_offset < client.match_count: - logger.warning('====== While loop start for {} ======'.format(query)) - logger.warning('result_offset: {}'.format(result_offset)) - response = client.show_query_results(result_len=result_len, - context_len=context_len, - result_offset=result_offset) - result_offset += result_len - result_offset = min(result_offset, client.match_count) - logger.warning('result_offset end of while loop: {}'.format(result_offset)) - socketio.emit('corpus_analysis_query', response, room=request.sid) + chunk_size = 500 + chunk_start = 0 + logger.warning('pjentsch -> Start result transmission for query: {}'.format(query)) + while chunk_start < client.match_count: + chunk = client.show_query_results(result_len=chunk_size, + context_len=100, + result_offset=chunk_start) + socketio.emit('corpus_analysis_query', chunk, room=request.sid) + chunk_start += chunk_size @socketio.on('inspect_match') diff --git a/app/templates/corpora/analyse_corpus.html.j2 b/app/templates/corpora/analyse_corpus.html.j2 index 333b4636..14a0312b 100644 --- a/app/templates/corpora/analyse_corpus.html.j2 +++ b/app/templates/corpora/analyse_corpus.html.j2 @@ -270,7 +270,7 @@ function sendQuery(event) { event.preventDefault(); queryData = getQueryData(queryFormElement); - nopaque.socket.emit("corpus_analysis", queryData); + nopaque.socket.emit("corpus_analysis_query", queryData.query); // full results object declaration, kind of global maybe store it later? // will always be reset if a query is sent, so that only the chunks of the // current query will be saved in it