Get results with wrapper 3.0

This commit is contained in:
  parent dec90e30b5
  commit dbd580b3c0
@@ -1,6 +1,6 @@
 from .CQiClient import CQiClient
 from .CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
-import re
+import time
 from app import logger # only works if imported into opaque web app
 
 
@@ -94,6 +94,7 @@ class CQiWrapper(CQiClient):
                                                 + result_subcorpus_name)
         self.SUBCORPUS_NAMES.append(self.result_subcorpus)
         self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
+        print('Nr of all matches is:', self.nr_matches)
         # logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
 
     def show_subcorpora(self):
@@ -104,7 +105,8 @@ class CQiWrapper(CQiClient):
 
     def show_query_results(self,
                            context_len=10,
-                           result_len=1000):
+                           result_len=1000,
+                           result_offset=0):
         """
         Show query results
 
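The new result_offset parameter makes the wrapper pageable. A minimal usage sketch, assuming an already connected client; the constructor arguments and the query string are illustrative assumptions, not part of this commit:

    # Hypothetical usage sketch (connection details and query are assumptions):
    client = CQiWrapper(host='127.0.0.1', port=4877)
    client.select_corpus('CORPUS')
    client.query_subcorpus('"Beispiel";')
    first_page = client.show_query_results(context_len=10, result_len=20,
                                           result_offset=0)
    second_page = client.show_query_results(context_len=10, result_len=20,
                                            result_offset=20)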
@@ -131,14 +133,16 @@ class CQiWrapper(CQiClient):
         # match_boundries shows the start and end cpos of one match as a
         # pair of cpositions
         # [(1355, 1357), (1477, 1479)] Example for two boundry pairs
+        offset_start = 0 + (result_offset + 1) if result_offset != 0 else result_offset
+        offset_end = self.nr_matches + result_offset
         match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus,
                                                        CONST_FIELD_MATCH,
-                                                       0,
-                                                       self.nr_matches - 1),
+                                                       offset_start,
+                                                       offset_end),
                                self.cqp_dump_subcorpus(self.result_subcorpus,
                                                        CONST_FIELD_MATCHEND,
-                                                       0,
-                                                       self.nr_matches - 1))
+                                                       offset_start,
+                                                       offset_end))
 
         # Generate all cpos between match boundries including start and end boundries.
         # Also generate cpos for left and right context.
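A worked check of the new window arithmetic, assuming nr_matches = 100 (the value is invented):

    # offset_start/offset_end exactly as committed:
    nr_matches = 100
    for result_offset in (0, 20):
        offset_start = 0 + (result_offset + 1) if result_offset != 0 else result_offset
        offset_end = nr_matches + result_offset
        print(result_offset, offset_start, offset_end)
    # prints: 0 0 100
    # prints: 20 21 120

With a non-zero offset the window starts at result_offset + 1 (skipping the match at index result_offset), and offset_end runs past the last match index, where the old code dumped 0 through self.nr_matches - 1; both bounds look worth re-checking against the CQi dump semantics.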
@@ -152,7 +156,7 @@ class CQiWrapper(CQiClient):
             lc = {'lc': lc_cpos}
             match_cpos = list(range(start, end + 1))
             match = {'hit': match_cpos}
-            rc_cpos = list(range(end + 1, min([self.corpus_max_len, end + self.context_len + 1])))
+            rc_cpos = list(range(end, min([self.corpus_max_len, end + self.context_len])))
             rc = {'rc': rc_cpos}
             lc.update(match)
             lc.update(rc)
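The right-context change shifts the window down by one token. A quick comparison of both expressions, with invented values end = 10, context_len = 3, corpus_max_len = 1000:

    end, context_len, corpus_max_len = 10, 3, 1000
    old_rc = list(range(end + 1, min([corpus_max_len, end + context_len + 1])))
    new_rc = list(range(end, min([corpus_max_len, end + context_len])))
    print(old_rc)  # [11, 12, 13]
    print(new_rc)  # [10, 11, 12]

The new range starts at the match's final cpos, so the last hit token reappears in the right context; whether that duplication is intended is not clear from the commit.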
@@ -161,81 +165,87 @@ class CQiWrapper(CQiClient):
         # print(all_matches)
         # print(all_cpos)
 
-        # Get all sentences IDs for all above collected cpos in all_cpos
-        s_ids = self.cl_cpos2struc('CORPUS.s', all_cpos) # CHANGE to CORPUS.s will always be like this in nopaque
         # Get all cpos for all sneteces boundries
-        s_lookup = {}
-        for s_id in set(s_ids):
-            s_start, s_end = self.cl_struc2cpos('CORPUS.s', s_id) # CHANGE to CORPUS.s will always be like this in nopaque
-            # print(s_start, s_end)
-            s_cpos = range(s_start, s_end)
-            s_lookup.update({s_id: list(s_cpos)})
-            # print(list(s_cpos))
-            all_cpos.extend(s_cpos)
+        # s_lookup = {}
+        # for s_id in set(s_ids):
+        #     s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id)
+        #     # CHANGE to UTOPIEN.s will always be like this in nopaque
+        #     s_cpos = range(s_start, s_end)
+        #     s_lookup.update({s_id: list(s_cpos)})
+        #     # print(list(s_cpos))
+        #     all_cpos.extend(s_cpos)
+        t0 = time.time()
         all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
+        t1 = time.time()
+        t_total = t1 - t0
+        print('TIME FOR ALL CPOS:', t_total)
+        print('CPOS SUM:', len(all_cpos))
 
         # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
         # all cpos entries in all_cpos_list
         # Also saves these informations into self.results dict
+        t6 = time.time()
         all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
+        t7 = time.time()
+        t_final = t7 - t6
+        print('GOT ALL RESULTS IN:', t_final)
 
         self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos,
-                        's_lookup': s_lookup, 'text_lookup': text_lookup}
+                        'text_lookup': text_lookup}
         return self.results
-        # print(self.results)
 
     def get_cpos_infos(self, all_cpos):
         '''
         Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
         all cpos entries specified in the parameter all_cpos.
         '''
+        # Get all positional attribute informations
         cpos_infos = {}
         for p_attr_key in self.attr_strings['positional_attrs'].keys():
             match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos)
             cpos_infos[p_attr_key] = match_strs
 
-        tmp_s_info = []
-        tmp_text_info = []
-        text_lookup = {}
-        tmp_dict = {}
+        # Get all strucutural attribute informations
+        tmp_info = {}
+        structs_to_check = []
         for struct_attr_key in self.attr_strings['struct_attrs'].keys():
-            check = self.attr_strings['struct_attrs'][struct_attr_key]
-            if check == 'CORPUS.s':
-                struct_ids = self.cl_cpos2struc(check, all_cpos)
+            key = self.attr_strings['struct_attrs'][struct_attr_key]
+            has_value = self.corpus_structural_attribute_has_values(key)
+            struct_ids = self.cl_cpos2struc(key, all_cpos)
+            if has_value is False: # Get IDs of strucutural elements without values (this means get IDs of XML tags. Struct elements only have values if they are XML attributes)
+                tmp_info[struct_attr_key] = []
                 for id in struct_ids:
-                    tmp_s_info.append({struct_attr_key: id})
-            elif check == 'CORPUS.text':
-                struct_ids = self.cl_cpos2struc(check, all_cpos)
-                for id in struct_ids:
-                    tmp_text_info.append({struct_attr_key: id})
+                    tmp_info[struct_attr_key].append(id)
             else:
-                struct_ids = struct_ids = self.cl_cpos2struc(check, all_cpos)
-                struct_values = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_ids)
-                for value in struct_values:
-                    for id in struct_ids:
-                        tmp_dict.update({id: {struct_attr_key: value}})
-        print(tmp_dict)
-        print(text_lookup)
+                structs_to_check.append({key: struct_attr_key})
+        struct_attr_values = list(tmp_info.values())
+        struct_attr_keys = list(tmp_info.keys())
 
-        # struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][struct_attr_key], all_cpos)
-        # has_value = self.corpus_structural_attribute_has_values(self.attr_strings['struct_attrs'][struct_attr_key])
-        # if has_value:
-        #     match_strs = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_entry)
-        # elif self.attr_strings['struct_attrs'][struct_attr_key] == 'CORPUS.s':
-        #     pass
-        # else:
-        #     match_strs = [None for i in struct_entry]
-        # cpos_infos[struct_attr_key] = zip(struct_entry, match_strs)
-        tmp_list = []
-        attr_key_list = []
+        # Build textlookup dictionary
+        text_lookup_ids = list(set(struct_attr_values[0])) # First is always one text
+        text_lookup = {}
+        for d in structs_to_check:
+            s_key, s_value = zip(*d.items())
+            s_value = s_value[0].split('_')[1]
+            struct_values = self.cl_struc2str(s_key[0], text_lookup_ids)
+            zipped = dict(zip(text_lookup_ids, struct_values))
+            for zip_key, zip_value in zipped.items():
+                check = text_lookup.get(zip_key)
+                if check is None:
+                    text_lookup[zip_key] = {s_value: zip_value}
+                else:
+                    text_lookup[zip_key].update({s_value: zip_value})
 
+        # zip keys and values together
+        attr_values_list = []
+        attr_keys_list = []
         for key in cpos_infos.keys():
-            tmp_list.append(cpos_infos[key])
-            attr_key_list.append(key)
-        joined_cpos_infos = zip(all_cpos, *tmp_list)
+            attr_values_list.append(cpos_infos[key])
+            attr_keys_list.append(key)
+        attr_keys_list.extend(struct_attr_keys)
+        attr_values_list.extend(struct_attr_values)
+        joined_cpos_infos = zip(all_cpos, *attr_values_list)
         dict_cpos_infos = {}
         for info in joined_cpos_infos:
-            dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:]))
-        for key, s_id, text_id in zip(dict_cpos_infos.keys(), tmp_s_info, tmp_text_info):
-            dict_cpos_infos[key].update(s_id)
-            dict_cpos_infos[key].update(text_id)
+            dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:]))
         return dict_cpos_infos, text_lookup
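With s_lookup dropped and get_cpos_infos reworked, the payload is reduced to three keys. An illustrative sketch of the returned structure (all values invented; the text_lookup keys follow the CORPUS.text_* attribute names split on '_', e.g. 'title', 'author', 'publishing_year'):

    results = {
        'matches': [{'lc': [1345, 1346], 'hit': [1347, 1348], 'rc': [1348, 1349]}],
        'cpos_lookup': {1347: {'word': 'Beispiel', 'lemma': 'Beispiel', 'text': 0}},
        'text_lookup': {0: {'author': '…', 'publishing_year': '…', 'title': '…'}},
    }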
@@ -4,10 +4,6 @@ from app.models import Corpus
 from flask import current_app, request
 from flask_login import current_user, login_required
 from .CQiWrapper.CQiWrapper import CQiWrapper
-import sys
-import gzip
-import zlib
-import json
 
 '''
 ' A dictionary containing lists of, with corpus ids associated, Socket.IO
@@ -47,46 +43,13 @@ def corpus_analysis(message):
                       room=request.sid)
         return
     """ Prepare and execute a query """
-    corpus = 'CORPUS'
+    corpus_name = 'CORPUS'
     query = (message['query'])
-    query_subcorpus = 'Results'
-    client.cqp_query(corpus, query_subcorpus, query)
+    client.select_corpus(corpus_name)
+    client.query_subcorpus(query)
+    results = client.show_query_results(result_len=int(message['hits_per_page']), context_len=int(message['context']))
 
-    data = {'matches': [], 'cpos_lookup': {}, 'text_loopup': {}}
+    socketio.emit('corpus_analysis', results, room=request.sid)
 
-    """ Evaluate query results """
-    match_corpus = '{}:{}'.format(corpus, query_subcorpus)
-    match_num = min(int(message['hits_per_page']), client.cqp_subcorpus_size(match_corpus))
-    match_boundaries = zip(client.cqp_dump_subcorpus(match_corpus,
-                                                     0x10,
-                                                     0, match_num - 1),
-                           client.cqp_dump_subcorpus(match_corpus,
-                                                     0x11,
-                                                     0, match_num - 1))
-    context = 15
-    corpus_len = 10000
-    for match_start, match_end in match_boundaries:
-        data['matches'].append({'lc': list(range(max(0, match_start - int(message['context'])), match_start)),
-                                'hit': list(range(match_start, match_end + 1)),
-                                'rc': list(range(match_end + 1, min(corpus_len, match_end + 1 + int(message['context']))))})
-    cpos_list = []
-    for match in data['matches']:
-        cpos_list += match['lc'] + match['hit'] + match['rc']
-    cpos_list = list(set(cpos_list))
-    lemma_list = client.cl_cpos2str('{}.lemma'.format(corpus), cpos_list)
-    pos_list = client.cl_cpos2str('{}.pos'.format(corpus), cpos_list)
-    simple_pos_list = client.cl_cpos2str('{}.simple_pos'.format(corpus), cpos_list)
-    s_id_list = client.cl_cpos2struc('{}.s'.format(corpus), cpos_list)
-    text_id_list = client.cl_cpos2struc('{}.text'.format(corpus), cpos_list)
-    word_list = client.cl_cpos2str('{}.word'.format(corpus), cpos_list)
-    for cpos, lemma, pos, simple_pos, s_id, text_id, word in zip(cpos_list, lemma_list, pos_list, simple_pos_list, s_id_list, text_id_list, word_list):
-        data['cpos_lookup'][cpos] = {'lemma': lemma, 'pos': pos, 'simple_pos': simple_pos, 's_id': s_id, 'text_id': text_id, 'word': word}
-    text_author_list = client.cl_struc2str('{}.text_author'.format(corpus), text_id_list)
-    text_publishing_year_list = client.cl_struc2str('{}.text_publishing_year'.format(corpus), text_id_list)
-    text_title_list = client.cl_struc2str('{}.text_title'.format(corpus), text_id_list)
-    for text_id, text_author, text_publishing_year, text_title in zip(text_id_list, text_author_list, text_publishing_year_list, text_title_list):
-        data['text_loopup'][text_id] = {'author': text_author, 'publishing_year': text_publishing_year, 'title': text_title}
-    socketio.emit('corpus_analysis', data, room=request.sid)
 
 
 def corpus_analysis_session_handler(app, corpus_id, session_id):
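The handler now delegates the whole evaluation to the wrapper and forwards its return value unchanged; the hand-rolled dump-and-zip block (including the misspelled text_loopup key) is gone. Note that message['hits_per_page'] and message['context'] are passed through, but result_offset is not wired up here, so this handler still always emits the first page. A hypothetical follow-up, assuming a page offset were added to the client message (the key name is invented):

    # Hypothetical paging hookup, not part of this commit:
    # results = client.show_query_results(
    #     result_len=int(message['hits_per_page']),
    #     context_len=int(message['context']),
    #     result_offset=int(message.get('result_offset', 0)))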
@@ -182,6 +182,7 @@
         });
 
         socket.on("corpus_analysis", function(message) {
+          console.log(message);
           var matchElement;
           var matchTextTitlesElement;
           var matchLeftContextElement;
@@ -234,7 +235,7 @@
             matchHitElement.append(tokenElement);
             matchHitElement.append(document.createTextNode(" "));
             tokenElements.push(tokenElement);
-            textTitles.add(result["text_loopup"][token["text_id"]]["title"]);
+            textTitles.add(result["text_lookup"][token["text"]]["title"]);
           }
           matchTextTitlesElement.innerText = [...textTitles].join(",");
           matchElement.append(matchHitElement);
@@ -274,9 +275,9 @@
                 simple_pos: ${token["simple_pos"]}
               </td>
               <td class="left-align">
-                Title: ${result["text_loopup"][token["text_id"]]["title"]}<br>
-                Author: ${result["text_loopup"][token["text_id"]]["title"]}<br>
-                Publishing year: ${result["text_loopup"][token["text_id"]]["publishing_year"]}
+                Title: ${result["text_lookup"][token["text"]]["title"]}<br>
+                Author: ${result["text_lookup"][token["text"]]["title"]}<br>
+                Publishing year: ${result["text_lookup"][token["text"]]["publishing_year"]}
               </td>
             </tr>
           </table>`,
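On the client, the lookups change from result["text_loopup"][token["text_id"]] to result["text_lookup"][token["text"]], matching the renamed keys in the wrapper's payload. The Author line reads the "title" field on both sides of the diff, which looks like a pre-existing copy-paste slip; the "author" key that the wrapper derives from CORPUS.text_author is presumably what was meant.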