From dbd580b3c037645560bd7c9bccb932a3280d85df Mon Sep 17 00:00:00 2001 From: Stephan Porada Date: Thu, 28 Nov 2019 14:14:56 +0100 Subject: [PATCH] Get results with wrapper 3.0 --- app/corpora/CQiWrapper/CQiWrapper.py | 122 ++++++++++--------- app/corpora/events.py | 47 +------ app/templates/corpora/analyse_corpus.html.j2 | 9 +- 3 files changed, 76 insertions(+), 102 deletions(-) diff --git a/app/corpora/CQiWrapper/CQiWrapper.py b/app/corpora/CQiWrapper/CQiWrapper.py index d8b133ac..ea0acff9 100644 --- a/app/corpora/CQiWrapper/CQiWrapper.py +++ b/app/corpora/CQiWrapper/CQiWrapper.py @@ -1,6 +1,6 @@ from .CQiClient import CQiClient from .CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND -import re +import time from app import logger # only works if imported into opaque web app @@ -94,6 +94,7 @@ class CQiWrapper(CQiClient): + result_subcorpus_name) self.SUBCORPUS_NAMES.append(self.result_subcorpus) self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus) + print('Nr of all matches is:', self.nr_matches) # logger.warning('Nr of all matches is: {}'.format(self.nr_matches)) def show_subcorpora(self): @@ -104,7 +105,8 @@ class CQiWrapper(CQiClient): def show_query_results(self, context_len=10, - result_len=1000): + result_len=1000, + result_offset=0): """ Show query results @@ -131,14 +133,16 @@ class CQiWrapper(CQiClient): # match_boundries shows the start and end cpos of one match as a # pair of cpositions # [(1355, 1357), (1477, 1479)] Example for two boundry pairs + offset_start = 0 + (result_offset + 1) if result_offset != 0 else result_offset + offset_end = self.nr_matches + result_offset match_boundaries = zip(self.cqp_dump_subcorpus(self.result_subcorpus, CONST_FIELD_MATCH, - 0, - self.nr_matches - 1), + offset_start, + offset_end), self.cqp_dump_subcorpus(self.result_subcorpus, CONST_FIELD_MATCHEND, - 0, - self.nr_matches - 1)) + offset_start, + offset_end)) # Generate all cpos between match boundries including start and end boundries. # Also generate cpos for left and right context. 
@@ -152,7 +156,7 @@ class CQiWrapper(CQiClient): lc = {'lc': lc_cpos} match_cpos = list(range(start, end + 1)) match = {'hit': match_cpos} - rc_cpos = list(range(end + 1, min([self.corpus_max_len, end + self.context_len + 1]))) + rc_cpos = list(range(end, min([self.corpus_max_len, end + self.context_len]))) rc = {'rc': rc_cpos} lc.update(match) lc.update(rc) @@ -161,81 +165,87 @@ class CQiWrapper(CQiClient): # print(all_matches) # print(all_cpos) - # Get all sentences IDs for all above collected cpos in all_cpos - s_ids = self.cl_cpos2struc('CORPUS.s', all_cpos) # CHANGE to CORPUS.s will always be like this in nopaque # Get all cpos for all sneteces boundries - s_lookup = {} - for s_id in set(s_ids): - s_start, s_end = self.cl_struc2cpos('CORPUS.s', s_id) # CHANGE to CORPUS.s will always be like this in nopaque - # print(s_start, s_end) - s_cpos = range(s_start, s_end) - s_lookup.update({s_id: list(s_cpos)}) - # print(list(s_cpos)) - all_cpos.extend(s_cpos) + # s_lookup = {} + # for s_id in set(s_ids): + # s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id) + # # CHANGE to UTOPIEN.s will always be like this in nopaque + # s_cpos = range(s_start, s_end) + # s_lookup.update({s_id: list(s_cpos)}) + # # print(list(s_cpos)) + # all_cpos.extend(s_cpos) + t0 = time.time() all_cpos = list(set(all_cpos)) # get rid of cpos duplicates + t1 = time.time() + t_total = t1 - t0 + print('TIME FOR ALL CPOS:', t_total) + print('CPOS SUM:', len(all_cpos)) # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for # all cpos entries in all_cpos_list # Also saves these informations into self.results dict + t6 = time.time() all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos) + t7 = time.time() + t_final = t7 - t6 + print('GOT ALL RESULTS IN:', t_final) self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos, - 's_lookup': s_lookup, 'text_lookup': text_lookup} + 'text_lookup': text_lookup} return self.results - # print(self.results) def get_cpos_infos(self, all_cpos): ''' Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for all cpos entries specified in the parameter all_cpos. ''' + # Get all positional attribute informations cpos_infos = {} for p_attr_key in self.attr_strings['positional_attrs'].keys(): match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos) cpos_infos[p_attr_key] = match_strs - tmp_s_info = [] - tmp_text_info = [] - text_lookup = {} - tmp_dict = {} + # Get all strucutural attribute informations + tmp_info = {} + structs_to_check = [] for struct_attr_key in self.attr_strings['struct_attrs'].keys(): - check = self.attr_strings['struct_attrs'][struct_attr_key] - if check == 'CORPUS.s': - struct_ids = self.cl_cpos2struc(check, all_cpos) + key = self.attr_strings['struct_attrs'][struct_attr_key] + has_value = self.corpus_structural_attribute_has_values(key) + struct_ids = self.cl_cpos2struc(key, all_cpos) + if has_value is False: # Get IDs of strucutural elements without values (this means get IDs of XML tags. 
Struct elements only have values if they are XML attributes) + tmp_info[struct_attr_key] = [] for id in struct_ids: - tmp_s_info.append({struct_attr_key: id}) - elif check == 'CORPUS.text': - struct_ids = self.cl_cpos2struc(check, all_cpos) - for id in struct_ids: - tmp_text_info.append({struct_attr_key: id}) + tmp_info[struct_attr_key].append(id) else: - struct_ids = struct_ids = self.cl_cpos2struc(check, all_cpos) - struct_values = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_ids) - for value in struct_values: - for id in struct_ids: - tmp_dict.update({id: {struct_attr_key: value}}) - print(tmp_dict) - print(text_lookup) + structs_to_check.append({key: struct_attr_key}) + struct_attr_values = list(tmp_info.values()) + struct_attr_keys = list(tmp_info.keys()) - # struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][struct_attr_key], all_cpos) - # has_value = self.corpus_structural_attribute_has_values(self.attr_strings['struct_attrs'][struct_attr_key]) - # if has_value: - # match_strs = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_entry) - # elif self.attr_strings['struct_attrs'][struct_attr_key] == 'CORPUS.s': - # pass - # else: - # match_strs = [None for i in struct_entry] - # cpos_infos[struct_attr_key] = zip(struct_entry, match_strs) - tmp_list = [] - attr_key_list = [] + # Build textlookup dictionary + text_lookup_ids = list(set(struct_attr_values[0])) # First is always one text + text_lookup = {} + for d in structs_to_check: + s_key, s_value = zip(*d.items()) + s_value = s_value[0].split('_')[1] + struct_values = self.cl_struc2str(s_key[0], text_lookup_ids) + zipped = dict(zip(text_lookup_ids, struct_values)) + for zip_key, zip_value in zipped.items(): + check = text_lookup.get(zip_key) + if check is None: + text_lookup[zip_key] = {s_value: zip_value} + else: + text_lookup[zip_key].update({s_value: zip_value}) + + # zip keys and values together + attr_values_list = [] + attr_keys_list = [] for key in cpos_infos.keys(): - tmp_list.append(cpos_infos[key]) - attr_key_list.append(key) - joined_cpos_infos = zip(all_cpos, *tmp_list) + attr_values_list.append(cpos_infos[key]) + attr_keys_list.append(key) + attr_keys_list.extend(struct_attr_keys) + attr_values_list.extend(struct_attr_values) + joined_cpos_infos = zip(all_cpos, *attr_values_list) dict_cpos_infos = {} for info in joined_cpos_infos: - dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:])) - for key, s_id, text_id in zip(dict_cpos_infos.keys(), tmp_s_info, tmp_text_info): - dict_cpos_infos[key].update(s_id) - dict_cpos_infos[key].update(text_id) + dict_cpos_infos[info[0]] = dict(zip(attr_keys_list, info[1:])) return dict_cpos_infos, text_lookup diff --git a/app/corpora/events.py b/app/corpora/events.py index 265285b5..8d1580c3 100644 --- a/app/corpora/events.py +++ b/app/corpora/events.py @@ -4,10 +4,6 @@ from app.models import Corpus from flask import current_app, request from flask_login import current_user, login_required from .CQiWrapper.CQiWrapper import CQiWrapper -import sys -import gzip -import zlib -import json ''' ' A dictionary containing lists of, with corpus ids associated, Socket.IO @@ -47,46 +43,13 @@ def corpus_analysis(message): room=request.sid) return """ Prepare and execute a query """ - corpus = 'CORPUS' + corpus_name = 'CORPUS' query = (message['query']) - query_subcorpus = 'Results' - client.cqp_query(corpus, query_subcorpus, query) + client.select_corpus(corpus_name) + client.query_subcorpus(query) + results = 
client.show_query_results(result_len=int(message['hits_per_page']), context_len=int(message['context'])) - data = {'matches': [], 'cpos_lookup': {}, 'text_loopup': {}} - - """ Evaluate query results """ - match_corpus = '{}:{}'.format(corpus, query_subcorpus) - match_num = min(int(message['hits_per_page']), client.cqp_subcorpus_size(match_corpus)) - match_boundaries = zip(client.cqp_dump_subcorpus(match_corpus, - 0x10, - 0, match_num - 1), - client.cqp_dump_subcorpus(match_corpus, - 0x11, - 0, match_num - 1)) - context = 15 - corpus_len = 10000 - for match_start, match_end in match_boundaries: - data['matches'].append({'lc': list(range(max(0, match_start - int(message['context'])), match_start)), - 'hit': list(range(match_start, match_end + 1)), - 'rc': list(range(match_end + 1, min(corpus_len, match_end + 1 + int(message['context']))))}) - cpos_list = [] - for match in data['matches']: - cpos_list += match['lc'] + match['hit'] + match['rc'] - cpos_list = list(set(cpos_list)) - lemma_list = client.cl_cpos2str('{}.lemma'.format(corpus), cpos_list) - pos_list = client.cl_cpos2str('{}.pos'.format(corpus), cpos_list) - simple_pos_list = client.cl_cpos2str('{}.simple_pos'.format(corpus), cpos_list) - s_id_list = client.cl_cpos2struc('{}.s'.format(corpus), cpos_list) - text_id_list = client.cl_cpos2struc('{}.text'.format(corpus), cpos_list) - word_list = client.cl_cpos2str('{}.word'.format(corpus), cpos_list) - for cpos, lemma, pos, simple_pos, s_id, text_id, word in zip(cpos_list, lemma_list, pos_list, simple_pos_list, s_id_list, text_id_list, word_list): - data['cpos_lookup'][cpos] = {'lemma': lemma, 'pos': pos, 'simple_pos': simple_pos, 's_id': s_id, 'text_id': text_id, 'word': word} - text_author_list = client.cl_struc2str('{}.text_author'.format(corpus), text_id_list) - text_publishing_year_list = client.cl_struc2str('{}.text_publishing_year'.format(corpus), text_id_list) - text_title_list = client.cl_struc2str('{}.text_title'.format(corpus), text_id_list) - for text_id, text_author, text_publishing_year, text_title in zip(text_id_list, text_author_list, text_publishing_year_list, text_title_list): - data['text_loopup'][text_id] = {'author': text_author, 'publishing_year': text_publishing_year, 'title': text_title} - socketio.emit('corpus_analysis', data, room=request.sid) + socketio.emit('corpus_analysis', results, room=request.sid) def corpus_analysis_session_handler(app, corpus_id, session_id): diff --git a/app/templates/corpora/analyse_corpus.html.j2 b/app/templates/corpora/analyse_corpus.html.j2 index e26f6e52..5db09786 100644 --- a/app/templates/corpora/analyse_corpus.html.j2 +++ b/app/templates/corpora/analyse_corpus.html.j2 @@ -182,6 +182,7 @@ }); socket.on("corpus_analysis", function(message) { + console.log(message); var matchElement; var matchTextTitlesElement; var matchLeftContextElement; @@ -234,7 +235,7 @@ matchHitElement.append(tokenElement); matchHitElement.append(document.createTextNode(" ")); tokenElements.push(tokenElement); - textTitles.add(result["text_loopup"][token["text_id"]]["title"]); + textTitles.add(result["text_lookup"][token["text"]]["title"]); } matchTextTitlesElement.innerText = [...textTitles].join(","); matchElement.append(matchHitElement); @@ -274,9 +275,9 @@ simple_pos: ${token["simple_pos"]} - Title: ${result["text_loopup"][token["text_id"]]["title"]}
- Author: ${result["text_loopup"][token["text_id"]]["title"]}<br>
- Publishing year: ${result["text_loopup"][token["text_id"]]["publishing_year"]}
+ Title: ${result["text_lookup"][token["text"]]["title"]}<br>
+ Author: ${result["text_lookup"][token["text"]]["title"]}<br>
+ Publishing year: ${result["text_lookup"][token["text"]]["publishing_year"]}
 `,
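
Usage note (illustration only, not part of the patch): the sketch below shows the query flow this change establishes, mirroring the new handler in app/corpora/events.py. It assumes an already connected CQiWrapper instance (client construction and connection are outside the hunks above) and only uses calls that appear in this diff: select_corpus(), query_subcorpus() and show_query_results() with result_len, context_len and result_offset. The default argument values are placeholders.

    def run_query(client, query, hits_per_page=30, context=10, offset=0):
        # client: an already connected CQiWrapper instance (setup not shown in this diff)
        client.select_corpus('CORPUS')   # corpus name is hard-coded in events.py for now
        client.query_subcorpus(query)    # runs the CQP query; the wrapper stores the match count
        results = client.show_query_results(result_len=hits_per_page,
                                             context_len=context,
                                             result_offset=offset)
        # results carries the keys 'matches', 'cpos_lookup' and 'text_lookup',
        # the payload events.py emits to the browser as the 'corpus_analysis' event.
        return results

In events.py the same three calls run with hits_per_page and context taken from the incoming Socket.IO message; because show_query_results() slices the already stored result subcorpus, paging through matches only needs a different result_offset rather than a new query.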