	Move Socket.IO Namespaces to dedicated directory
		@@ -1,206 +0,0 @@
from cqi import CQiClient
from cqi.errors import CQiException
from cqi.status import CQiStatus
from docker.models.containers import Container
from flask import current_app, session
from flask_login import current_user
from flask_socketio import Namespace
from inspect import signature
from threading import Lock
from typing import Callable
from app import db, docker_client, hashids, socketio
from app.decorators import socketio_login_required
from app.models import Corpus, CorpusStatus
from . import extensions


'''
This package tunnels the Corpus Query interface (CQi) protocol through
Socket.IO (SIO) by forwarding CQi API calls through an event called "exec".

Basic concept:
1. A client connects to the namespace.
2. The client emits the "init" event and provides a corpus id for the corpus
   that should be analysed in this session.
     2.1 The analysis session counter of the corpus is incremented.
     2.2 A CQiClient and a (Mutex) Lock belonging to it are created.
     2.3 Wait until the CQP server is running.
     2.4 Connect the CQiClient to the server.
     2.5 Save the CQiClient, the Lock and the corpus id in the session for
         subsequent use.
3. The client emits "exec" events, within which it provides the name of a CQi
   API function and the corresponding arguments.
     3.1 The "exec" event handler executes the function, makes sure that the
         result is serializable and returns the result back to the client.
4. The client disconnects from the namespace.
     4.1 The analysis session counter of the corpus is decremented.
     4.2 The CQiClient and the (Mutex) Lock belonging to it are torn down.
'''
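
# A minimal client-side sketch of the flow described above (illustrative
# only, not part of this module). It assumes the python-socketio client
# package; the server URL, the namespace path and the corpus hashid are
# placeholder values.
#
# import socketio
#
# sio = socketio.Client()
# sio.connect('https://nopaque.example.org', namespaces=['/cqi_over_sio'])
# # Step 2: start an analysis session for one corpus
# init_ack = sio.call('init', 'CORPUS_HASHID', namespace='/cqi_over_sio')
# if init_ack['code'] == 200:
#     # Step 3: tunnel a CQi API call through the "exec" event
#     exec_ack = sio.call(
#         'exec',
#         ('corpus_list_corpora', {}),
#         namespace='/cqi_over_sio'
#     )
#     print(exec_ack['payload'])
# # Step 4: end the analysis session
# sio.disconnect()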


CQI_API_FUNCTION_NAMES: list[str] = [
    'ask_feature_cl_2_3',
    'ask_feature_cqi_1_0',
    'ask_feature_cqp_2_3',
    'cl_alg2cpos',
    'cl_attribute_size',
    'cl_cpos2alg',
    'cl_cpos2id',
    'cl_cpos2lbound',
    'cl_cpos2rbound',
    'cl_cpos2str',
    'cl_cpos2struc',
    'cl_drop_attribute',
    'cl_id2cpos',
    'cl_id2freq',
    'cl_id2str',
    'cl_idlist2cpos',
    'cl_lexicon_size',
    'cl_regex2id',
    'cl_str2id',
    'cl_struc2cpos',
    'cl_struc2str',
    'corpus_alignment_attributes',
    'corpus_charset',
    'corpus_drop_corpus',
    'corpus_full_name',
    'corpus_info',
    'corpus_list_corpora',
    'corpus_positional_attributes',
    'corpus_properties',
    'corpus_structural_attribute_has_values',
    'corpus_structural_attributes',
    'cqp_drop_subcorpus',
    'cqp_dump_subcorpus',
    'cqp_fdist_1',
    'cqp_fdist_2',
    'cqp_list_subcorpora',
    'cqp_query',
    'cqp_subcorpus_has_field',
    'cqp_subcorpus_size',
    'ctrl_bye',
    'ctrl_connect',
    'ctrl_last_general_error',
    'ctrl_ping',
    'ctrl_user_abort'
]


class CQiOverSocketIO(Namespace):
    @socketio_login_required
    def on_connect(self):
        pass

    @socketio_login_required
    def on_init(self, db_corpus_hashid: str):
        db_corpus_id: int = hashids.decode(db_corpus_hashid)
        db_corpus: Corpus | None = Corpus.query.get(db_corpus_id)
        if db_corpus is None:
            return {'code': 404, 'msg': 'Not Found'}
        if not (db_corpus.user == current_user
                or current_user.is_following_corpus(db_corpus)
                or current_user.is_administrator):
            return {'code': 403, 'msg': 'Forbidden'}
        if db_corpus.status not in [
            CorpusStatus.BUILT,
            CorpusStatus.STARTING_ANALYSIS_SESSION,
            CorpusStatus.RUNNING_ANALYSIS_SESSION,
            CorpusStatus.CANCELING_ANALYSIS_SESSION
        ]:
            return {'code': 424, 'msg': 'Failed Dependency'}
        if db_corpus.num_analysis_sessions is None:
            db_corpus.num_analysis_sessions = 0
            db.session.commit()
        db_corpus.num_analysis_sessions = Corpus.num_analysis_sessions + 1
        db.session.commit()
        retry_counter: int = 20
        while db_corpus.status != CorpusStatus.RUNNING_ANALYSIS_SESSION:
            if retry_counter == 0:
                db_corpus.num_analysis_sessions = Corpus.num_analysis_sessions - 1
                db.session.commit()
                return {'code': 408, 'msg': 'Request Timeout'}
            socketio.sleep(3)
            retry_counter -= 1
            db.session.refresh(db_corpus)
        # cqi_client: CQiClient = CQiClient(f'cqpserver_{db_corpus_id}')
        cqpserver_container_name: str = f'cqpserver_{db_corpus_id}'
        cqpserver_container: Container = docker_client.containers.get(cqpserver_container_name)
        cqpserver_host: str = cqpserver_container.attrs['NetworkSettings']['Networks'][current_app.config['NOPAQUE_DOCKER_NETWORK_NAME']]['IPAddress']
        cqi_client: CQiClient = CQiClient(cqpserver_host)
        session['cqi_over_sio'] = {
            'cqi_client': cqi_client,
            'cqi_client_lock': Lock(),
            'db_corpus_id': db_corpus_id
        }
        return {'code': 200, 'msg': 'OK'}

    @socketio_login_required
    def on_exec(self, fn_name: str, fn_args: dict = {}):
        try:
            cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
            cqi_client_lock: Lock = session['cqi_over_sio']['cqi_client_lock']
        except KeyError:
            return {'code': 424, 'msg': 'Failed Dependency'}
        if fn_name in CQI_API_FUNCTION_NAMES:
            fn: Callable = getattr(cqi_client.api, fn_name)
        elif fn_name in extensions.CQI_EXTENSION_FUNCTION_NAMES:
            fn: Callable = getattr(extensions, fn_name)
        else:
            return {'code': 400, 'msg': 'Bad Request'}
        for param in signature(fn).parameters.values():
            if param.default is param.empty:
                if param.name not in fn_args:
                    return {'code': 400, 'msg': 'Bad Request'}
            else:
                if param.name not in fn_args:
                    continue
            if type(fn_args[param.name]) is not param.annotation:
                return {'code': 400, 'msg': 'Bad Request'}
        cqi_client_lock.acquire()
        try:
            fn_return_value = fn(**fn_args)
        except BrokenPipeError as e:
            return {'code': 500, 'msg': 'Internal Server Error'}
        except CQiException as e:
            return {
                'code': 502,
                'msg': 'Bad Gateway',
                'payload': {
                    'code': e.code,
                    'desc': e.description,
                    'msg': e.__class__.__name__
                }
            }
        finally:
            cqi_client_lock.release()
        if isinstance(fn_return_value, CQiStatus):
            payload = {
                'code': fn_return_value.code,
                'msg': fn_return_value.__class__.__name__
            }
        else:
            payload = fn_return_value
        return {'code': 200, 'msg': 'OK', 'payload': payload}

    def on_disconnect(self):
        try:
            cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
            cqi_client_lock: Lock = session['cqi_over_sio']['cqi_client_lock']
            db_corpus_id: int = session['cqi_over_sio']['db_corpus_id']
        except KeyError:
            return
        cqi_client_lock.acquire()
        try:
            session.pop('cqi_over_sio')
        except KeyError:
            pass
        try:
            cqi_client.api.ctrl_bye()
        except (BrokenPipeError, CQiException):
            pass
        cqi_client_lock.release()
        db_corpus: Corpus | None = Corpus.query.get(db_corpus_id)
        if db_corpus is None:
            return
        db_corpus.num_analysis_sessions = Corpus.num_analysis_sessions - 1
        db.session.commit()
@@ -1,286 +0,0 @@
from collections import Counter
from cqi import CQiClient
from cqi.models.corpora import Corpus as CQiCorpus
from cqi.models.subcorpora import Subcorpus as CQiSubcorpus
from cqi.models.attributes import (
    PositionalAttribute as CQiPositionalAttribute,
    StructuralAttribute as CQiStructuralAttribute
)
from cqi.status import StatusOk as CQiStatusOk
from flask import session
import gzip
import json
import math
from app import db
from app.models import Corpus
from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus


CQI_EXTENSION_FUNCTION_NAMES: list[str] = [
    'ext_corpus_update_db',
    'ext_corpus_static_data',
    'ext_corpus_paginate_corpus',
    'ext_cqp_paginate_subcorpus',
    'ext_cqp_partial_export_subcorpus',
    'ext_cqp_export_subcorpus',
]


def ext_corpus_update_db(corpus: str) -> CQiStatusOk:
    cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
    db_corpus_id: int = session['cqi_over_sio']['db_corpus_id']
    db_corpus: Corpus = Corpus.query.get(db_corpus_id)
    cqi_corpus: CQiCorpus = cqi_client.corpora.get(corpus)
    db_corpus.num_tokens = cqi_corpus.size
    db.session.commit()
    return CQiStatusOk()


def ext_corpus_static_data(corpus: str) -> bytes:
    db_corpus_id: int = session['cqi_over_sio']['db_corpus_id']
    db_corpus: Corpus = Corpus.query.get(db_corpus_id)

    static_data_file_path = db_corpus.path / 'cwb' / 'static.json.gz'
    if static_data_file_path.exists():
        with static_data_file_path.open('rb') as f:
            return f.read()

    cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
    cqi_corpus: CQiCorpus = cqi_client.corpora.get(corpus)
    cqi_p_attrs: list[CQiPositionalAttribute] = cqi_corpus.positional_attributes.list()
    cqi_s_attrs: list[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list()

    static_data = {
        'corpus': {
            'bounds': [0, cqi_corpus.size - 1],
            'freqs': {}
        },
        'p_attrs': {},
        's_attrs': {},
        'values': {'p_attrs': {}, 's_attrs': {}}
    }

    for p_attr in cqi_p_attrs:
        print(f'corpus.freqs.{p_attr.name}')
        static_data['corpus']['freqs'][p_attr.name] = []
        p_attr_id_list: list[int] = list(range(p_attr.lexicon_size))
        static_data['corpus']['freqs'][p_attr.name].extend(p_attr.freqs_by_ids(p_attr_id_list))
        del p_attr_id_list

        print(f'p_attrs.{p_attr.name}')
        static_data['p_attrs'][p_attr.name] = []
        cpos_list: list[int] = list(range(cqi_corpus.size))
        static_data['p_attrs'][p_attr.name].extend(p_attr.ids_by_cpos(cpos_list))
        del cpos_list

        print(f'values.p_attrs.{p_attr.name}')
        static_data['values']['p_attrs'][p_attr.name] = []
        p_attr_id_list: list[int] = list(range(p_attr.lexicon_size))
        static_data['values']['p_attrs'][p_attr.name].extend(p_attr.values_by_ids(p_attr_id_list))
        del p_attr_id_list

    for s_attr in cqi_s_attrs:
        if s_attr.has_values:
            continue

        static_data['s_attrs'][s_attr.name] = {'lexicon': [], 'values': None}

        if s_attr.name in ['s', 'ent']:
            ##############################################################
            # A faster way to get cpos boundaries for smaller s_attrs    #
            # Note: Needs more testing, don't use it in production       #
            ##############################################################
            cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
            cqi_subcorpus: CQiSubcorpus = cqi_corpus.subcorpora.get('Last')
            first_match: int = 0
            last_match: int = cqi_subcorpus.size - 1
            match_boundaries = zip(
                range(first_match, last_match + 1),
                cqi_subcorpus.dump(
                    cqi_subcorpus.fields['match'],
                    first_match,
                    last_match
                ),
                cqi_subcorpus.dump(
                    cqi_subcorpus.fields['matchend'],
                    first_match,
                    last_match
                )
            )
            cqi_subcorpus.drop()
            del cqi_subcorpus, first_match, last_match
            for id, lbound, rbound in match_boundaries:
                static_data['s_attrs'][s_attr.name]['lexicon'].append({})
                print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
                static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
            del match_boundaries

        if s_attr.name != 'text':
            continue

        for id in range(0, s_attr.size):
            static_data['s_attrs'][s_attr.name]['lexicon'].append({})
            # This is a very slow operation, that's why we only use it for
            # the text attribute
            lbound, rbound = s_attr.cpos_by_id(id)
            print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
            static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
            static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
            cpos_list: list[int] = list(range(lbound, rbound + 1))
            for p_attr in cqi_p_attrs:
                p_attr_ids: list[int] = []
                p_attr_ids.extend(p_attr.ids_by_cpos(cpos_list))
                print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs.{p_attr.name}')
                static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
                del p_attr_ids
            del cpos_list

        sub_s_attrs: list[CQiStructuralAttribute] = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
        print(f's_attrs.{s_attr.name}.values')
        static_data['s_attrs'][s_attr.name]['values'] = [
            sub_s_attr.name[(len(s_attr.name) + 1):]
            for sub_s_attr in sub_s_attrs
        ]
        s_attr_id_list: list[int] = list(range(s_attr.size))
        sub_s_attr_values: list[list[str]] = []
        for sub_s_attr in sub_s_attrs:
            tmp = []
            tmp.extend(sub_s_attr.values_by_ids(s_attr_id_list))
            sub_s_attr_values.append(tmp)
            del tmp
        del s_attr_id_list
        print(f'values.s_attrs.{s_attr.name}')
        static_data['values']['s_attrs'][s_attr.name] = [
            {
                s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id]
                for s_attr_value_name_idx, s_attr_value_name in enumerate(
                    static_data['s_attrs'][s_attr.name]['values']
                )
            } for s_attr_id in range(0, s_attr.size)
        ]
        del sub_s_attr_values
    print('Saving static data to file')
    with gzip.open(static_data_file_path, 'wt') as f:
        json.dump(static_data, f)
    del static_data
    print('Sending static data to client')
    with open(static_data_file_path, 'rb') as f:
        return f.read()


def ext_corpus_paginate_corpus(
    corpus: str,
    page: int = 1,
    per_page: int = 20
) -> dict:
    cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
    cqi_corpus = cqi_client.corpora.get(corpus)
    # Sanity checks
    if (
        per_page < 1
        or page < 1
        or (
            cqi_corpus.size > 0
            and page > math.ceil(cqi_corpus.size / per_page)
        )
    ):
        return {'code': 416, 'msg': 'Range Not Satisfiable'}
    first_cpos = (page - 1) * per_page
    last_cpos = min(cqi_corpus.size, first_cpos + per_page)
    cpos_list = [*range(first_cpos, last_cpos)]
    lookups = lookups_by_cpos(cqi_corpus, cpos_list)
    payload = {}
    # The items for the current page
    payload['items'] = [cpos_list]
    # The lookups for the items
    payload['lookups'] = lookups
    # The total number of items matching the query
    payload['total'] = cqi_corpus.size
    # The number of items to be displayed on a page
    payload['per_page'] = per_page
    # The total number of pages
    payload['pages'] = math.ceil(payload['total'] / payload['per_page'])
    # The current page number (1 indexed)
    payload['page'] = page if payload['pages'] > 0 else None
    # True if a previous page exists
    payload['has_prev'] = payload['page'] > 1 if payload['page'] else False
    # True if a next page exists
    payload['has_next'] = payload['page'] < payload['pages'] if payload['page'] else False  # noqa
    # Number of the previous page
    payload['prev_num'] = payload['page'] - 1 if payload['has_prev'] else None
    # Number of the next page
    payload['next_num'] = payload['page'] + 1 if payload['has_next'] else None
    return payload
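
# Illustrative (hypothetical) shape of the payload returned above for a
# corpus of 100 tokens with page=1 and per_page=20; the lookups entry is
# abbreviated:
#
# {
#     'items': [[0, 1, ..., 19]],
#     'lookups': {'cpos_lookup': {...}},
#     'total': 100,
#     'per_page': 20,
#     'pages': 5,
#     'page': 1,
#     'has_prev': False,
#     'has_next': True,
#     'prev_num': None,
#     'next_num': 2
# }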


def ext_cqp_paginate_subcorpus(
    subcorpus: str,
    context: int = 50,
    page: int = 1,
    per_page: int = 20
) -> dict:
    corpus_name, subcorpus_name = subcorpus.split(':', 1)
    cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
    cqi_corpus = cqi_client.corpora.get(corpus_name)
    cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
    # Sanity checks
    if (
        per_page < 1
        or page < 1
        or (
            cqi_subcorpus.size > 0
            and page > math.ceil(cqi_subcorpus.size / per_page)
        )
    ):
        return {'code': 416, 'msg': 'Range Not Satisfiable'}
    offset = (page - 1) * per_page
    cutoff = per_page
    cqi_results_export = export_subcorpus(
        cqi_subcorpus, context=context, cutoff=cutoff, offset=offset)
    payload = {}
    # The items for the current page
    payload['items'] = cqi_results_export.pop('matches')
    # The lookups for the items
    payload['lookups'] = cqi_results_export
    # The total number of items matching the query
    payload['total'] = cqi_subcorpus.size
    # The number of items to be displayed on a page
    payload['per_page'] = per_page
    # The total number of pages
    payload['pages'] = math.ceil(payload['total'] / payload['per_page'])
    # The current page number (1 indexed)
    payload['page'] = page if payload['pages'] > 0 else None
    # True if a previous page exists
    payload['has_prev'] = payload['page'] > 1 if payload['page'] else False
    # True if a next page exists
    payload['has_next'] = payload['page'] < payload['pages'] if payload['page'] else False  # noqa
    # Number of the previous page
    payload['prev_num'] = payload['page'] - 1 if payload['has_prev'] else None
    # Number of the next page
    payload['next_num'] = payload['page'] + 1 if payload['has_next'] else None
    return payload


def ext_cqp_partial_export_subcorpus(
    subcorpus: str,
    match_id_list: list,
    context: int = 50
) -> dict:
    corpus_name, subcorpus_name = subcorpus.split(':', 1)
    cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
    cqi_corpus = cqi_client.corpora.get(corpus_name)
    cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
    cqi_subcorpus_partial_export = partial_export_subcorpus(cqi_subcorpus, match_id_list, context=context)
    return cqi_subcorpus_partial_export


def ext_cqp_export_subcorpus(
    subcorpus: str,
    context: int = 50
) -> dict:
    corpus_name, subcorpus_name = subcorpus.split(':', 1)
    cqi_client: CQiClient = session['cqi_over_sio']['cqi_client']
    cqi_corpus = cqi_client.corpora.get(corpus_name)
    cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
    cqi_subcorpus_export = export_subcorpus(cqi_subcorpus, context=context)
    return cqi_subcorpus_export
@@ -1,130 +0,0 @@
from cqi.models.corpora import Corpus as CQiCorpus
from cqi.models.subcorpora import Subcorpus as CQiSubcorpus


def lookups_by_cpos(corpus: CQiCorpus, cpos_list: list[int]) -> dict:
    lookups = {}
    lookups['cpos_lookup'] = {cpos: {} for cpos in cpos_list}
    for attr in corpus.positional_attributes.list():
        cpos_attr_values: list[str] = attr.values_by_cpos(cpos_list)
        for i, cpos in enumerate(cpos_list):
            lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_values[i]
    for attr in corpus.structural_attributes.list():
        # We only want to iterate over non-subattributes, identifiable by
        # attr.has_values == False
        if attr.has_values:
            continue
        cpos_attr_ids: list[int] = attr.ids_by_cpos(cpos_list)
        for i, cpos in enumerate(cpos_list):
            if cpos_attr_ids[i] == -1:
                continue
            lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_ids[i]
        occurred_attr_ids = [x for x in set(cpos_attr_ids) if x != -1]
        if len(occurred_attr_ids) == 0:
            continue
        subattrs = corpus.structural_attributes.list(filters={'part_of': attr})
        if len(subattrs) == 0:
            continue
        lookup_name: str = f'{attr.name}_lookup'
        lookups[lookup_name] = {}
        for attr_id in occurred_attr_ids:
            lookups[lookup_name][attr_id] = {}
        for subattr in subattrs:
            subattr_name = subattr.name[(len(attr.name) + 1):]  # noqa
            for i, subattr_value in enumerate(subattr.values_by_ids(occurred_attr_ids)):  # noqa
                lookups[lookup_name][occurred_attr_ids[i]][subattr_name] = subattr_value  # noqa
    return lookups
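
# Illustrative (hypothetical) shape of the returned lookups for
# cpos_list=[42], a positional attribute "word" and a structural attribute
# "text" that has a "title" sub-attribute:
#
# {
#     'cpos_lookup': {42: {'word': 'example', 'text': 3}},
#     'text_lookup': {3: {'title': 'Some document title'}}
# }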


def partial_export_subcorpus(
    subcorpus: CQiSubcorpus,
    match_id_list: list[int],
    context: int = 25
) -> dict:
    if subcorpus.size == 0:
        return {'matches': []}
    match_boundaries = []
    for match_id in match_id_list:
        if match_id < 0 or match_id >= subcorpus.size:
            continue
        match_boundaries.append(
            (
                match_id,
                subcorpus.dump(subcorpus.fields['match'], match_id, match_id)[0],
                subcorpus.dump(subcorpus.fields['matchend'], match_id, match_id)[0]
            )
        )
    cpos_set = set()
    matches = []
    for match_boundary in match_boundaries:
        match_num, match_start, match_end = match_boundary
        c = (match_start, match_end)
        if match_start == 0 or context == 0:
            lc = None
            cpos_list_lbound = match_start
        else:
            lc_lbound = max(0, (match_start - context))
            lc_rbound = match_start - 1
            lc = (lc_lbound, lc_rbound)
            cpos_list_lbound = lc_lbound
        if match_end == (subcorpus.collection.corpus.size - 1) or context == 0:
            rc = None
            cpos_list_rbound = match_end
        else:
            rc_lbound = match_end + 1
            rc_rbound = min(
                (match_end + context),
                (subcorpus.collection.corpus.size - 1)
            )
            rc = (rc_lbound, rc_rbound)
            cpos_list_rbound = rc_rbound
        match = {'num': match_num, 'lc': lc, 'c': c, 'rc': rc}
        matches.append(match)
        cpos_set.update(range(cpos_list_lbound, cpos_list_rbound + 1))
    lookups = lookups_by_cpos(subcorpus.collection.corpus, list(cpos_set))
    return {'matches': matches, **lookups}


def export_subcorpus(
    subcorpus: CQiSubcorpus,
    context: int = 25,
    cutoff: float = float('inf'),
    offset: int = 0
) -> dict:
    if subcorpus.size == 0:
        return {'matches': []}
    first_match = max(0, offset)
    last_match = min((offset + cutoff - 1), (subcorpus.size - 1))
    match_boundaries = zip(
        range(first_match, last_match + 1),
        subcorpus.dump(subcorpus.fields['match'], first_match, last_match),
        subcorpus.dump(subcorpus.fields['matchend'], first_match, last_match)
    )
    cpos_set = set()
    matches = []
    for match_num, match_start, match_end in match_boundaries:
        c = (match_start, match_end)
        if match_start == 0 or context == 0:
            lc = None
            cpos_list_lbound = match_start
        else:
            lc_lbound = max(0, (match_start - context))
            lc_rbound = match_start - 1
            lc = (lc_lbound, lc_rbound)
            cpos_list_lbound = lc_lbound
        if match_end == (subcorpus.collection.corpus.size - 1) or context == 0:
            rc = None
            cpos_list_rbound = match_end
        else:
            rc_lbound = match_end + 1
            rc_rbound = min(
                (match_end + context),
                (subcorpus.collection.corpus.size - 1)
            )
            rc = (rc_lbound, rc_rbound)
            cpos_list_rbound = rc_rbound
        match = {'num': match_num, 'lc': lc, 'c': c, 'rc': rc}
        matches.append(match)
        cpos_set.update(range(cpos_list_lbound, cpos_list_rbound + 1))
    lookups = lookups_by_cpos(subcorpus.collection.corpus, list(cpos_set))
    return {'matches': matches, **lookups}