mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-04 12:22:47 +00:00 
			
		
		
		
	Move Socket.IO Namespaces to dedicated directory
This commit is contained in:
		
							
								
								
									
										223
									
								
								app/namespaces/cqi_over_sio/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										223
									
								
								app/namespaces/cqi_over_sio/__init__.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,223 @@
 | 
			
		||||
from cqi import CQiClient
 | 
			
		||||
from cqi.errors import CQiException
 | 
			
		||||
from cqi.status import CQiStatus
 | 
			
		||||
from flask import current_app
 | 
			
		||||
from flask_login import current_user
 | 
			
		||||
from flask_socketio import Namespace
 | 
			
		||||
from inspect import signature
 | 
			
		||||
from threading import Lock
 | 
			
		||||
from app import db, docker_client, hashids, socketio
 | 
			
		||||
from app.decorators import socketio_login_required
 | 
			
		||||
from app.models import Corpus, CorpusStatus
 | 
			
		||||
from . import extensions
 | 
			
		||||
from .utils import CQiOverSocketIOSessionManager
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
'''
 | 
			
		||||
This package tunnels the Corpus Query interface (CQi) protocol through
 | 
			
		||||
Socket.IO (SIO) by tunneling CQi API calls through an event called "exec".
 | 
			
		||||
 | 
			
		||||
Basic concept:
 | 
			
		||||
1. A client connects to the namespace.
 | 
			
		||||
2. The client emits the "init" event and provides a corpus id for the corpus
 | 
			
		||||
   that should be analysed in this session.
 | 
			
		||||
     1.1 The analysis session counter of the corpus is incremented.
 | 
			
		||||
     1.2 A CQiClient and a (Mutex) Lock belonging to it is created.
 | 
			
		||||
     1.3 Wait until the CQP server is running.
 | 
			
		||||
     1.4 Connect the CQiClient to the server.
 | 
			
		||||
     1.5 Save the CQiClient, the Lock and the corpus id in the session for
 | 
			
		||||
         subsequential use.
 | 
			
		||||
3. The client emits "exec" events, within which it provides the name of a CQi
 | 
			
		||||
   API function and the corresponding arguments.
 | 
			
		||||
     3.1 The "exec" event handler will execute the function, make sure that
 | 
			
		||||
         the result is serializable and returns the result back to the client.
 | 
			
		||||
4. The client disconnects from the namespace
 | 
			
		||||
     4.1 The analysis session counter of the corpus is decremented.
 | 
			
		||||
     4.2 The CQiClient and (Mutex) Lock belonging to it are teared down.
 | 
			
		||||
'''
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
CQI_API_FUNCTION_NAMES = [
 | 
			
		||||
    'ask_feature_cl_2_3',
 | 
			
		||||
    'ask_feature_cqi_1_0',
 | 
			
		||||
    'ask_feature_cqp_2_3',
 | 
			
		||||
    'cl_alg2cpos',
 | 
			
		||||
    'cl_attribute_size',
 | 
			
		||||
    'cl_cpos2alg',
 | 
			
		||||
    'cl_cpos2id',
 | 
			
		||||
    'cl_cpos2lbound',
 | 
			
		||||
    'cl_cpos2rbound',
 | 
			
		||||
    'cl_cpos2str',
 | 
			
		||||
    'cl_cpos2struc',
 | 
			
		||||
    'cl_drop_attribute',
 | 
			
		||||
    'cl_id2cpos',
 | 
			
		||||
    'cl_id2freq',
 | 
			
		||||
    'cl_id2str',
 | 
			
		||||
    'cl_idlist2cpos',
 | 
			
		||||
    'cl_lexicon_size',
 | 
			
		||||
    'cl_regex2id',
 | 
			
		||||
    'cl_str2id',
 | 
			
		||||
    'cl_struc2cpos',
 | 
			
		||||
    'cl_struc2str',
 | 
			
		||||
    'corpus_alignment_attributes',
 | 
			
		||||
    'corpus_charset',
 | 
			
		||||
    'corpus_drop_corpus',
 | 
			
		||||
    'corpus_full_name',
 | 
			
		||||
    'corpus_info',
 | 
			
		||||
    'corpus_list_corpora',
 | 
			
		||||
    'corpus_positional_attributes',
 | 
			
		||||
    'corpus_properties',
 | 
			
		||||
    'corpus_structural_attribute_has_values',
 | 
			
		||||
    'corpus_structural_attributes',
 | 
			
		||||
    'cqp_drop_subcorpus',
 | 
			
		||||
    'cqp_dump_subcorpus',
 | 
			
		||||
    'cqp_fdist_1',
 | 
			
		||||
    'cqp_fdist_2',
 | 
			
		||||
    'cqp_list_subcorpora',
 | 
			
		||||
    'cqp_query',
 | 
			
		||||
    'cqp_subcorpus_has_field',
 | 
			
		||||
    'cqp_subcorpus_size',
 | 
			
		||||
    'ctrl_bye',
 | 
			
		||||
    'ctrl_connect',
 | 
			
		||||
    'ctrl_last_general_error',
 | 
			
		||||
    'ctrl_ping',
 | 
			
		||||
    'ctrl_user_abort'
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class CQiOverSocketIONamespace(Namespace):
 | 
			
		||||
    @socketio_login_required
 | 
			
		||||
    def on_connect(self):
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
    @socketio_login_required
 | 
			
		||||
    def on_init(self, corpus_hashid: str) -> dict:
 | 
			
		||||
        corpus_id = hashids.decode(corpus_hashid)
 | 
			
		||||
 | 
			
		||||
        if not isinstance(corpus_id, int):
 | 
			
		||||
            return {'code': 400, 'msg': 'Bad Request'}
 | 
			
		||||
 | 
			
		||||
        corpus = Corpus.query.get(corpus_id)
 | 
			
		||||
 | 
			
		||||
        if corpus is None:
 | 
			
		||||
            return {'code': 404, 'msg': 'Not Found'}
 | 
			
		||||
 | 
			
		||||
        if not (
 | 
			
		||||
            corpus.user == current_user
 | 
			
		||||
            or current_user.is_following_corpus(corpus)
 | 
			
		||||
            or current_user.is_administrator
 | 
			
		||||
        ):
 | 
			
		||||
            return {'code': 403, 'msg': 'Forbidden'}
 | 
			
		||||
 | 
			
		||||
        if corpus.status not in [
 | 
			
		||||
            CorpusStatus.BUILT,
 | 
			
		||||
            CorpusStatus.STARTING_ANALYSIS_SESSION,
 | 
			
		||||
            CorpusStatus.RUNNING_ANALYSIS_SESSION,
 | 
			
		||||
            CorpusStatus.CANCELING_ANALYSIS_SESSION
 | 
			
		||||
        ]:
 | 
			
		||||
            return {'code': 424, 'msg': 'Failed Dependency'}
 | 
			
		||||
 | 
			
		||||
        corpus.num_analysis_sessions = Corpus.num_analysis_sessions + 1
 | 
			
		||||
        db.session.commit()
 | 
			
		||||
        retry_counter = 20
 | 
			
		||||
        while corpus.status != CorpusStatus.RUNNING_ANALYSIS_SESSION:
 | 
			
		||||
            if retry_counter == 0:
 | 
			
		||||
                corpus.num_analysis_sessions = Corpus.num_analysis_sessions - 1
 | 
			
		||||
                db.session.commit()
 | 
			
		||||
                return {'code': 408, 'msg': 'Request Timeout'}
 | 
			
		||||
            socketio.sleep(3)
 | 
			
		||||
            retry_counter -= 1
 | 
			
		||||
            db.session.refresh(corpus)
 | 
			
		||||
 | 
			
		||||
        cqpserver_container_name = f'nopaque-cqpserver-{corpus_id}'
 | 
			
		||||
        cqpserver_container = docker_client.containers.get(cqpserver_container_name)
 | 
			
		||||
        cqpserver_ip_address = cqpserver_container.attrs['NetworkSettings']['Networks'][current_app.config['NOPAQUE_DOCKER_NETWORK_NAME']]['IPAddress']
 | 
			
		||||
        cqi_client = CQiClient(cqpserver_ip_address)
 | 
			
		||||
        cqi_client_lock = Lock()
 | 
			
		||||
 | 
			
		||||
        CQiOverSocketIOSessionManager.setup()
 | 
			
		||||
        CQiOverSocketIOSessionManager.set_corpus_id(corpus_id)
 | 
			
		||||
        CQiOverSocketIOSessionManager.set_cqi_client(cqi_client)
 | 
			
		||||
        CQiOverSocketIOSessionManager.set_cqi_client_lock(cqi_client_lock)
 | 
			
		||||
 | 
			
		||||
        return {'code': 200, 'msg': 'OK'}
 | 
			
		||||
 | 
			
		||||
    @socketio_login_required
 | 
			
		||||
    def on_exec(self, fn_name: str, fn_args: dict = {}) -> dict:
 | 
			
		||||
        try:
 | 
			
		||||
            cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
 | 
			
		||||
            cqi_client_lock = CQiOverSocketIOSessionManager.get_cqi_client_lock()
 | 
			
		||||
        except KeyError:
 | 
			
		||||
            return {'code': 424, 'msg': 'Failed Dependency'}
 | 
			
		||||
 | 
			
		||||
        if fn_name in CQI_API_FUNCTION_NAMES:
 | 
			
		||||
            fn = getattr(cqi_client.api, fn_name)
 | 
			
		||||
        elif fn_name in extensions.CQI_EXTENSION_FUNCTION_NAMES:
 | 
			
		||||
            fn = getattr(extensions, fn_name)
 | 
			
		||||
        else:
 | 
			
		||||
            return {'code': 400, 'msg': 'Bad Request'}
 | 
			
		||||
 | 
			
		||||
        for param in signature(fn).parameters.values():
 | 
			
		||||
            # Check if the parameter is optional or required
 | 
			
		||||
            if param.default is param.empty:
 | 
			
		||||
                if param.name not in fn_args:
 | 
			
		||||
                    return {'code': 400, 'msg': 'Bad Request'}
 | 
			
		||||
            else:
 | 
			
		||||
                if param.name not in fn_args:
 | 
			
		||||
                    continue
 | 
			
		||||
            if type(fn_args[param.name]) is not param.annotation:
 | 
			
		||||
                return {'code': 400, 'msg': 'Bad Request'}
 | 
			
		||||
 | 
			
		||||
        cqi_client_lock.acquire()
 | 
			
		||||
        try:
 | 
			
		||||
            fn_return_value = fn(**fn_args)
 | 
			
		||||
        except BrokenPipeError as e:
 | 
			
		||||
            return {'code': 500, 'msg': 'Internal Server Error'}
 | 
			
		||||
        except CQiException as e:
 | 
			
		||||
            return {
 | 
			
		||||
                'code': 502,
 | 
			
		||||
                'msg': 'Bad Gateway',
 | 
			
		||||
                'payload': {
 | 
			
		||||
                    'code': e.code,
 | 
			
		||||
                    'desc': e.description,
 | 
			
		||||
                    'msg': e.__class__.__name__
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        finally:
 | 
			
		||||
            cqi_client_lock.release()
 | 
			
		||||
 | 
			
		||||
        if isinstance(fn_return_value, CQiStatus):
 | 
			
		||||
            payload = {
 | 
			
		||||
                'code': fn_return_value.code,
 | 
			
		||||
                'msg': fn_return_value.__class__.__name__
 | 
			
		||||
            }
 | 
			
		||||
        else:
 | 
			
		||||
            payload = fn_return_value
 | 
			
		||||
 | 
			
		||||
        return {'code': 200, 'msg': 'OK', 'payload': payload}
 | 
			
		||||
 | 
			
		||||
    def on_disconnect(self):
 | 
			
		||||
        try:
 | 
			
		||||
            corpus_id = CQiOverSocketIOSessionManager.get_corpus_id()
 | 
			
		||||
            cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
 | 
			
		||||
            cqi_client_lock = CQiOverSocketIOSessionManager.get_cqi_client_lock()
 | 
			
		||||
            CQiOverSocketIOSessionManager.teardown()
 | 
			
		||||
        except KeyError:
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
        cqi_client_lock.acquire()
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            cqi_client.api.ctrl_bye()
 | 
			
		||||
        except (BrokenPipeError, CQiException):
 | 
			
		||||
            pass
 | 
			
		||||
 | 
			
		||||
        cqi_client_lock.release()
 | 
			
		||||
 | 
			
		||||
        corpus = Corpus.query.get(corpus_id)
 | 
			
		||||
 | 
			
		||||
        if corpus is None:
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
        corpus.num_analysis_sessions = Corpus.num_analysis_sessions - 1
 | 
			
		||||
        db.session.commit()
 | 
			
		||||
							
								
								
									
										406
									
								
								app/namespaces/cqi_over_sio/extensions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										406
									
								
								app/namespaces/cqi_over_sio/extensions.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,406 @@
 | 
			
		||||
from collections import Counter
 | 
			
		||||
from cqi.models.corpora import Corpus as CQiCorpus
 | 
			
		||||
from cqi.models.subcorpora import Subcorpus as CQiSubcorpus
 | 
			
		||||
from cqi.status import StatusOk as CQiStatusOk
 | 
			
		||||
from flask import current_app
 | 
			
		||||
import gzip
 | 
			
		||||
import json
 | 
			
		||||
import math
 | 
			
		||||
from app import db
 | 
			
		||||
from app.models import Corpus
 | 
			
		||||
from .utils import CQiOverSocketIOSessionManager
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
CQI_EXTENSION_FUNCTION_NAMES = [
 | 
			
		||||
    'ext_corpus_update_db',
 | 
			
		||||
    'ext_corpus_static_data',
 | 
			
		||||
    'ext_corpus_paginate_corpus',
 | 
			
		||||
    'ext_cqp_paginate_subcorpus',
 | 
			
		||||
    'ext_cqp_partial_export_subcorpus',
 | 
			
		||||
    'ext_cqp_export_subcorpus',
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def ext_corpus_update_db(corpus: str) -> CQiStatusOk:
 | 
			
		||||
    corpus_id = CQiOverSocketIOSessionManager.get_corpus_id()
 | 
			
		||||
    cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
 | 
			
		||||
    db_corpus = Corpus.query.get(corpus_id)
 | 
			
		||||
    cqi_corpus = cqi_client.corpora.get(corpus)
 | 
			
		||||
    db_corpus.num_tokens = cqi_corpus.size
 | 
			
		||||
    db.session.commit()
 | 
			
		||||
    return CQiStatusOk()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def ext_corpus_static_data(corpus: str) -> dict:
 | 
			
		||||
    corpus_id = CQiOverSocketIOSessionManager.get_corpus_id()
 | 
			
		||||
    db_corpus = Corpus.query.get(corpus_id)
 | 
			
		||||
 | 
			
		||||
    static_data_file_path = db_corpus.path / 'cwb' / 'static.json.gz'
 | 
			
		||||
    if static_data_file_path.exists():
 | 
			
		||||
        with static_data_file_path.open('rb') as f:
 | 
			
		||||
            return f.read()
 | 
			
		||||
 | 
			
		||||
    cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
 | 
			
		||||
    cqi_corpus = cqi_client.corpora.get(corpus)
 | 
			
		||||
    cqi_p_attrs = cqi_corpus.positional_attributes.list()
 | 
			
		||||
    cqi_s_attrs = cqi_corpus.structural_attributes.list()
 | 
			
		||||
 | 
			
		||||
    static_data = {
 | 
			
		||||
        'corpus': {
 | 
			
		||||
            'bounds': [0, cqi_corpus.size - 1],
 | 
			
		||||
            'freqs': {}
 | 
			
		||||
        },
 | 
			
		||||
        'p_attrs': {},
 | 
			
		||||
        's_attrs': {},
 | 
			
		||||
        'values': {'p_attrs': {}, 's_attrs': {}}
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    for p_attr in cqi_p_attrs:
 | 
			
		||||
        current_app.logger.info(f'corpus.freqs.{p_attr.name}')
 | 
			
		||||
        static_data['corpus']['freqs'][p_attr.name] = []
 | 
			
		||||
        p_attr_id_list = list(range(p_attr.lexicon_size))
 | 
			
		||||
        static_data['corpus']['freqs'][p_attr.name].extend(p_attr.freqs_by_ids(p_attr_id_list))
 | 
			
		||||
        del p_attr_id_list
 | 
			
		||||
 | 
			
		||||
        current_app.logger.info(f'p_attrs.{p_attr.name}')
 | 
			
		||||
        static_data['p_attrs'][p_attr.name] = []
 | 
			
		||||
        cpos_list = list(range(cqi_corpus.size))
 | 
			
		||||
        static_data['p_attrs'][p_attr.name].extend(p_attr.ids_by_cpos(cpos_list))
 | 
			
		||||
        del cpos_list
 | 
			
		||||
 | 
			
		||||
        current_app.logger.info(f'values.p_attrs.{p_attr.name}')
 | 
			
		||||
        static_data['values']['p_attrs'][p_attr.name] = []
 | 
			
		||||
        p_attr_id_list = list(range(p_attr.lexicon_size))
 | 
			
		||||
        static_data['values']['p_attrs'][p_attr.name].extend(p_attr.values_by_ids(p_attr_id_list))
 | 
			
		||||
        del p_attr_id_list
 | 
			
		||||
 | 
			
		||||
    for s_attr in cqi_s_attrs:
 | 
			
		||||
        if s_attr.has_values:
 | 
			
		||||
            continue
 | 
			
		||||
 | 
			
		||||
        static_data['s_attrs'][s_attr.name] = {'lexicon': [], 'values': None}
 | 
			
		||||
 | 
			
		||||
        if s_attr.name in ['s', 'ent']:
 | 
			
		||||
            ##############################################################
 | 
			
		||||
            # A faster way to get cpos boundaries for smaller s_attrs    #
 | 
			
		||||
            # Note: Needs more testing, don't use it in production       #
 | 
			
		||||
            ##############################################################
 | 
			
		||||
            cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
 | 
			
		||||
            cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
 | 
			
		||||
            first_match = 0
 | 
			
		||||
            last_match = cqi_subcorpus.size - 1
 | 
			
		||||
            match_boundaries = zip(
 | 
			
		||||
                range(first_match, last_match + 1),
 | 
			
		||||
                cqi_subcorpus.dump(
 | 
			
		||||
                    cqi_subcorpus.fields['match'],
 | 
			
		||||
                    first_match,
 | 
			
		||||
                    last_match
 | 
			
		||||
                ),
 | 
			
		||||
                cqi_subcorpus.dump(
 | 
			
		||||
                    cqi_subcorpus.fields['matchend'],
 | 
			
		||||
                    first_match,
 | 
			
		||||
                    last_match
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
            cqi_subcorpus.drop()
 | 
			
		||||
            del cqi_subcorpus, first_match, last_match
 | 
			
		||||
            for id, lbound, rbound in match_boundaries:
 | 
			
		||||
                static_data['s_attrs'][s_attr.name]['lexicon'].append({})
 | 
			
		||||
                current_app.logger.info(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
 | 
			
		||||
                static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
 | 
			
		||||
            del match_boundaries
 | 
			
		||||
 | 
			
		||||
        if s_attr.name != 'text':
 | 
			
		||||
            continue
 | 
			
		||||
 | 
			
		||||
        for id in range(0, s_attr.size):
 | 
			
		||||
            static_data['s_attrs'][s_attr.name]['lexicon'].append({})
 | 
			
		||||
            # This is a very slow operation, thats why we only use it for
 | 
			
		||||
            # the text attribute
 | 
			
		||||
            lbound, rbound = s_attr.cpos_by_id(id)
 | 
			
		||||
            current_app.logger.info(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
 | 
			
		||||
            static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
 | 
			
		||||
            static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
 | 
			
		||||
            cpos_list = list(range(lbound, rbound + 1))
 | 
			
		||||
            for p_attr in cqi_p_attrs:
 | 
			
		||||
                p_attr_ids = []
 | 
			
		||||
                p_attr_ids.extend(p_attr.ids_by_cpos(cpos_list))
 | 
			
		||||
                current_app.logger.info(f's_attrs.{s_attr.name}.lexicon.{id}.freqs.{p_attr.name}')
 | 
			
		||||
                static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
 | 
			
		||||
                del p_attr_ids
 | 
			
		||||
            del cpos_list
 | 
			
		||||
 | 
			
		||||
        sub_s_attrs = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
 | 
			
		||||
        current_app.logger.info(f's_attrs.{s_attr.name}.values')
 | 
			
		||||
        static_data['s_attrs'][s_attr.name]['values'] = [
 | 
			
		||||
            sub_s_attr.name[(len(s_attr.name) + 1):]
 | 
			
		||||
            for sub_s_attr in sub_s_attrs
 | 
			
		||||
        ]
 | 
			
		||||
        s_attr_id_list = list(range(s_attr.size))
 | 
			
		||||
        sub_s_attr_values = []
 | 
			
		||||
        for sub_s_attr in sub_s_attrs:
 | 
			
		||||
            tmp = []
 | 
			
		||||
            tmp.extend(sub_s_attr.values_by_ids(s_attr_id_list))
 | 
			
		||||
            sub_s_attr_values.append(tmp)
 | 
			
		||||
            del tmp
 | 
			
		||||
        del s_attr_id_list
 | 
			
		||||
        current_app.logger.info(f'values.s_attrs.{s_attr.name}')
 | 
			
		||||
        static_data['values']['s_attrs'][s_attr.name] = [
 | 
			
		||||
            {
 | 
			
		||||
                s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id]
 | 
			
		||||
                for s_attr_value_name_idx, s_attr_value_name in enumerate(
 | 
			
		||||
                    static_data['s_attrs'][s_attr.name]['values']
 | 
			
		||||
                )
 | 
			
		||||
            } for s_attr_id in range(0, s_attr.size)
 | 
			
		||||
        ]
 | 
			
		||||
        del sub_s_attr_values
 | 
			
		||||
    current_app.logger.info('Saving static data to file')
 | 
			
		||||
    with gzip.open(static_data_file_path, 'wt') as f:
 | 
			
		||||
        json.dump(static_data, f)
 | 
			
		||||
    del static_data
 | 
			
		||||
    current_app.logger.info('Sending static data to client')
 | 
			
		||||
    with open(static_data_file_path, 'rb') as f:
 | 
			
		||||
        return f.read()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def ext_corpus_paginate_corpus(
 | 
			
		||||
    corpus: str,
 | 
			
		||||
    page: int = 1,
 | 
			
		||||
    per_page: int = 20
 | 
			
		||||
) -> dict:
 | 
			
		||||
    cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
 | 
			
		||||
    cqi_corpus = cqi_client.corpora.get(corpus)
 | 
			
		||||
    # Sanity checks
 | 
			
		||||
    if (
 | 
			
		||||
        per_page < 1
 | 
			
		||||
        or page < 1
 | 
			
		||||
        or (
 | 
			
		||||
            cqi_corpus.size > 0
 | 
			
		||||
            and page > math.ceil(cqi_corpus.size / per_page)
 | 
			
		||||
        )
 | 
			
		||||
    ):
 | 
			
		||||
        return {'code': 416, 'msg': 'Range Not Satisfiable'}
 | 
			
		||||
    first_cpos = (page - 1) * per_page
 | 
			
		||||
    last_cpos = min(cqi_corpus.size, first_cpos + per_page)
 | 
			
		||||
    cpos_list = [*range(first_cpos, last_cpos)]
 | 
			
		||||
    lookups = _lookups_by_cpos(cqi_corpus, cpos_list)
 | 
			
		||||
    payload = {}
 | 
			
		||||
    # the items for the current page
 | 
			
		||||
    payload['items'] = [cpos_list]
 | 
			
		||||
    # the lookups for the items
 | 
			
		||||
    payload['lookups'] = lookups
 | 
			
		||||
    # the total number of items matching the query
 | 
			
		||||
    payload['total'] = cqi_corpus.size
 | 
			
		||||
    # the number of items to be displayed on a page.
 | 
			
		||||
    payload['per_page'] = per_page
 | 
			
		||||
    # The total number of pages
 | 
			
		||||
    payload['pages'] = math.ceil(payload['total'] / payload['per_page'])
 | 
			
		||||
    # the current page number (1 indexed)
 | 
			
		||||
    payload['page'] = page if payload['pages'] > 0 else None
 | 
			
		||||
    # True if a previous page exists
 | 
			
		||||
    payload['has_prev'] = payload['page'] > 1 if payload['page'] else False
 | 
			
		||||
    # True if a next page exists.
 | 
			
		||||
    payload['has_next'] = payload['page'] < payload['pages'] if payload['page'] else False  # noqa
 | 
			
		||||
    # Number of the previous page.
 | 
			
		||||
    payload['prev_num'] = payload['page'] - 1 if payload['has_prev'] else None
 | 
			
		||||
    # Number of the next page
 | 
			
		||||
    payload['next_num'] = payload['page'] + 1 if payload['has_next'] else None
 | 
			
		||||
    return payload
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def ext_cqp_paginate_subcorpus(
 | 
			
		||||
    subcorpus: str,
 | 
			
		||||
    context: int = 50,
 | 
			
		||||
    page: int = 1,
 | 
			
		||||
    per_page: int = 20
 | 
			
		||||
) -> dict:
 | 
			
		||||
    corpus_name, subcorpus_name = subcorpus.split(':', 1)
 | 
			
		||||
    cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
 | 
			
		||||
    cqi_corpus = cqi_client.corpora.get(corpus_name)
 | 
			
		||||
    cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
 | 
			
		||||
    # Sanity checks
 | 
			
		||||
    if (
 | 
			
		||||
        per_page < 1
 | 
			
		||||
        or page < 1
 | 
			
		||||
        or (
 | 
			
		||||
            cqi_subcorpus.size > 0
 | 
			
		||||
            and page > math.ceil(cqi_subcorpus.size / per_page)
 | 
			
		||||
        )
 | 
			
		||||
    ):
 | 
			
		||||
        return {'code': 416, 'msg': 'Range Not Satisfiable'}
 | 
			
		||||
    offset = (page - 1) * per_page
 | 
			
		||||
    cutoff = per_page
 | 
			
		||||
    cqi_results_export = _export_subcorpus(
 | 
			
		||||
        cqi_subcorpus, context=context, cutoff=cutoff, offset=offset)
 | 
			
		||||
    payload = {}
 | 
			
		||||
    # the items for the current page
 | 
			
		||||
    payload['items'] = cqi_results_export.pop('matches')
 | 
			
		||||
    # the lookups for the items
 | 
			
		||||
    payload['lookups'] = cqi_results_export
 | 
			
		||||
    # the total number of items matching the query
 | 
			
		||||
    payload['total'] = cqi_subcorpus.size
 | 
			
		||||
    # the number of items to be displayed on a page.
 | 
			
		||||
    payload['per_page'] = per_page
 | 
			
		||||
    # The total number of pages
 | 
			
		||||
    payload['pages'] = math.ceil(payload['total'] / payload['per_page'])
 | 
			
		||||
    # the current page number (1 indexed)
 | 
			
		||||
    payload['page'] = page if payload['pages'] > 0 else None
 | 
			
		||||
    # True if a previous page exists
 | 
			
		||||
    payload['has_prev'] = payload['page'] > 1 if payload['page'] else False
 | 
			
		||||
    # True if a next page exists.
 | 
			
		||||
    payload['has_next'] = payload['page'] < payload['pages'] if payload['page'] else False  # noqa
 | 
			
		||||
    # Number of the previous page.
 | 
			
		||||
    payload['prev_num'] = payload['page'] - 1 if payload['has_prev'] else None
 | 
			
		||||
    # Number of the next page
 | 
			
		||||
    payload['next_num'] = payload['page'] + 1 if payload['has_next'] else None
 | 
			
		||||
    return payload
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def ext_cqp_partial_export_subcorpus(
 | 
			
		||||
    subcorpus: str,
 | 
			
		||||
    match_id_list: list,
 | 
			
		||||
    context: int = 50
 | 
			
		||||
) -> dict:
 | 
			
		||||
    corpus_name, subcorpus_name = subcorpus.split(':', 1)
 | 
			
		||||
    cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
 | 
			
		||||
    cqi_corpus = cqi_client.corpora.get(corpus_name)
 | 
			
		||||
    cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
 | 
			
		||||
    cqi_subcorpus_partial_export = _partial_export_subcorpus(cqi_subcorpus, match_id_list, context=context)
 | 
			
		||||
    return cqi_subcorpus_partial_export
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def ext_cqp_export_subcorpus(subcorpus: str, context: int = 50) -> dict:
 | 
			
		||||
    corpus_name, subcorpus_name = subcorpus.split(':', 1)
 | 
			
		||||
    cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
 | 
			
		||||
    cqi_corpus = cqi_client.corpora.get(corpus_name)
 | 
			
		||||
    cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
 | 
			
		||||
    cqi_subcorpus_export = _export_subcorpus(cqi_subcorpus, context=context)
 | 
			
		||||
    return cqi_subcorpus_export
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _lookups_by_cpos(corpus: CQiCorpus, cpos_list: list[int]) -> dict:
 | 
			
		||||
    lookups = {}
 | 
			
		||||
    lookups['cpos_lookup'] = {cpos: {} for cpos in cpos_list}
 | 
			
		||||
    for attr in corpus.positional_attributes.list():
 | 
			
		||||
        cpos_attr_values = attr.values_by_cpos(cpos_list)
 | 
			
		||||
        for i, cpos in enumerate(cpos_list):
 | 
			
		||||
            lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_values[i]
 | 
			
		||||
    for attr in corpus.structural_attributes.list():
 | 
			
		||||
        # We only want to iterate over non subattributes, identifiable by
 | 
			
		||||
        # attr.has_values == False
 | 
			
		||||
        if attr.has_values:
 | 
			
		||||
            continue
 | 
			
		||||
        cpos_attr_ids = attr.ids_by_cpos(cpos_list)
 | 
			
		||||
        for i, cpos in enumerate(cpos_list):
 | 
			
		||||
            if cpos_attr_ids[i] == -1:
 | 
			
		||||
                continue
 | 
			
		||||
            lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_ids[i]
 | 
			
		||||
        occured_attr_ids = [x for x in set(cpos_attr_ids) if x != -1]
 | 
			
		||||
        if len(occured_attr_ids) == 0:
 | 
			
		||||
            continue
 | 
			
		||||
        subattrs = corpus.structural_attributes.list(filters={'part_of': attr})
 | 
			
		||||
        if len(subattrs) == 0:
 | 
			
		||||
            continue
 | 
			
		||||
        lookup_name = f'{attr.name}_lookup'
 | 
			
		||||
        lookups[lookup_name] = {}
 | 
			
		||||
        for attr_id in occured_attr_ids:
 | 
			
		||||
            lookups[lookup_name][attr_id] = {}
 | 
			
		||||
        for subattr in subattrs:
 | 
			
		||||
            subattr_name = subattr.name[(len(attr.name) + 1):]  # noqa
 | 
			
		||||
            for i, subattr_value in enumerate(subattr.values_by_ids(occured_attr_ids)):  # noqa
 | 
			
		||||
                lookups[lookup_name][occured_attr_ids[i]][subattr_name] = subattr_value  # noqa
 | 
			
		||||
    return lookups
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _partial_export_subcorpus(
 | 
			
		||||
    subcorpus: CQiSubcorpus,
 | 
			
		||||
    match_id_list: list[int],
 | 
			
		||||
    context: int = 25
 | 
			
		||||
) -> dict:
 | 
			
		||||
    if subcorpus.size == 0:
 | 
			
		||||
        return {'matches': []}
 | 
			
		||||
    match_boundaries = []
 | 
			
		||||
    for match_id in match_id_list:
 | 
			
		||||
        if match_id < 0 or match_id >= subcorpus.size:
 | 
			
		||||
            continue
 | 
			
		||||
        match_boundaries.append(
 | 
			
		||||
            (
 | 
			
		||||
                match_id,
 | 
			
		||||
                subcorpus.dump(subcorpus.fields['match'], match_id, match_id)[0],
 | 
			
		||||
                subcorpus.dump(subcorpus.fields['matchend'], match_id, match_id)[0]
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
    cpos_set = set()
 | 
			
		||||
    matches = []
 | 
			
		||||
    for match_boundary in match_boundaries:
 | 
			
		||||
        match_num, match_start, match_end = match_boundary
 | 
			
		||||
        c = (match_start, match_end)
 | 
			
		||||
        if match_start == 0 or context == 0:
 | 
			
		||||
            lc = None
 | 
			
		||||
            cpos_list_lbound = match_start
 | 
			
		||||
        else:
 | 
			
		||||
            lc_lbound = max(0, (match_start - context))
 | 
			
		||||
            lc_rbound = match_start - 1
 | 
			
		||||
            lc = (lc_lbound, lc_rbound)
 | 
			
		||||
            cpos_list_lbound = lc_lbound
 | 
			
		||||
        if match_end == (subcorpus.collection.corpus.size - 1) or context == 0:
 | 
			
		||||
            rc = None
 | 
			
		||||
            cpos_list_rbound = match_end
 | 
			
		||||
        else:
 | 
			
		||||
            rc_lbound = match_end + 1
 | 
			
		||||
            rc_rbound = min(
 | 
			
		||||
                (match_end + context),
 | 
			
		||||
                (subcorpus.collection.corpus.size - 1)
 | 
			
		||||
            )
 | 
			
		||||
            rc = (rc_lbound, rc_rbound)
 | 
			
		||||
            cpos_list_rbound = rc_rbound
 | 
			
		||||
        match = {'num': match_num, 'lc': lc, 'c': c, 'rc': rc}
 | 
			
		||||
        matches.append(match)
 | 
			
		||||
        cpos_set.update(range(cpos_list_lbound, cpos_list_rbound + 1))
 | 
			
		||||
    lookups = _lookups_by_cpos(subcorpus.collection.corpus, list(cpos_set))
 | 
			
		||||
    return {'matches': matches, **lookups}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _export_subcorpus(
 | 
			
		||||
    subcorpus: CQiSubcorpus,
 | 
			
		||||
    context: int = 25,
 | 
			
		||||
    cutoff: float = float('inf'),
 | 
			
		||||
    offset: int = 0
 | 
			
		||||
) -> dict:
 | 
			
		||||
    if subcorpus.size == 0:
 | 
			
		||||
        return {'matches': []}
 | 
			
		||||
    first_match = max(0, offset)
 | 
			
		||||
    last_match = min((offset + cutoff - 1), (subcorpus.size - 1))
 | 
			
		||||
    match_boundaries = zip(
 | 
			
		||||
        range(first_match, last_match + 1),
 | 
			
		||||
        subcorpus.dump(subcorpus.fields['match'], first_match, last_match),
 | 
			
		||||
        subcorpus.dump(subcorpus.fields['matchend'], first_match, last_match)
 | 
			
		||||
    )
 | 
			
		||||
    cpos_set = set()
 | 
			
		||||
    matches = []
 | 
			
		||||
    for match_num, match_start, match_end in match_boundaries:
 | 
			
		||||
        c = (match_start, match_end)
 | 
			
		||||
        if match_start == 0 or context == 0:
 | 
			
		||||
            lc = None
 | 
			
		||||
            cpos_list_lbound = match_start
 | 
			
		||||
        else:
 | 
			
		||||
            lc_lbound = max(0, (match_start - context))
 | 
			
		||||
            lc_rbound = match_start - 1
 | 
			
		||||
            lc = (lc_lbound, lc_rbound)
 | 
			
		||||
            cpos_list_lbound = lc_lbound
 | 
			
		||||
        if match_end == (subcorpus.collection.corpus.size - 1) or context == 0:
 | 
			
		||||
            rc = None
 | 
			
		||||
            cpos_list_rbound = match_end
 | 
			
		||||
        else:
 | 
			
		||||
            rc_lbound = match_end + 1
 | 
			
		||||
            rc_rbound = min(
 | 
			
		||||
                (match_end + context),
 | 
			
		||||
                (subcorpus.collection.corpus.size - 1)
 | 
			
		||||
            )
 | 
			
		||||
            rc = (rc_lbound, rc_rbound)
 | 
			
		||||
            cpos_list_rbound = rc_rbound
 | 
			
		||||
        match = {'num': match_num, 'lc': lc, 'c': c, 'rc': rc}
 | 
			
		||||
        matches.append(match)
 | 
			
		||||
        cpos_set.update(range(cpos_list_lbound, cpos_list_rbound + 1))
 | 
			
		||||
    lookups = _lookups_by_cpos(subcorpus.collection.corpus, list(cpos_set))
 | 
			
		||||
    return {'matches': matches, **lookups}
 | 
			
		||||
							
								
								
									
										37
									
								
								app/namespaces/cqi_over_sio/utils.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								app/namespaces/cqi_over_sio/utils.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,37 @@
 | 
			
		||||
from cqi import CQiClient
 | 
			
		||||
from threading import Lock
 | 
			
		||||
from flask import session
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class CQiOverSocketIOSessionManager:
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def setup():
 | 
			
		||||
        session['cqi_over_sio'] = {}
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def teardown():
 | 
			
		||||
        session.pop('cqi_over_sio')
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def set_corpus_id(corpus_id: int):
 | 
			
		||||
        session['cqi_over_sio']['corpus_id'] = corpus_id
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def get_corpus_id() -> int:
 | 
			
		||||
        return session['cqi_over_sio']['corpus_id']
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def set_cqi_client(cqi_client: CQiClient):
 | 
			
		||||
        session['cqi_over_sio']['cqi_client'] = cqi_client
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def get_cqi_client() -> CQiClient:
 | 
			
		||||
        return session['cqi_over_sio']['cqi_client']
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def set_cqi_client_lock(cqi_client_lock: Lock):
 | 
			
		||||
        session['cqi_over_sio']['cqi_client_lock'] = cqi_client_lock
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def get_cqi_client_lock() -> Lock:
 | 
			
		||||
        return session['cqi_over_sio']['cqi_client_lock']
 | 
			
		||||
		Reference in New Issue
	
	Block a user