Move Socket.IO Namespaces to dedicated directory

This commit is contained in:
Patrick Jentsch
2024-11-06 12:27:49 +01:00
parent fc2ace4b9e
commit 2a28f19660
5 changed files with 273 additions and 229 deletions

View File

@ -0,0 +1,223 @@
from cqi import CQiClient
from cqi.errors import CQiException
from cqi.status import CQiStatus
from flask import current_app
from flask_login import current_user
from flask_socketio import Namespace
from inspect import signature
from threading import Lock
from app import db, docker_client, hashids, socketio
from app.decorators import socketio_login_required
from app.models import Corpus, CorpusStatus
from . import extensions
from .utils import CQiOverSocketIOSessionManager
'''
This package tunnels the Corpus Query interface (CQi) protocol through
Socket.IO (SIO) by tunneling CQi API calls through an event called "exec".
Basic concept:
1. A client connects to the namespace.
2. The client emits the "init" event and provides a corpus id for the corpus
that should be analysed in this session.
1.1 The analysis session counter of the corpus is incremented.
1.2 A CQiClient and a (Mutex) Lock belonging to it is created.
1.3 Wait until the CQP server is running.
1.4 Connect the CQiClient to the server.
1.5 Save the CQiClient, the Lock and the corpus id in the session for
subsequential use.
3. The client emits "exec" events, within which it provides the name of a CQi
API function and the corresponding arguments.
3.1 The "exec" event handler will execute the function, make sure that
the result is serializable and returns the result back to the client.
4. The client disconnects from the namespace
4.1 The analysis session counter of the corpus is decremented.
4.2 The CQiClient and (Mutex) Lock belonging to it are teared down.
'''
CQI_API_FUNCTION_NAMES = [
'ask_feature_cl_2_3',
'ask_feature_cqi_1_0',
'ask_feature_cqp_2_3',
'cl_alg2cpos',
'cl_attribute_size',
'cl_cpos2alg',
'cl_cpos2id',
'cl_cpos2lbound',
'cl_cpos2rbound',
'cl_cpos2str',
'cl_cpos2struc',
'cl_drop_attribute',
'cl_id2cpos',
'cl_id2freq',
'cl_id2str',
'cl_idlist2cpos',
'cl_lexicon_size',
'cl_regex2id',
'cl_str2id',
'cl_struc2cpos',
'cl_struc2str',
'corpus_alignment_attributes',
'corpus_charset',
'corpus_drop_corpus',
'corpus_full_name',
'corpus_info',
'corpus_list_corpora',
'corpus_positional_attributes',
'corpus_properties',
'corpus_structural_attribute_has_values',
'corpus_structural_attributes',
'cqp_drop_subcorpus',
'cqp_dump_subcorpus',
'cqp_fdist_1',
'cqp_fdist_2',
'cqp_list_subcorpora',
'cqp_query',
'cqp_subcorpus_has_field',
'cqp_subcorpus_size',
'ctrl_bye',
'ctrl_connect',
'ctrl_last_general_error',
'ctrl_ping',
'ctrl_user_abort'
]
class CQiOverSocketIONamespace(Namespace):
@socketio_login_required
def on_connect(self):
pass
@socketio_login_required
def on_init(self, corpus_hashid: str) -> dict:
corpus_id = hashids.decode(corpus_hashid)
if not isinstance(corpus_id, int):
return {'code': 400, 'msg': 'Bad Request'}
corpus = Corpus.query.get(corpus_id)
if corpus is None:
return {'code': 404, 'msg': 'Not Found'}
if not (
corpus.user == current_user
or current_user.is_following_corpus(corpus)
or current_user.is_administrator
):
return {'code': 403, 'msg': 'Forbidden'}
if corpus.status not in [
CorpusStatus.BUILT,
CorpusStatus.STARTING_ANALYSIS_SESSION,
CorpusStatus.RUNNING_ANALYSIS_SESSION,
CorpusStatus.CANCELING_ANALYSIS_SESSION
]:
return {'code': 424, 'msg': 'Failed Dependency'}
corpus.num_analysis_sessions = Corpus.num_analysis_sessions + 1
db.session.commit()
retry_counter = 20
while corpus.status != CorpusStatus.RUNNING_ANALYSIS_SESSION:
if retry_counter == 0:
corpus.num_analysis_sessions = Corpus.num_analysis_sessions - 1
db.session.commit()
return {'code': 408, 'msg': 'Request Timeout'}
socketio.sleep(3)
retry_counter -= 1
db.session.refresh(corpus)
cqpserver_container_name = f'nopaque-cqpserver-{corpus_id}'
cqpserver_container = docker_client.containers.get(cqpserver_container_name)
cqpserver_ip_address = cqpserver_container.attrs['NetworkSettings']['Networks'][current_app.config['NOPAQUE_DOCKER_NETWORK_NAME']]['IPAddress']
cqi_client = CQiClient(cqpserver_ip_address)
cqi_client_lock = Lock()
CQiOverSocketIOSessionManager.setup()
CQiOverSocketIOSessionManager.set_corpus_id(corpus_id)
CQiOverSocketIOSessionManager.set_cqi_client(cqi_client)
CQiOverSocketIOSessionManager.set_cqi_client_lock(cqi_client_lock)
return {'code': 200, 'msg': 'OK'}
@socketio_login_required
def on_exec(self, fn_name: str, fn_args: dict = {}) -> dict:
try:
cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
cqi_client_lock = CQiOverSocketIOSessionManager.get_cqi_client_lock()
except KeyError:
return {'code': 424, 'msg': 'Failed Dependency'}
if fn_name in CQI_API_FUNCTION_NAMES:
fn = getattr(cqi_client.api, fn_name)
elif fn_name in extensions.CQI_EXTENSION_FUNCTION_NAMES:
fn = getattr(extensions, fn_name)
else:
return {'code': 400, 'msg': 'Bad Request'}
for param in signature(fn).parameters.values():
# Check if the parameter is optional or required
if param.default is param.empty:
if param.name not in fn_args:
return {'code': 400, 'msg': 'Bad Request'}
else:
if param.name not in fn_args:
continue
if type(fn_args[param.name]) is not param.annotation:
return {'code': 400, 'msg': 'Bad Request'}
cqi_client_lock.acquire()
try:
fn_return_value = fn(**fn_args)
except BrokenPipeError as e:
return {'code': 500, 'msg': 'Internal Server Error'}
except CQiException as e:
return {
'code': 502,
'msg': 'Bad Gateway',
'payload': {
'code': e.code,
'desc': e.description,
'msg': e.__class__.__name__
}
}
finally:
cqi_client_lock.release()
if isinstance(fn_return_value, CQiStatus):
payload = {
'code': fn_return_value.code,
'msg': fn_return_value.__class__.__name__
}
else:
payload = fn_return_value
return {'code': 200, 'msg': 'OK', 'payload': payload}
def on_disconnect(self):
try:
corpus_id = CQiOverSocketIOSessionManager.get_corpus_id()
cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
cqi_client_lock = CQiOverSocketIOSessionManager.get_cqi_client_lock()
CQiOverSocketIOSessionManager.teardown()
except KeyError:
return
cqi_client_lock.acquire()
try:
cqi_client.api.ctrl_bye()
except (BrokenPipeError, CQiException):
pass
cqi_client_lock.release()
corpus = Corpus.query.get(corpus_id)
if corpus is None:
return
corpus.num_analysis_sessions = Corpus.num_analysis_sessions - 1
db.session.commit()

View File

@ -0,0 +1,406 @@
from collections import Counter
from cqi.models.corpora import Corpus as CQiCorpus
from cqi.models.subcorpora import Subcorpus as CQiSubcorpus
from cqi.status import StatusOk as CQiStatusOk
from flask import current_app
import gzip
import json
import math
from app import db
from app.models import Corpus
from .utils import CQiOverSocketIOSessionManager
CQI_EXTENSION_FUNCTION_NAMES = [
'ext_corpus_update_db',
'ext_corpus_static_data',
'ext_corpus_paginate_corpus',
'ext_cqp_paginate_subcorpus',
'ext_cqp_partial_export_subcorpus',
'ext_cqp_export_subcorpus',
]
def ext_corpus_update_db(corpus: str) -> CQiStatusOk:
corpus_id = CQiOverSocketIOSessionManager.get_corpus_id()
cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
db_corpus = Corpus.query.get(corpus_id)
cqi_corpus = cqi_client.corpora.get(corpus)
db_corpus.num_tokens = cqi_corpus.size
db.session.commit()
return CQiStatusOk()
def ext_corpus_static_data(corpus: str) -> dict:
corpus_id = CQiOverSocketIOSessionManager.get_corpus_id()
db_corpus = Corpus.query.get(corpus_id)
static_data_file_path = db_corpus.path / 'cwb' / 'static.json.gz'
if static_data_file_path.exists():
with static_data_file_path.open('rb') as f:
return f.read()
cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
cqi_corpus = cqi_client.corpora.get(corpus)
cqi_p_attrs = cqi_corpus.positional_attributes.list()
cqi_s_attrs = cqi_corpus.structural_attributes.list()
static_data = {
'corpus': {
'bounds': [0, cqi_corpus.size - 1],
'freqs': {}
},
'p_attrs': {},
's_attrs': {},
'values': {'p_attrs': {}, 's_attrs': {}}
}
for p_attr in cqi_p_attrs:
current_app.logger.info(f'corpus.freqs.{p_attr.name}')
static_data['corpus']['freqs'][p_attr.name] = []
p_attr_id_list = list(range(p_attr.lexicon_size))
static_data['corpus']['freqs'][p_attr.name].extend(p_attr.freqs_by_ids(p_attr_id_list))
del p_attr_id_list
current_app.logger.info(f'p_attrs.{p_attr.name}')
static_data['p_attrs'][p_attr.name] = []
cpos_list = list(range(cqi_corpus.size))
static_data['p_attrs'][p_attr.name].extend(p_attr.ids_by_cpos(cpos_list))
del cpos_list
current_app.logger.info(f'values.p_attrs.{p_attr.name}')
static_data['values']['p_attrs'][p_attr.name] = []
p_attr_id_list = list(range(p_attr.lexicon_size))
static_data['values']['p_attrs'][p_attr.name].extend(p_attr.values_by_ids(p_attr_id_list))
del p_attr_id_list
for s_attr in cqi_s_attrs:
if s_attr.has_values:
continue
static_data['s_attrs'][s_attr.name] = {'lexicon': [], 'values': None}
if s_attr.name in ['s', 'ent']:
##############################################################
# A faster way to get cpos boundaries for smaller s_attrs #
# Note: Needs more testing, don't use it in production #
##############################################################
cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
first_match = 0
last_match = cqi_subcorpus.size - 1
match_boundaries = zip(
range(first_match, last_match + 1),
cqi_subcorpus.dump(
cqi_subcorpus.fields['match'],
first_match,
last_match
),
cqi_subcorpus.dump(
cqi_subcorpus.fields['matchend'],
first_match,
last_match
)
)
cqi_subcorpus.drop()
del cqi_subcorpus, first_match, last_match
for id, lbound, rbound in match_boundaries:
static_data['s_attrs'][s_attr.name]['lexicon'].append({})
current_app.logger.info(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
del match_boundaries
if s_attr.name != 'text':
continue
for id in range(0, s_attr.size):
static_data['s_attrs'][s_attr.name]['lexicon'].append({})
# This is a very slow operation, thats why we only use it for
# the text attribute
lbound, rbound = s_attr.cpos_by_id(id)
current_app.logger.info(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
static_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
cpos_list = list(range(lbound, rbound + 1))
for p_attr in cqi_p_attrs:
p_attr_ids = []
p_attr_ids.extend(p_attr.ids_by_cpos(cpos_list))
current_app.logger.info(f's_attrs.{s_attr.name}.lexicon.{id}.freqs.{p_attr.name}')
static_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
del p_attr_ids
del cpos_list
sub_s_attrs = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
current_app.logger.info(f's_attrs.{s_attr.name}.values')
static_data['s_attrs'][s_attr.name]['values'] = [
sub_s_attr.name[(len(s_attr.name) + 1):]
for sub_s_attr in sub_s_attrs
]
s_attr_id_list = list(range(s_attr.size))
sub_s_attr_values = []
for sub_s_attr in sub_s_attrs:
tmp = []
tmp.extend(sub_s_attr.values_by_ids(s_attr_id_list))
sub_s_attr_values.append(tmp)
del tmp
del s_attr_id_list
current_app.logger.info(f'values.s_attrs.{s_attr.name}')
static_data['values']['s_attrs'][s_attr.name] = [
{
s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id]
for s_attr_value_name_idx, s_attr_value_name in enumerate(
static_data['s_attrs'][s_attr.name]['values']
)
} for s_attr_id in range(0, s_attr.size)
]
del sub_s_attr_values
current_app.logger.info('Saving static data to file')
with gzip.open(static_data_file_path, 'wt') as f:
json.dump(static_data, f)
del static_data
current_app.logger.info('Sending static data to client')
with open(static_data_file_path, 'rb') as f:
return f.read()
def ext_corpus_paginate_corpus(
corpus: str,
page: int = 1,
per_page: int = 20
) -> dict:
cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
cqi_corpus = cqi_client.corpora.get(corpus)
# Sanity checks
if (
per_page < 1
or page < 1
or (
cqi_corpus.size > 0
and page > math.ceil(cqi_corpus.size / per_page)
)
):
return {'code': 416, 'msg': 'Range Not Satisfiable'}
first_cpos = (page - 1) * per_page
last_cpos = min(cqi_corpus.size, first_cpos + per_page)
cpos_list = [*range(first_cpos, last_cpos)]
lookups = _lookups_by_cpos(cqi_corpus, cpos_list)
payload = {}
# the items for the current page
payload['items'] = [cpos_list]
# the lookups for the items
payload['lookups'] = lookups
# the total number of items matching the query
payload['total'] = cqi_corpus.size
# the number of items to be displayed on a page.
payload['per_page'] = per_page
# The total number of pages
payload['pages'] = math.ceil(payload['total'] / payload['per_page'])
# the current page number (1 indexed)
payload['page'] = page if payload['pages'] > 0 else None
# True if a previous page exists
payload['has_prev'] = payload['page'] > 1 if payload['page'] else False
# True if a next page exists.
payload['has_next'] = payload['page'] < payload['pages'] if payload['page'] else False # noqa
# Number of the previous page.
payload['prev_num'] = payload['page'] - 1 if payload['has_prev'] else None
# Number of the next page
payload['next_num'] = payload['page'] + 1 if payload['has_next'] else None
return payload
def ext_cqp_paginate_subcorpus(
subcorpus: str,
context: int = 50,
page: int = 1,
per_page: int = 20
) -> dict:
corpus_name, subcorpus_name = subcorpus.split(':', 1)
cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
cqi_corpus = cqi_client.corpora.get(corpus_name)
cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
# Sanity checks
if (
per_page < 1
or page < 1
or (
cqi_subcorpus.size > 0
and page > math.ceil(cqi_subcorpus.size / per_page)
)
):
return {'code': 416, 'msg': 'Range Not Satisfiable'}
offset = (page - 1) * per_page
cutoff = per_page
cqi_results_export = _export_subcorpus(
cqi_subcorpus, context=context, cutoff=cutoff, offset=offset)
payload = {}
# the items for the current page
payload['items'] = cqi_results_export.pop('matches')
# the lookups for the items
payload['lookups'] = cqi_results_export
# the total number of items matching the query
payload['total'] = cqi_subcorpus.size
# the number of items to be displayed on a page.
payload['per_page'] = per_page
# The total number of pages
payload['pages'] = math.ceil(payload['total'] / payload['per_page'])
# the current page number (1 indexed)
payload['page'] = page if payload['pages'] > 0 else None
# True if a previous page exists
payload['has_prev'] = payload['page'] > 1 if payload['page'] else False
# True if a next page exists.
payload['has_next'] = payload['page'] < payload['pages'] if payload['page'] else False # noqa
# Number of the previous page.
payload['prev_num'] = payload['page'] - 1 if payload['has_prev'] else None
# Number of the next page
payload['next_num'] = payload['page'] + 1 if payload['has_next'] else None
return payload
def ext_cqp_partial_export_subcorpus(
subcorpus: str,
match_id_list: list,
context: int = 50
) -> dict:
corpus_name, subcorpus_name = subcorpus.split(':', 1)
cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
cqi_corpus = cqi_client.corpora.get(corpus_name)
cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
cqi_subcorpus_partial_export = _partial_export_subcorpus(cqi_subcorpus, match_id_list, context=context)
return cqi_subcorpus_partial_export
def ext_cqp_export_subcorpus(subcorpus: str, context: int = 50) -> dict:
corpus_name, subcorpus_name = subcorpus.split(':', 1)
cqi_client = CQiOverSocketIOSessionManager.get_cqi_client()
cqi_corpus = cqi_client.corpora.get(corpus_name)
cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
cqi_subcorpus_export = _export_subcorpus(cqi_subcorpus, context=context)
return cqi_subcorpus_export
def _lookups_by_cpos(corpus: CQiCorpus, cpos_list: list[int]) -> dict:
lookups = {}
lookups['cpos_lookup'] = {cpos: {} for cpos in cpos_list}
for attr in corpus.positional_attributes.list():
cpos_attr_values = attr.values_by_cpos(cpos_list)
for i, cpos in enumerate(cpos_list):
lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_values[i]
for attr in corpus.structural_attributes.list():
# We only want to iterate over non subattributes, identifiable by
# attr.has_values == False
if attr.has_values:
continue
cpos_attr_ids = attr.ids_by_cpos(cpos_list)
for i, cpos in enumerate(cpos_list):
if cpos_attr_ids[i] == -1:
continue
lookups['cpos_lookup'][cpos][attr.name] = cpos_attr_ids[i]
occured_attr_ids = [x for x in set(cpos_attr_ids) if x != -1]
if len(occured_attr_ids) == 0:
continue
subattrs = corpus.structural_attributes.list(filters={'part_of': attr})
if len(subattrs) == 0:
continue
lookup_name = f'{attr.name}_lookup'
lookups[lookup_name] = {}
for attr_id in occured_attr_ids:
lookups[lookup_name][attr_id] = {}
for subattr in subattrs:
subattr_name = subattr.name[(len(attr.name) + 1):] # noqa
for i, subattr_value in enumerate(subattr.values_by_ids(occured_attr_ids)): # noqa
lookups[lookup_name][occured_attr_ids[i]][subattr_name] = subattr_value # noqa
return lookups
def _partial_export_subcorpus(
subcorpus: CQiSubcorpus,
match_id_list: list[int],
context: int = 25
) -> dict:
if subcorpus.size == 0:
return {'matches': []}
match_boundaries = []
for match_id in match_id_list:
if match_id < 0 or match_id >= subcorpus.size:
continue
match_boundaries.append(
(
match_id,
subcorpus.dump(subcorpus.fields['match'], match_id, match_id)[0],
subcorpus.dump(subcorpus.fields['matchend'], match_id, match_id)[0]
)
)
cpos_set = set()
matches = []
for match_boundary in match_boundaries:
match_num, match_start, match_end = match_boundary
c = (match_start, match_end)
if match_start == 0 or context == 0:
lc = None
cpos_list_lbound = match_start
else:
lc_lbound = max(0, (match_start - context))
lc_rbound = match_start - 1
lc = (lc_lbound, lc_rbound)
cpos_list_lbound = lc_lbound
if match_end == (subcorpus.collection.corpus.size - 1) or context == 0:
rc = None
cpos_list_rbound = match_end
else:
rc_lbound = match_end + 1
rc_rbound = min(
(match_end + context),
(subcorpus.collection.corpus.size - 1)
)
rc = (rc_lbound, rc_rbound)
cpos_list_rbound = rc_rbound
match = {'num': match_num, 'lc': lc, 'c': c, 'rc': rc}
matches.append(match)
cpos_set.update(range(cpos_list_lbound, cpos_list_rbound + 1))
lookups = _lookups_by_cpos(subcorpus.collection.corpus, list(cpos_set))
return {'matches': matches, **lookups}
def _export_subcorpus(
subcorpus: CQiSubcorpus,
context: int = 25,
cutoff: float = float('inf'),
offset: int = 0
) -> dict:
if subcorpus.size == 0:
return {'matches': []}
first_match = max(0, offset)
last_match = min((offset + cutoff - 1), (subcorpus.size - 1))
match_boundaries = zip(
range(first_match, last_match + 1),
subcorpus.dump(subcorpus.fields['match'], first_match, last_match),
subcorpus.dump(subcorpus.fields['matchend'], first_match, last_match)
)
cpos_set = set()
matches = []
for match_num, match_start, match_end in match_boundaries:
c = (match_start, match_end)
if match_start == 0 or context == 0:
lc = None
cpos_list_lbound = match_start
else:
lc_lbound = max(0, (match_start - context))
lc_rbound = match_start - 1
lc = (lc_lbound, lc_rbound)
cpos_list_lbound = lc_lbound
if match_end == (subcorpus.collection.corpus.size - 1) or context == 0:
rc = None
cpos_list_rbound = match_end
else:
rc_lbound = match_end + 1
rc_rbound = min(
(match_end + context),
(subcorpus.collection.corpus.size - 1)
)
rc = (rc_lbound, rc_rbound)
cpos_list_rbound = rc_rbound
match = {'num': match_num, 'lc': lc, 'c': c, 'rc': rc}
matches.append(match)
cpos_set.update(range(cpos_list_lbound, cpos_list_rbound + 1))
lookups = _lookups_by_cpos(subcorpus.collection.corpus, list(cpos_set))
return {'matches': matches, **lookups}

View File

@ -0,0 +1,37 @@
from cqi import CQiClient
from threading import Lock
from flask import session
class CQiOverSocketIOSessionManager:
@staticmethod
def setup():
session['cqi_over_sio'] = {}
@staticmethod
def teardown():
session.pop('cqi_over_sio')
@staticmethod
def set_corpus_id(corpus_id: int):
session['cqi_over_sio']['corpus_id'] = corpus_id
@staticmethod
def get_corpus_id() -> int:
return session['cqi_over_sio']['corpus_id']
@staticmethod
def set_cqi_client(cqi_client: CQiClient):
session['cqi_over_sio']['cqi_client'] = cqi_client
@staticmethod
def get_cqi_client() -> CQiClient:
return session['cqi_over_sio']['cqi_client']
@staticmethod
def set_cqi_client_lock(cqi_client_lock: Lock):
session['cqi_over_sio']['cqi_client_lock'] = cqi_client_lock
@staticmethod
def get_cqi_client_lock() -> Lock:
return session['cqi_over_sio']['cqi_client_lock']