Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
Replace the old js CQiClient with a fully featured new one
@@ -8,6 +8,14 @@ from typing import Callable, Dict, List
from app import socketio
from app.decorators import socketio_login_required
from . import NAMESPACE as ns
from .extensions import (
    corpus_update_db,
    corpus_static_data,
    corpus_paginate_corpus,
    cqp_paginate_subcorpus,
    cqp_partial_export_subcorpus,
    cqp_export_subcorpus,
)


CQI_API_FUNCTIONS: List[str] = [
@@ -56,15 +64,21 @@ CQI_API_FUNCTIONS: List[str] = [
    'ctrl_ping',
    'ctrl_user_abort'
]
CQI_NOPAQUE_FUNCTIONS: Dict[str, Callable] = {
    'nopaque_corpus_update_db': corpus_update_db,
    'nopaque_corpus_static_data': corpus_static_data,
    'nopaque_corpus_paginate_corpus': corpus_paginate_corpus,
    'nopaque_cqp_paginate_subcorpus': cqp_paginate_subcorpus,
    'nopaque_cqp_partial_export_subcorpus': cqp_partial_export_subcorpus,
    'nopaque_cqp_export_subcorpus': cqp_export_subcorpus,
}


@socketio.on('cqi_client.api', namespace=ns)
@socketio.on('cqi', namespace=ns)
@socketio_login_required
def cqi_over_sio(fn_data):
    try:
        fn_name: str = fn_data['fn_name']
        if fn_name not in CQI_API_FUNCTIONS:
            raise KeyError
    except KeyError:
        return {'code': 400, 'msg': 'Bad Request'}
    fn_name: str = fn_data['fn_name']
@@ -74,7 +88,13 @@ def cqi_over_sio(fn_data):
        cqi_client_lock: Lock = session['cqi_over_sio']['cqi_client_lock']
    except KeyError:
        return {'code': 424, 'msg': 'Failed Dependency'}
    fn: Callable = getattr(cqi_client.api, fn_name)
    if fn_name in CQI_API_FUNCTIONS:
        fn: Callable = getattr(cqi_client.api, fn_name)
    elif fn_name in CQI_NOPAQUE_FUNCTIONS:
        fn_args['cqi_client'] = cqi_client
        fn: Callable = CQI_NOPAQUE_FUNCTIONS[fn_name]
    else:
        return {'code': 400, 'msg': 'Bad Request'}
    for param in signature(fn).parameters.values():
        if param.default is param.empty:
            if param.name not in fn_args:
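With the dispatch above, a Socket.IO client reaches both the plain CQi API functions and the new nopaque extension functions through a single 'cqi' event whose payload names the target function and its arguments. Below is a minimal illustrative sketch using the python-socketio client; the server URL and the '/corpora' namespace are placeholders (the real value is the module's NAMESPACE constant), the 'fn_args' key is an assumption based on its use in the handler, and the login required by socketio_login_required is not shown.

import socketio

sio = socketio.Client()
# Placeholder URL and namespace; an authenticated session is also required.
sio.connect('https://nopaque.example.org', namespaces=['/corpora'])

# Name the function and pass its keyword arguments; the handler returns its
# result dict as the Socket.IO acknowledgement.
response = sio.call(
    'cqi',
    {
        'fn_name': 'nopaque_corpus_paginate_corpus',
        'fn_args': {'corpus': 'EXAMPLE_CORPUS', 'page': 1, 'per_page': 20}
    },
    namespace='/corpora'
)
print(response['code'], response['msg'])  # e.g. 200 OK
sio.disconnect()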
							
								
								
									
app/corpora/cqi_over_sio/extensions/__init__.py (new file, +243 lines)
@@ -0,0 +1,243 @@
from collections import Counter
from cqi import CQiClient
from cqi.status import StatusOk
from flask import session
from typing import Dict
import json
import math
import os
from app import db
from app.models import Corpus
from .utils import lookups_by_cpos, export_subcorpus, partial_export_subcorpus


def corpus_update_db(cqi_client: CQiClient, corpus: str):
    db_corpus = Corpus.query.get(session['cqi_over_sio']['corpus_id'])
    cqi_corpus = cqi_client.corpora.get(corpus)
    db_corpus.num_tokens = cqi_corpus.size
    db.session.commit()
    return StatusOk()


def corpus_static_data(cqi_client: CQiClient, corpus: str) -> Dict:
    db_corpus = Corpus.query.get(session['cqi_over_sio']['corpus_id'])
    static_corpus_data_file = os.path.join(db_corpus.path, 'cwb', 'static.json')
    if os.path.exists(static_corpus_data_file):
        with open(static_corpus_data_file, 'r') as f:
            return json.load(f)
    cqi_corpus = cqi_client.corpora.get(corpus)
    ##########################################################################
    # A faster way to get cpos boundaries for smaller s_attrs                #
    ##########################################################################
    # cqi_corpus.query('Last', '<s> []* </s>;')
    # cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
    # print(cqi_subcorpus.size)
    # first_match = 0
    # last_match = cqi_subcorpus.attrs['size'] - 1
    # match_boundaries = zip(
    #     list(range(first_match, last_match + 1)),
    #     cqi_subcorpus.dump(cqi_subcorpus.attrs['fields']['match'], first_match, last_match),
    #     cqi_subcorpus.dump(cqi_subcorpus.attrs['fields']['matchend'], first_match, last_match)
    # )
    # for x in match_boundaries:
    #     print(x)
    cqi_p_attrs = {
        p_attr.name: p_attr
        for p_attr in cqi_corpus.positional_attributes.list()
    }
    cqi_s_attrs = {
        s_attr.name: s_attr
        for s_attr in cqi_corpus.structural_attributes.list()
    }
    static_corpus_data = {
        'corpus': {
            'bounds': [0, cqi_corpus.size - 1],
            'counts': {
                'token': cqi_corpus.size
            },
            'freqs': {}
        },
        'p_attrs': {},
        's_attrs': {},
        'values': {'p_attrs': {}, 's_attrs': {}}
    }
    for p_attr in cqi_p_attrs.values():
        static_corpus_data['corpus']['freqs'][p_attr.name] = dict(
            zip(
                range(0, p_attr.lexicon_size),
                p_attr.freqs_by_ids(list(range(0, p_attr.lexicon_size)))
            )
        )
        static_corpus_data['p_attrs'][p_attr.name] = dict(
            zip(
                range(0, cqi_corpus.size),
                p_attr.ids_by_cpos(list(range(0, cqi_corpus.size)))
            )
        )
        static_corpus_data['values']['p_attrs'][p_attr.name] = dict(
            zip(
                range(0, p_attr.lexicon_size),
                p_attr.values_by_ids(list(range(0, p_attr.lexicon_size)))
            )
        )
    for s_attr in cqi_s_attrs.values():
        if s_attr.has_values:
            continue
        static_corpus_data['corpus']['counts'][s_attr.name] = s_attr.size
        static_corpus_data['s_attrs'][s_attr.name] = {'lexicon': {}, 'values': None}
        static_corpus_data['values']['s_attrs'][s_attr.name] = {}
        for id in range(0, s_attr.size):
            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
            lbound, rbound = s_attr.cpos_by_id(id)
            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
            if s_attr.name not in ['text', 's']:
                continue
            cpos_range = range(lbound, rbound + 1)
            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len({x for x in cqi_s_attrs['ent'].ids_by_cpos(list(cpos_range)) if x != -1})
            if s_attr.name != 'text':
                continue
            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len({x for x in cqi_s_attrs['s'].ids_by_cpos(list(cpos_range)) if x != -1})
            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
            for p_attr in cqi_p_attrs.values():
                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr.ids_by_cpos(list(cpos_range))))
        sub_s_attrs = cqi_corpus.structural_attributes.list(filters={'part_of': s_attr})
        s_attr_value_names = [
            sub_s_attr.name[(len(s_attr.name) + 1):]
            for sub_s_attr in sub_s_attrs
        ]
        sub_s_attr_values = [
            sub_s_attr.values_by_ids(list(range(0, s_attr.size)))
            for sub_s_attr in sub_s_attrs
        ]
        static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
        static_corpus_data['values']['s_attrs'][s_attr.name] = {
            s_attr_id: {
                s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
                for s_attr_value_name_idx, s_attr_value_name in enumerate(
                    static_corpus_data['s_attrs'][s_attr.name]['values']
                )
            } for s_attr_id_idx, s_attr_id in enumerate(range(0, s_attr.size))
        }
    with open(static_corpus_data_file, 'w') as f:
        json.dump(static_corpus_data, f)
    return static_corpus_data


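For readability, here is an abbreviated, purely illustrative sketch of the structure corpus_static_data caches in static.json, assuming a tiny four-token corpus with one positional attribute 'word' and one structural attribute 'text' carrying a 'title' sub-attribute; all concrete values are invented.

# Illustrative shape only; keys mirror the code above, values are made up.
static_corpus_data_example = {
    'corpus': {
        'bounds': [0, 3],                       # first and last cpos
        'counts': {'token': 4, 'text': 1},      # sizes per attribute
        'freqs': {'word': {0: 2, 1: 1, 2: 1}}   # lexicon id -> frequency
    },
    'p_attrs': {'word': {0: 0, 1: 1, 2: 0, 3: 2}},            # cpos -> lexicon id
    's_attrs': {'text': {'lexicon': {0: {'bounds': [0, 3],
                                         'counts': {'token': 4}}},
                         'values': ['title']}},
    'values': {
        'p_attrs': {'word': {0: 'the', 1: 'cat', 2: 'sat'}},  # lexicon id -> string
        's_attrs': {'text': {0: {'title': 'Example text'}}}
    }
}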
def corpus_paginate_corpus(
    cqi_client: CQiClient,
    corpus: str,
    page: int = 1,
    per_page: int = 20
) -> Dict:
    cqi_corpus = cqi_client.corpora.get(corpus)
    # Sanity checks
    if (
        per_page < 1
        or page < 1
        or (
            cqi_corpus.size > 0
            and page > math.ceil(cqi_corpus.size / per_page)
        )
    ):
        return {'code': 416, 'msg': 'Range Not Satisfiable'}
    first_cpos = (page - 1) * per_page
    last_cpos = min(cqi_corpus.size, first_cpos + per_page)
    cpos_list = [*range(first_cpos, last_cpos)]
    lookups = lookups_by_cpos(cqi_corpus, cpos_list)
    payload = {}
    # the items for the current page
    payload['items'] = [cpos_list]
    # the lookups for the items
    payload['lookups'] = lookups
    # the total number of items matching the query
    payload['total'] = cqi_corpus.size
    # the number of items to be displayed on a page.
    payload['per_page'] = per_page
    # The total number of pages
    payload['pages'] = math.ceil(payload['total'] / payload['per_page'])
    # the current page number (1 indexed)
    payload['page'] = page if payload['pages'] > 0 else None
    # True if a previous page exists
    payload['has_prev'] = payload['page'] > 1 if payload['page'] else False
    # True if a next page exists.
    payload['has_next'] = payload['page'] < payload['pages'] if payload['page'] else False  # noqa
    # Number of the previous page.
    payload['prev_num'] = payload['page'] - 1 if payload['has_prev'] else None
    # Number of the next page
    payload['next_num'] = payload['page'] + 1 if payload['has_next'] else None
    return {'code': 200, 'msg': 'OK', 'payload': payload}


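A quick worked example of the pagination arithmetic above, with hypothetical numbers: a corpus of 1,234 tokens with per_page=20 and page=3 selects cpos 40-59 and yields 62 pages in total.

import math

# Hypothetical values, mirroring corpus_paginate_corpus.
corpus_size, page, per_page = 1234, 3, 20

first_cpos = (page - 1) * per_page                    # 40
last_cpos = min(corpus_size, first_cpos + per_page)   # 60 (exclusive bound)
cpos_list = [*range(first_cpos, last_cpos)]           # cpos 40..59
pages = math.ceil(corpus_size / per_page)             # 62
has_prev, has_next = page > 1, page < pages           # True, True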
def cqp_paginate_subcorpus(
    cqi_client: CQiClient,
    subcorpus: str,
    context: int = 50,
    page: int = 1,
    per_page: int = 20
) -> Dict:
    corpus_name, subcorpus_name = subcorpus.split(':', 1)
    cqi_corpus = cqi_client.corpora.get(corpus_name)
    cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
    # Sanity checks
    if (
        per_page < 1
        or page < 1
        or (
            cqi_subcorpus.size > 0
            and page > math.ceil(cqi_subcorpus.size / per_page)
        )
    ):
        return {'code': 416, 'msg': 'Range Not Satisfiable'}
    offset = (page - 1) * per_page
    cutoff = per_page
    cqi_results_export = export_subcorpus(
        cqi_subcorpus, context=context, cutoff=cutoff, offset=offset)
    payload = {}
    # the items for the current page
    payload['items'] = cqi_results_export.pop('matches')
    # the lookups for the items
    payload['lookups'] = cqi_results_export
    # the total number of items matching the query
    payload['total'] = cqi_subcorpus.size
    # the number of items to be displayed on a page.
    payload['per_page'] = per_page
    # The total number of pages
    payload['pages'] = math.ceil(payload['total'] / payload['per_page'])
    # the current page number (1 indexed)
    payload['page'] = page if payload['pages'] > 0 else None
    # True if a previous page exists
    payload['has_prev'] = payload['page'] > 1 if payload['page'] else False
    # True if a next page exists.
    payload['has_next'] = payload['page'] < payload['pages'] if payload['page'] else False  # noqa
    # Number of the previous page.
    payload['prev_num'] = payload['page'] - 1 if payload['has_prev'] else None
    # Number of the next page
    payload['next_num'] = payload['page'] + 1 if payload['has_next'] else None
    return {'code': 200, 'msg': 'OK', 'payload': payload}


def cqp_partial_export_subcorpus(
    cqi_client: CQiClient,
    subcorpus: str,
    match_id_list: list,
    context: int = 50
) -> Dict:
    corpus_name, subcorpus_name = subcorpus.split(':', 1)
    cqi_corpus = cqi_client.corpora.get(corpus_name)
    cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
    cqi_subcorpus_partial_export = partial_export_subcorpus(cqi_subcorpus, match_id_list, context=context)
    return {'code': 200, 'msg': 'OK', 'payload': cqi_subcorpus_partial_export}


def cqp_export_subcorpus(
    cqi_client: CQiClient,
    subcorpus: str,
    context: int = 50
) -> Dict:
    corpus_name, subcorpus_name = subcorpus.split(':', 1)
    cqi_corpus = cqi_client.corpora.get(corpus_name)
    cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
    cqi_subcorpus_export = export_subcorpus(cqi_subcorpus, context=context)
    return {'code': 200, 'msg': 'OK', 'payload': cqi_subcorpus_export}
							
								
								
									
app/corpora/cqi_over_sio/extensions/utils.py (new file, +132 lines)
@@ -0,0 +1,132 @@
from typing import Dict, List
from cqi.models.corpora import Corpus
from cqi.models.subcorpora import Subcorpus


def lookups_by_cpos(corpus: Corpus, cpos_list: List[int]) -> Dict:
    lookups = {}
    lookups['cpos_lookup'] = {cpos: {} for cpos in cpos_list}
    for attr in corpus.positional_attributes.list():
        cpos_attr_values = attr.values_by_cpos(cpos_list)
        for i, cpos in enumerate(cpos_list):
            lookups['cpos_lookup'][cpos][attr.attrs['name']] = \
                cpos_attr_values[i]
    for attr in corpus.structural_attributes.list():
        # We only want to iterate over non subattributes, identifiable by
        # attr.attrs['has_values'] == False
        if attr.attrs['has_values']:
            continue
        cpos_attr_ids = attr.ids_by_cpos(cpos_list)
        for i, cpos in enumerate(cpos_list):
            if cpos_attr_ids[i] == -1:
                continue
            lookups['cpos_lookup'][cpos][attr.attrs['name']] = cpos_attr_ids[i]
        occured_attr_ids = [x for x in set(cpos_attr_ids) if x != -1]
        if not occured_attr_ids:
            continue
        subattrs = corpus.structural_attributes.list(filters={'part_of': attr})
        if not subattrs:
            continue
        lookup_name = f'{attr.attrs["name"]}_lookup'
        lookups[lookup_name] = {}
        for attr_id in occured_attr_ids:
            lookups[lookup_name][attr_id] = {}
        for subattr in subattrs:
            subattr_name = subattr.attrs['name'][(len(attr.attrs['name']) + 1):]  # noqa
            for i, subattr_value in enumerate(subattr.values_by_ids(occured_attr_ids)):  # noqa
                lookups[lookup_name][occured_attr_ids[i]][subattr_name] = subattr_value  # noqa
    return lookups


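Since the dict returned by lookups_by_cpos feeds every pagination and export payload, an abbreviated, invented example of its shape may help: two cpos in a corpus with a 'word' positional attribute and a 'text' structural attribute whose 'title' sub-attribute carries values.

# Illustrative only; attribute names and values are invented.
lookups_example = {
    'cpos_lookup': {
        0: {'word': 'the', 'text': 0},   # p-attr values and s-attr ids per cpos
        1: {'word': 'cat', 'text': 0}
    },
    'text_lookup': {
        0: {'title': 'Example text'}     # sub-attribute values per s-attr id
    }
}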
def partial_export_subcorpus(
    subcorpus: Subcorpus,
    match_id_list: List[int],
    context: int = 25
) -> Dict:
    if subcorpus.size == 0:
        return {"matches": []}
    match_boundaries = []
    for match_id in match_id_list:
        if match_id < 0 or match_id >= subcorpus.size:
            continue
        match_boundaries.append(
            (
                match_id,
                subcorpus.dump(subcorpus.fields['match'], match_id, match_id)[0],
                subcorpus.dump(subcorpus.fields['matchend'], match_id, match_id)[0]
            )
        )
    cpos_set = set()
    matches = []
    for match_boundary in match_boundaries:
        match_num, match_start, match_end = match_boundary
        c = (match_start, match_end)
        if match_start == 0 or context == 0:
            lc = None
            cpos_list_lbound = match_start
        else:
            lc_lbound = max(0, (match_start - context))
            lc_rbound = match_start - 1
            lc = (lc_lbound, lc_rbound)
            cpos_list_lbound = lc_lbound
        if match_end == (subcorpus.collection.corpus.size - 1) or context == 0:
            rc = None
            cpos_list_rbound = match_end
        else:
            rc_lbound = match_end + 1
            rc_rbound = min(
                (match_end + context),
                (subcorpus.collection.corpus.size - 1)
            )
            rc = (rc_lbound, rc_rbound)
            cpos_list_rbound = rc_rbound
        match = {'num': match_num, 'lc': lc, 'c': c, 'rc': rc}
        matches.append(match)
        cpos_set.update(range(cpos_list_lbound, cpos_list_rbound + 1))
    lookups = lookups_by_cpos(subcorpus.collection.corpus, list(cpos_set))
    return {'matches': matches, **lookups}


def export_subcorpus(
    subcorpus: Subcorpus,
    context: int = 25,
    cutoff: float = float('inf'),
    offset: int = 0
) -> Dict:
    if subcorpus.size == 0:
        return {"matches": []}
    first_match = max(0, offset)
    last_match = min((offset + cutoff - 1), (subcorpus.size - 1))
    match_boundaries = zip(
        range(first_match, last_match + 1),
        subcorpus.dump(subcorpus.fields['match'], first_match, last_match),
        subcorpus.dump(subcorpus.fields['matchend'], first_match, last_match)
    )
    cpos_set = set()
    matches = []
    for match_num, match_start, match_end in match_boundaries:
        c = (match_start, match_end)
        if match_start == 0 or context == 0:
            lc = None
            cpos_list_lbound = match_start
        else:
            lc_lbound = max(0, (match_start - context))
            lc_rbound = match_start - 1
            lc = (lc_lbound, lc_rbound)
            cpos_list_lbound = lc_lbound
        if match_end == (subcorpus.collection.corpus.size - 1) or context == 0:
            rc = None
            cpos_list_rbound = match_end
        else:
            rc_lbound = match_end + 1
            rc_rbound = min(
                (match_end + context),
                (subcorpus.collection.corpus.size - 1)
            )
            rc = (rc_lbound, rc_rbound)
            cpos_list_rbound = rc_rbound
        match = {'num': match_num, 'lc': lc, 'c': c, 'rc': rc}
        matches.append(match)
        cpos_set.update(range(cpos_list_lbound, cpos_list_rbound + 1))
    lookups = lookups_by_cpos(subcorpus.collection.corpus, list(cpos_set))
    return {'matches': matches, **lookups}
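To illustrate the context-window logic shared by partial_export_subcorpus and export_subcorpus, here is the arithmetic for one hypothetical match spanning cpos 100-102 with context=25 in a 10,000-token corpus; the resulting dict corresponds to one entry of the returned 'matches' list.

# Hypothetical match boundaries and corpus size.
corpus_size, context = 10_000, 25
match_start, match_end = 100, 102

# Left context: up to `context` tokens before the match.
lc = (max(0, match_start - context), match_start - 1)             # (75, 99)
# Right context: up to `context` tokens after the match.
rc = (match_end + 1, min(match_end + context, corpus_size - 1))   # (103, 127)

match = {'num': 0, 'lc': lc, 'c': (match_start, match_end), 'rc': rc}
# Every cpos from 75 to 127 is then resolved via lookups_by_cpos.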