mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-03 20:02:47 +00:00 
			
		
		
		
	Add export options to subcorpora
This commit is contained in:
		@@ -1,13 +1,9 @@
 | 
			
		||||
from flask import session
 | 
			
		||||
import cqi
 | 
			
		||||
import json
 | 
			
		||||
import math
 | 
			
		||||
import os
 | 
			
		||||
from app import socketio
 | 
			
		||||
from app.decorators import socketio_login_required
 | 
			
		||||
from app.models import Corpus
 | 
			
		||||
from . import NAMESPACE as ns
 | 
			
		||||
from .utils import cqi_over_socketio, export_subcorpus
 | 
			
		||||
from .utils import cqi_over_socketio, export_subcorpus, partial_export_subcorpus
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@socketio.on('cqi.corpora.corpus.subcorpora.get', namespace=ns)
 | 
			
		||||
@@ -109,6 +105,16 @@ def cqi_corpora_corpus_subcorpora_subcorpus_paginate(cqi_client: cqi.CQiClient,
 | 
			
		||||
    return {'code': 200, 'msg': 'OK', 'payload': payload}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@socketio.on('cqi.corpora.corpus.subcorpora.subcorpus.partial_export', namespace=ns)
@socketio_login_required
@cqi_over_socketio
def cqi_corpora_corpus_subcorpora_subcorpus_partial_export(cqi_client: cqi.CQiClient, corpus_name: str, subcorpus_name: str, match_id_list: list, context: int = 50):  # noqa
    """Handle the ``...subcorpus.partial_export`` Socket.IO event.

    Resolves the named subcorpus through ``cqi_client`` and exports only the
    matches whose ids are listed in ``match_id_list``.

    Args:
        cqi_client: CQi client used to resolve the corpus (presumably
            supplied by the ``cqi_over_socketio`` decorator - TODO confirm).
        corpus_name: Name of the corpus to look up on the client.
        subcorpus_name: Name of the subcorpus inside that corpus.
        match_id_list: Match ids to export; passed on unchanged to
            ``partial_export_subcorpus``.
        context: Context size forwarded to ``partial_export_subcorpus``.
            Note that the default here is 50, not the helper's own default.

    Returns:
        A dict with ``'code': 200``, ``'msg': 'OK'`` and the export under
        ``'payload'``.
    """
    subcorpus = (
        cqi_client.corpora.get(corpus_name).subcorpora.get(subcorpus_name)
    )
    payload = partial_export_subcorpus(
        subcorpus,
        match_id_list,
        context=context
    )
    return {'code': 200, 'msg': 'OK', 'payload': payload}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@socketio.on('cqi.corpora.corpus.subcorpora.subcorpus.export', namespace=ns)
 | 
			
		||||
@socketio_login_required
 | 
			
		||||
@cqi_over_socketio
 | 
			
		||||
@@ -116,8 +122,4 @@ def cqi_corpora_corpus_subcorpora_subcorpus_export(cqi_client: cqi.CQiClient, co
 | 
			
		||||
    cqi_corpus = cqi_client.corpora.get(corpus_name)
 | 
			
		||||
    cqi_subcorpus = cqi_corpus.subcorpora.get(subcorpus_name)
 | 
			
		||||
    cqi_subcorpus_export = export_subcorpus(cqi_subcorpus, context=context)
 | 
			
		||||
    corpus = Corpus.query.get(session['d']['corpus_id'])
 | 
			
		||||
    file_path = os.path.join(corpus.path, f'{subcorpus_name}.json')
 | 
			
		||||
    with open(file_path, 'w') as file:
 | 
			
		||||
        json.dump(cqi_subcorpus_export, file)
 | 
			
		||||
    return {'code': 200, 'msg': 'OK'}
 | 
			
		||||
    return {'code': 200, 'msg': 'OK', 'payload': cqi_subcorpus_export}
 | 
			
		||||
 
 | 
			
		||||
@@ -68,7 +68,7 @@ def lookups_by_cpos(corpus, cpos_list):
 | 
			
		||||
                cpos_attr_values[i]
 | 
			
		||||
    for attr in corpus.structural_attributes.list():
 | 
			
		||||
        # We only want to iterate over non subattributes, identifiable by
 | 
			
		||||
        # attr.attrs['has_values']==False
 | 
			
		||||
        # attr.attrs['has_values'] == False
 | 
			
		||||
        if attr.attrs['has_values']:
 | 
			
		||||
            continue
 | 
			
		||||
        cpos_attr_ids = attr.ids_by_cpos(cpos_list)
 | 
			
		||||
@@ -93,43 +93,86 @@ def lookups_by_cpos(corpus, cpos_list):
 | 
			
		||||
    return lookups
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def partial_export_subcorpus(subcorpus, match_id_list, context=25):
    """Export selected matches of *subcorpus* together with their context.

    Args:
        subcorpus: Subcorpus object exposing ``attrs['size']``,
            ``attrs['fields']``, ``dump(field, first, last)`` and
            ``collection.corpus``.
        match_id_list: Iterable of match ids to export. Entries that are not
            integers or that fall outside ``0 <= id < subcorpus size`` are
            skipped silently.
        context: Number of corpus positions to include on each side of a
            match. Zero or a negative value disables context.

    Returns:
        ``{"matches": []}`` when the subcorpus is empty. Otherwise a dict
        holding ``'matches'`` plus every key returned by
        ``lookups_by_cpos``. Each match is a dict with the keys ``'num'``
        (the match id), ``'c'`` (``(match_start, match_end)``), and ``'lc'``
        / ``'rc'`` (``(first, last)`` position tuples of the left / right
        context, or ``None`` when there is none).
    """
    subcorpus_size = subcorpus.attrs['size']
    if subcorpus_size == 0:
        return {"matches": []}
    # These values do not change while iterating, so read them once.
    fields = subcorpus.attrs['fields']
    corpus = subcorpus.collection.corpus
    last_cpos = corpus.attrs['size'] - 1
    # A negative context would produce inverted (start > end) ranges and
    # could drop the match's own positions from the lookups, so it is
    # treated the same as "no context".
    with_context = context > 0
    cpos_set = set()
    matches = []
    for match_id in match_id_list:
        # The ids come from the caller (ultimately a client request); a
        # non-integer entry would raise TypeError in the range check below.
        if not isinstance(match_id, int):
            continue
        if match_id < 0 or match_id >= subcorpus_size:
            continue
        # Dumping the one-element range [match_id, match_id] yields a
        # sequence whose first item is this match's value for the field.
        match_start = subcorpus.dump(fields['match'], match_id, match_id)[0]
        match_end = subcorpus.dump(fields['matchend'], match_id, match_id)[0]
        if match_start == 0 or not with_context:
            lc = None
            cpos_list_lbound = match_start
        else:
            lc_lbound = max(0, match_start - context)
            lc = (lc_lbound, match_start - 1)
            cpos_list_lbound = lc_lbound
        if match_end == last_cpos or not with_context:
            rc = None
            cpos_list_rbound = match_end
        else:
            rc_rbound = min(match_end + context, last_cpos)
            rc = (match_end + 1, rc_rbound)
            cpos_list_rbound = rc_rbound
        matches.append(
            {'num': match_id, 'lc': lc, 'c': (match_start, match_end), 'rc': rc}
        )
        cpos_set.update(range(cpos_list_lbound, cpos_list_rbound + 1))
    # Sorted so the positions are requested in a deterministic order.
    lookups = lookups_by_cpos(corpus, sorted(cpos_set))
    return {'matches': matches, **lookups}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def export_subcorpus(subcorpus, context=25, cutoff=float('inf'), offset=0):
    """Export a window of matches of *subcorpus* together with their context.

    Args:
        subcorpus: Subcorpus object exposing ``attrs['size']``,
            ``attrs['fields']``, ``dump(field, first, last)`` and
            ``collection.corpus``.
        context: Number of corpus positions to include on each side of a
            match. Zero or a negative value disables context.
        cutoff: Maximum number of matches to export; the default exports
            everything from ``offset`` to the end of the subcorpus.
        offset: Index of the first match to export (clamped to 0).

    Returns:
        ``{"matches": []}`` when the subcorpus is empty or the requested
        window contains no match. Otherwise a dict holding ``'matches'``
        plus every key returned by ``lookups_by_cpos``. Each match is a dict
        with the keys ``'num'`` (zero-based match index), ``'c'``
        (``(match_start, match_end)``), and ``'lc'`` / ``'rc'``
        (``(first, last)`` position tuples of the left / right context, or
        ``None`` when there is none).
    """
    subcorpus_size = subcorpus.attrs['size']
    if subcorpus_size == 0:
        return {"matches": []}
    first_match = max(0, offset)
    last_match = min((offset + cutoff - 1), (subcorpus_size - 1))
    # An offset past the last match or a cutoff of 0 leaves nothing to
    # export; return early so no inverted range is passed to dump().
    if first_match > last_match:
        return {"matches": []}
    # These values do not change while iterating, so read them once.
    fields = subcorpus.attrs['fields']
    corpus = subcorpus.collection.corpus
    last_cpos = corpus.attrs['size'] - 1
    # Each tuple is (match index, match start, match end).
    match_boundaries = zip(
        range(first_match, last_match + 1),
        subcorpus.dump(fields['match'], first_match, last_match),
        subcorpus.dump(fields['matchend'], first_match, last_match)
    )
    # A negative context would produce inverted (start > end) ranges, so it
    # is treated the same as "no context".
    with_context = context > 0
    cpos_set = set()
    matches = []
    for match_num, match_start, match_end in match_boundaries:
        if match_start == 0 or not with_context:
            lc = None
            cpos_list_lbound = match_start
        else:
            lc_lbound = max(0, match_start - context)
            lc = (lc_lbound, match_start - 1)
            cpos_list_lbound = lc_lbound
        if match_end == last_cpos or not with_context:
            rc = None
            cpos_list_rbound = match_end
        else:
            rc_rbound = min(match_end + context, last_cpos)
            rc = (match_end + 1, rc_rbound)
            cpos_list_rbound = rc_rbound
        matches.append(
            {'num': match_num, 'lc': lc, 'c': (match_start, match_end), 'rc': rc}
        )
        cpos_set.update(range(cpos_list_lbound, cpos_list_rbound + 1))
    lookups = lookups_by_cpos(corpus, list(cpos_set))
    return {'matches': matches, **lookups}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user