mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-03 20:02:47 +00:00 
			
		
		
		
	Big update, corpus analysis reworked, versioned services, preliminary work for contributions
This commit is contained in:
		@@ -1,77 +1,13 @@
 | 
			
		||||
from flask import Blueprint
 | 
			
		||||
import os
 | 
			
		||||
import yaml
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Service metadata is kept in services.yml next to this module and loaded
# once at import time. The mapping is keyed by service name.
services_file = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'services.yml')
# Read with an explicit encoding so the result does not depend on the
# locale of the host; safe_load only builds plain Python objects.
with open(services_file, 'r', encoding='utf-8') as f:
    SERVICES = yaml.safe_load(f)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
bp = Blueprint('services', __name__)
 | 
			
		||||
from . import routes
 | 
			
		||||
from . import routes  # noqa
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,4 @@
 | 
			
		||||
from app.models import TesseractOCRModel
 | 
			
		||||
from flask_wtf import FlaskForm
 | 
			
		||||
from wtforms import (BooleanField, MultipleFileField, SelectField, StringField,
 | 
			
		||||
                     SubmitField, ValidationError)
 | 
			
		||||
@@ -6,85 +7,105 @@ from . import SERVICES
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class AddJobForm(FlaskForm):
    """Base form with the fields shared by all "add job" forms.

    Subclasses add their service specific fields and fill in the choices
    of ``version``.
    """

    # Free text description of the job (1-255 characters)
    description = StringField(
        'Description',
        validators=[DataRequired(), Length(1, 255)]
    )
    submit = SubmitField()
    # Short job title (1-32 characters)
    title = StringField('Title', validators=[DataRequired(), Length(1, 32)])
    # Service version to run; choices are assigned by the subclasses
    version = SelectField('Version', validators=[DataRequired()])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class AddSpacyNLPJobForm(AddJobForm):
    """Form for adding a job that runs the ``spacy-nlp`` service.

    The selectable models and the optional methods are read from the
    ``spacy-nlp`` entry of ``SERVICES`` for the requested version.
    """

    encoding_detection = BooleanField('Encoding detection')
    files = MultipleFileField('Files', validators=[DataRequired()])
    model = SelectField(
        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )

    def validate_encoding_detection(self, field):
        """Reject encoding detection if the selected version lacks it.

        Raises:
            ValidationError: The box is ticked but 'encoding_detection' is
                not listed in the methods of the selected version.
        """
        versions = SERVICES['spacy-nlp']['versions']
        # An unknown version is treated as offering no optional methods
        service_info = versions.get(self.version.data) or {}
        methods = service_info.get('methods') or ()
        if field.data and 'encoding_detection' not in methods:
            raise ValidationError('Encoding detection is not available')

    def validate_files(form, field):
        """Ensure every uploaded file has an approved extension.

        Raises:
            ValidationError: A filename does not end with one of the
                approved extensions (compared case-insensitively).
        """
        valid_extensions = ['.txt']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # Explicit "+": adjacent literals would otherwise be merged
                # first and the whole message would become the separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Set up the version dependent choices.

        Args:
            version: Optional keyword argument naming the service version
                the form is built for. Defaults to the latest version.
        """
        version = kwargs.pop(
            'version', SERVICES['spacy-nlp']['latest_version'])
        super().__init__(*args, **kwargs)
        service_info = SERVICES['spacy-nlp']['versions'][version]
        # Grey out the checkbox if this version cannot detect encodings
        if 'encoding_detection' not in service_info['methods']:
            self.encoding_detection.render_kw = {'disabled': True}
        self.model.choices += list(service_info['models'].items())
        self.version.choices = [
            (x, x) for x in SERVICES['spacy-nlp']['versions']
        ]
        self.version.default = version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class AddTesseractOCRJobForm(AddJobForm):
    """Form for adding a job that runs the ``tesseract-ocr`` service.

    The selectable models are read from the ``TesseractOCRModel`` table,
    the optional methods from the ``tesseract-ocr`` entry of ``SERVICES``.
    """

    binarization = BooleanField('Binarization')
    files = MultipleFileField('Files', validators=[DataRequired()])
    model = SelectField(
        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )

    def validate_binarization(self, field):
        """Reject binarization if the selected version lacks it.

        Raises:
            ValidationError: The box is ticked but 'binarization' is not
                listed in the methods of the selected version.
        """
        versions = SERVICES['tesseract-ocr']['versions']
        # An unknown version is treated as offering no optional methods
        service_info = versions.get(self.version.data) or {}
        methods = service_info.get('methods') or ()
        if field.data and 'binarization' not in methods:
            raise ValidationError('Binarization is not available')

    def validate_files(self, field):
        """Ensure every uploaded file has an approved extension.

        Raises:
            ValidationError: A filename does not end with one of the
                approved extensions (compared case-insensitively).
        """
        valid_extensions = ['.pdf']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # Explicit "+": adjacent literals would otherwise be merged
                # first and the whole message would become the separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Set up the version dependent choices.

        Args:
            version: Optional keyword argument naming the service version
                the form is built for. Defaults to the latest version.
        """
        latest_version = SERVICES['tesseract-ocr']['latest_version']
        version = kwargs.pop('version', latest_version)
        super().__init__(*args, **kwargs)
        service_info = SERVICES['tesseract-ocr']['versions'][version]
        # Grey out the checkbox if this version cannot binarize
        if 'binarization' not in service_info['methods']:
            self.binarization.render_kw = {'disabled': True}
        self.model.choices += [
            (x.hashid, x.title) for x in TesseractOCRModel.query.all()
        ]
        self.version.choices = [
            (x, x) for x in SERVICES['tesseract-ocr']['versions']
        ]
        self.version.data = version
        self.version.default = latest_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class AddFileSetupJobForm(AddJobForm):
    """Form for adding a job that runs the ``file-setup`` service."""

    files = MultipleFileField('Files', validators=[DataRequired()])

    def validate_files(form, field):
        """Ensure every uploaded file has an approved extension.

        Raises:
            ValidationError: A filename does not end with one of the
                approved extensions (compared case-insensitively).
        """
        valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # Explicit "+": adjacent literals would otherwise be merged
                # first and the whole message would become the separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Set up the version choices.

        Args:
            version: Optional keyword argument naming the service version
                the form is built for. Defaults to the latest version.
        """
        latest_version = SERVICES['file-setup']['latest_version']
        version = kwargs.pop('version', latest_version)
        super().__init__(*args, **kwargs)
        self.version.choices = [
            (x, x) for x in SERVICES['file-setup']['versions']
        ]
        self.version.data = version
        self.version.default = latest_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Maps a service name (as used in SERVICES) to the form class that is used
# to add a job for that service.
AddJobForms = {
    'file-setup': AddFileSetupJobForm,
    'tesseract-ocr': AddTesseractOCRJobForm,
    'spacy-nlp': AddSpacyNLPJobForm
}
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,4 @@
 | 
			
		||||
from app import hashids
 | 
			
		||||
from flask import (abort, current_app, flash, make_response, render_template,
 | 
			
		||||
                   request, url_for)
 | 
			
		||||
from flask_login import current_user, login_required
 | 
			
		||||
@@ -8,7 +9,6 @@ from .. import db
 | 
			
		||||
from .forms import AddJobForms
 | 
			
		||||
from ..models import Job, JobInput
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@bp.route('/corpus-analysis')
 | 
			
		||||
@@ -24,57 +24,65 @@ def service(service):
 | 
			
		||||
    # Check if the requested service exists
 | 
			
		||||
    if service not in SERVICES or service not in AddJobForms:
 | 
			
		||||
        abort(404)
 | 
			
		||||
    version = request.args.get(
 | 
			
		||||
        'version', SERVICES[service]['versions']['latest'])
 | 
			
		||||
    version = request.args.get('version', SERVICES[service]['latest_version'])
 | 
			
		||||
    if version not in SERVICES[service]['versions']:
 | 
			
		||||
        abort(404)
 | 
			
		||||
    form = AddJobForms[service](prefix='add-job-form', version=version)
 | 
			
		||||
    form.version.data = version
 | 
			
		||||
    title = SERVICES[service]['name']
 | 
			
		||||
    versions = SERVICES[service]['versions']
 | 
			
		||||
    if form.is_submitted():
 | 
			
		||||
        if not form.validate():
 | 
			
		||||
            return make_response(form.errors, 400)
 | 
			
		||||
        service_args = []
 | 
			
		||||
        if service == 'nlp':
 | 
			
		||||
            service_args.append(f'-l {form.language.data}')
 | 
			
		||||
            if form.check_encoding.data:
 | 
			
		||||
                service_args.append('--check-encoding')
 | 
			
		||||
        if service == 'ocr':
 | 
			
		||||
            service_args.append(f'-l {form.language.data}')
 | 
			
		||||
        service_args = {}
 | 
			
		||||
        if service == 'spacy-nlp':
 | 
			
		||||
            service_args['model'] = form.model.data
 | 
			
		||||
            if form.encoding_detection.data:
 | 
			
		||||
                service_args['encoding_detection'] = True
 | 
			
		||||
        if service == 'tesseract-ocr':
 | 
			
		||||
            service_args['model'] = hashids.decode(form.model.data)
 | 
			
		||||
            if form.binarization.data:
 | 
			
		||||
                service_args.append('--binarize')
 | 
			
		||||
        job = Job(user=current_user,
 | 
			
		||||
                  description=form.description.data,
 | 
			
		||||
                  service=service, service_args=json.dumps(service_args),
 | 
			
		||||
                  service_version=form.version.data,
 | 
			
		||||
                  status='preparing', title=form.title.data)
 | 
			
		||||
                service_args['binarization'] = True
 | 
			
		||||
        job = Job(
 | 
			
		||||
            user=current_user,
 | 
			
		||||
            description=form.description.data,
 | 
			
		||||
            service=service,
 | 
			
		||||
            service_args=json.dumps(service_args),
 | 
			
		||||
            service_version=form.version.data,
 | 
			
		||||
            status='preparing',
 | 
			
		||||
            title=form.title.data
 | 
			
		||||
        )
 | 
			
		||||
        db.session.add(job)
 | 
			
		||||
        db.session.flush()
 | 
			
		||||
        db.session.flush(objects=[job])
 | 
			
		||||
        db.session.refresh(job)
 | 
			
		||||
        try:
 | 
			
		||||
            os.makedirs(job.path)
 | 
			
		||||
        except OSError:
 | 
			
		||||
            current_app.logger.error(f'Make dir {job.path} led to an OSError!')
 | 
			
		||||
            job.makedirs()
 | 
			
		||||
        except OSError as e:
 | 
			
		||||
            current_app.logger.error(e)
 | 
			
		||||
            db.session.rollback()
 | 
			
		||||
            flash('Internal Server Error', 'error')
 | 
			
		||||
            return make_response(
 | 
			
		||||
                {'redirect_url': url_for('.service', service=service)}, 500)
 | 
			
		||||
        else:
 | 
			
		||||
            for file in form.files.data:
 | 
			
		||||
                filename = secure_filename(file.filename)
 | 
			
		||||
                job_input = JobInput(
 | 
			
		||||
                    filename=filename, job=job, mimetype=file.mimetype)
 | 
			
		||||
            return make_response({'redirect_url': url_for('.service', service=service)}, 500)  # noqa
 | 
			
		||||
        for file in form.files.data:
 | 
			
		||||
            filename = secure_filename(file.filename)
 | 
			
		||||
            job_input = JobInput(
 | 
			
		||||
                filename=filename,
 | 
			
		||||
                job=job,
 | 
			
		||||
                mimetype=file.mimetype
 | 
			
		||||
            )
 | 
			
		||||
            db.session.add(job_input)
 | 
			
		||||
            db.session.flush(objects=[job_input])
 | 
			
		||||
            db.session.refresh(job_input)
 | 
			
		||||
            try:
 | 
			
		||||
                file.save(job_input.path)
 | 
			
		||||
                db.session.add(job_input)
 | 
			
		||||
            job.status = 'submitted'
 | 
			
		||||
            db.session.commit()
 | 
			
		||||
            flash(f'Job "{job.title}" added', 'job')
 | 
			
		||||
            return make_response(
 | 
			
		||||
                {'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)
 | 
			
		||||
            except OSError as e:
 | 
			
		||||
                current_app.logger.error(e)
 | 
			
		||||
                db.session.rollback()
 | 
			
		||||
                flash('Internal Server Error', 'error')
 | 
			
		||||
                return make_response({'redirect_url': url_for('.service', service=service)}, 500)  # noqa
 | 
			
		||||
        job.status = 'submitted'
 | 
			
		||||
        db.session.commit()
 | 
			
		||||
        flash(f'Job "{job.title}" added', 'job')
 | 
			
		||||
        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
 | 
			
		||||
    return render_template(
 | 
			
		||||
        f'services/{service.replace("-", "_")}.html.j2',
 | 
			
		||||
        form=form,
 | 
			
		||||
        title=title,
 | 
			
		||||
        versions=versions
 | 
			
		||||
        title=title
 | 
			
		||||
    )
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										38
									
								
								app/services/services.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								app/services/services.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,38 @@
 | 
			
		||||
# TODO: The release metadata below could also be fetched via the GitLab/GitHub APIs
 | 
			
		||||
#file-setup-pipeline:
 | 
			
		||||
file-setup:
 | 
			
		||||
  name: 'File setup pipeline'
 | 
			
		||||
  latest_version: '0.1.0'
 | 
			
		||||
  versions:
 | 
			
		||||
    0.1.0:
 | 
			
		||||
      publisher: 'Bielefeld University - CRC 1288 - INF'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0'
 | 
			
		||||
#spacy-nlp-pipeline:
 | 
			
		||||
spacy-nlp:
 | 
			
		||||
  name: 'spaCy NLP'
 | 
			
		||||
  latest_version: '0.1.0'
 | 
			
		||||
  versions:
 | 
			
		||||
    0.1.0:
 | 
			
		||||
      methods:
 | 
			
		||||
        - 'encoding_detection'
 | 
			
		||||
      models:
 | 
			
		||||
        de: 'German'
 | 
			
		||||
        en: 'English'
 | 
			
		||||
        it: 'Italian'
 | 
			
		||||
        pl: 'Polish'
 | 
			
		||||
        zh: 'Chinese'
 | 
			
		||||
      publisher: 'Bielefeld University - CRC 1288 - INF'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0'
 | 
			
		||||
#tesseract-ocr-pipeline:
 | 
			
		||||
tesseract-ocr:
 | 
			
		||||
  name: 'Tesseract OCR'
 | 
			
		||||
  latest_version: '0.1.0'
 | 
			
		||||
  versions:
 | 
			
		||||
    0.1.0:
 | 
			
		||||
      methods:
 | 
			
		||||
        - 'binarization'
 | 
			
		||||
      publisher: 'Bielefeld University - CRC 1288 - INF'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0'
 | 
			
		||||
		Reference in New Issue
	
	Block a user