mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
				synced 2025-11-03 20:02:47 +00:00 
			
		
		
		
	Rename all services, use scss, cleanup, add sandpaper conversion script
This commit is contained in:
		@@ -1,5 +1,7 @@
 | 
			
		||||
from app.models import TesseractOCRModel
 | 
			
		||||
from app.models import Job, TesseractOCRModel
 | 
			
		||||
from flask_login import current_user
 | 
			
		||||
from flask_wtf import FlaskForm
 | 
			
		||||
from flask_wtf.file import FileField, FileAllowed, FileRequired
 | 
			
		||||
from wtforms import (
 | 
			
		||||
    BooleanField,
 | 
			
		||||
    MultipleFileField,
 | 
			
		||||
@@ -8,110 +10,143 @@ from wtforms import (
 | 
			
		||||
    SubmitField,
 | 
			
		||||
    ValidationError
 | 
			
		||||
)
 | 
			
		||||
from wtforms.validators import DataRequired, Length
 | 
			
		||||
from wtforms.validators import DataRequired, InputRequired, Length
 | 
			
		||||
from . import SERVICES
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class AddJobForm(FlaskForm):
 | 
			
		||||
    description = StringField('Description', validators=[DataRequired(), Length(1, 255)])  # noqa
 | 
			
		||||
    description = StringField('Description', validators=[InputRequired()])  # noqa
 | 
			
		||||
    submit = SubmitField()
 | 
			
		||||
    title = StringField('Title', validators=[DataRequired(), Length(1, 32)])
 | 
			
		||||
    title = StringField('Title', validators=[InputRequired()])
 | 
			
		||||
    version = SelectField('Version', validators=[DataRequired()])
 | 
			
		||||
 | 
			
		||||
    def validate_description(self, field):
        """Reject descriptions longer than the Job.description column allows.

        The limit is read from the column type of ``Job.description`` so the
        form and the database model cannot drift apart.

        Raises:
            ValidationError: if the submitted text exceeds the column length.
        """
        max_length = Job.description.property.columns[0].type.length
        if len(field.data) > max_length:
            # Exactly max_length characters are accepted, so the limit is
            # inclusive ("at most"), not exclusive ("less than").
            raise ValidationError(
                f'Description must be at most {max_length} characters'
            )
 | 
			
		||||
 | 
			
		||||
class AddSpacyNLPJobForm(AddJobForm):
 | 
			
		||||
    encoding_detection = BooleanField('Encoding detection')
 | 
			
		||||
    files = MultipleFileField('Files', validators=[DataRequired()])
 | 
			
		||||
    model = SelectField(
 | 
			
		||||
        'Model',
 | 
			
		||||
        choices=[('', 'Choose your option')],
 | 
			
		||||
        default='',
 | 
			
		||||
        validators=[DataRequired()]
 | 
			
		||||
    )
 | 
			
		||||
    def validate_title(self, field):
        """Reject titles longer than the Job.title column allows.

        The limit is read from the column type of ``Job.title`` so the form
        and the database model cannot drift apart.

        Raises:
            ValidationError: if the submitted text exceeds the column length.
        """
        max_length = Job.title.property.columns[0].type.length
        if len(field.data) > max_length:
            # Exactly max_length characters are accepted, so the limit is
            # inclusive ("at most"), not exclusive ("less than").
            raise ValidationError(
                f'Title must be at most {max_length} characters'
            )
 | 
			
		||||
 | 
			
		||||
    def validate_encoding_detection(self, field):
 | 
			
		||||
        service_info = SERVICES['spacy-nlp']['versions'][self.version.data]
 | 
			
		||||
        if field.data and 'encoding_detection' not in service_info['methods']:
 | 
			
		||||
            raise ValidationError('Encoding detection is not available')
 | 
			
		||||
 | 
			
		||||
    def validate_files(form, field):
 | 
			
		||||
        valid_extensions = ['.txt']
 | 
			
		||||
        for file in field.data:
 | 
			
		||||
            if not file.filename.lower().endswith(tuple(valid_extensions)):
 | 
			
		||||
                raise ValidationError(
 | 
			
		||||
                    'File does not have an approved extension: '
 | 
			
		||||
                    '/'.join(valid_extensions)
 | 
			
		||||
                )
 | 
			
		||||
class AddFileSetupPipelineJobForm(AddJobForm):
 | 
			
		||||
    images = MultipleFileField('File(s)', validators=[DataRequired()])
 | 
			
		||||
 | 
			
		||||
    def validate_images(form, field):
 | 
			
		||||
        valid_mimetypes = ['image/jpeg', 'image/png', 'image/tiff']
 | 
			
		||||
        for image in field.data:
 | 
			
		||||
            if image.mimetype not in valid_mimetypes:
 | 
			
		||||
                raise ValidationError('JPEG, PNG and TIFF files only!')
 | 
			
		||||
 | 
			
		||||
    def __init__(self, *args, **kwargs):
 | 
			
		||||
        version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version'])  # noqa
 | 
			
		||||
        service_manifest = SERVICES['file-setup-pipeline']
 | 
			
		||||
        version = kwargs.pop('version', service_manifest['latest_version'])
 | 
			
		||||
        super().__init__(*args, **kwargs)
 | 
			
		||||
        service_info = SERVICES['spacy-nlp']['versions'][version]
 | 
			
		||||
        if 'encoding_detection' not in service_info['methods']:
 | 
			
		||||
            self.encoding_detection.render_kw = {'disabled': True}
 | 
			
		||||
        self.model.choices += [(x, y) for x, y in service_info['models'].items()]  # noqa
 | 
			
		||||
        self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']]  # noqa
 | 
			
		||||
        self.version.default = version
 | 
			
		||||
        self.version.choices = [(x, x) for x in service_manifest['versions']]
 | 
			
		||||
        self.version.data = version
 | 
			
		||||
        self.version.default = service_manifest['latest_version']
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class AddTesseractOCRJobForm(AddJobForm):
 | 
			
		||||
class AddTesseractOCRPipelineJobForm(AddJobForm):
 | 
			
		||||
    binarization = BooleanField('Binarization')
 | 
			
		||||
    files = MultipleFileField('Files', validators=[DataRequired()])
 | 
			
		||||
    model = SelectField(
 | 
			
		||||
        'Model',
 | 
			
		||||
        choices=[('', 'Choose your option')],
 | 
			
		||||
        default='',
 | 
			
		||||
        validators=[DataRequired()]
 | 
			
		||||
    )
 | 
			
		||||
    pdf = FileField('File', validators=[FileRequired()])
 | 
			
		||||
    model = SelectField('Model', validators=[DataRequired()])
 | 
			
		||||
 | 
			
		||||
    def validate_binarization(self, field):
 | 
			
		||||
        service_info = SERVICES['tesseract-ocr']['versions'][self.version.data]
 | 
			
		||||
        service_info = SERVICES['tesseract-ocr-pipeline']['versions'][self.version.data]
 | 
			
		||||
        if field.data and 'binarization' not in service_info['methods']:
 | 
			
		||||
            raise ValidationError('Binarization is not available')
 | 
			
		||||
 | 
			
		||||
    def validate_files(self, field):
 | 
			
		||||
        valid_extensions = ['.pdf']
 | 
			
		||||
        for file in field.data:
 | 
			
		||||
            if not file.filename.lower().endswith(tuple(valid_extensions)):
 | 
			
		||||
                raise ValidationError(
 | 
			
		||||
                    'File does not have an approved extension: '
 | 
			
		||||
                    '/'.join(valid_extensions)
 | 
			
		||||
                )
 | 
			
		||||
    def validate_pdf(self, field):
 | 
			
		||||
        if field.data.mimetype != 'application/pdf':
 | 
			
		||||
            raise ValidationError('PDF files only!')
 | 
			
		||||
 | 
			
		||||
    def __init__(self, *args, **kwargs):
 | 
			
		||||
        version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version'])  # noqa
 | 
			
		||||
        service_manifest = SERVICES['tesseract-ocr-pipeline']
 | 
			
		||||
        version = kwargs.pop('version', service_manifest['latest_version'])
 | 
			
		||||
        super().__init__(*args, **kwargs)
 | 
			
		||||
        service_info = SERVICES['tesseract-ocr']['versions'][version]
 | 
			
		||||
        service_info = service_manifest['versions'][version]
 | 
			
		||||
        if 'binarization' not in service_info['methods']:
 | 
			
		||||
            self.binarization.render_kw = {'disabled': True}
 | 
			
		||||
        self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()]  # noqa
 | 
			
		||||
        self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']]  # noqa
 | 
			
		||||
        compatible_models = [
 | 
			
		||||
            x for x in TesseractOCRModel.query.filter_by(shared=True).all()
 | 
			
		||||
            if version in x.compatible_service_versions
 | 
			
		||||
        ]
 | 
			
		||||
        compatible_models += [
 | 
			
		||||
            x for x in TesseractOCRModel.query.filter_by(shared=False, user=current_user).all()
 | 
			
		||||
            if version in x.compatible_service_versions
 | 
			
		||||
        ]
 | 
			
		||||
        self.model.choices = [('', 'Choose your option')]
 | 
			
		||||
        self.model.choices += [(x.hashid, x.title) for x in compatible_models]
 | 
			
		||||
        self.model.default = ''
 | 
			
		||||
        self.version.choices = [(x, x) for x in service_manifest['versions']]
 | 
			
		||||
        self.version.data = version
 | 
			
		||||
        self.version.default = SERVICES['tesseract-ocr']['latest_version']
 | 
			
		||||
        self.version.default = service_manifest['latest_version']
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class AddFileSetupJobForm(AddJobForm):
 | 
			
		||||
    files = MultipleFileField('Files', validators=[DataRequired()])
 | 
			
		||||
class AddTranskribusHTRPipelineJobForm(AddJobForm):
 | 
			
		||||
    binarization = BooleanField('Binarization')
 | 
			
		||||
    pdf = FileField('File', validators=[FileRequired()])
 | 
			
		||||
    model = SelectField('Model', validators=[DataRequired()])
 | 
			
		||||
 | 
			
		||||
    def validate_files(form, field):
 | 
			
		||||
        valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif']
 | 
			
		||||
        for file in field.data:
 | 
			
		||||
            if not file.filename.lower().endswith(tuple(valid_extensions)):
 | 
			
		||||
                raise ValidationError(
 | 
			
		||||
                    'File does not have an approved extension: '
 | 
			
		||||
                    '/'.join(valid_extensions)
 | 
			
		||||
                )
 | 
			
		||||
    def validate_binarization(self, field):
 | 
			
		||||
        service_info = SERVICES['transkribus-htr-pipeline']['versions'][self.version.data]
 | 
			
		||||
        if field.data and 'binarization' not in service_info['methods']:
 | 
			
		||||
            raise ValidationError('Binarization is not available')
 | 
			
		||||
 | 
			
		||||
    def validate_pdf(self, field):
 | 
			
		||||
        if field.data.mimetype != 'application/pdf':
 | 
			
		||||
            raise ValidationError('PDF files only!')
 | 
			
		||||
 | 
			
		||||
    def __init__(self, *args, **kwargs):
 | 
			
		||||
        version = kwargs.pop('version', SERVICES['file-setup']['latest_version'])  # noqa
 | 
			
		||||
        service_manifest = SERVICES['transkribus-htr-pipeline']
 | 
			
		||||
        version = kwargs.pop('version', service_manifest['latest_version'])
 | 
			
		||||
        super().__init__(*args, **kwargs)
 | 
			
		||||
        self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']]  # noqa
 | 
			
		||||
        service_info = service_manifest['versions'][version]
 | 
			
		||||
        if 'binarization' not in service_info['methods']:
 | 
			
		||||
            self.binarization.render_kw = {'disabled': True}
 | 
			
		||||
        self.model.choices = [('', 'Choose your option')]
 | 
			
		||||
        self.model.choices += [
 | 
			
		||||
            ('37569', 'Tim Model'),
 | 
			
		||||
            ('29539', 'UCL–University of Toronto #7')
 | 
			
		||||
        ]
 | 
			
		||||
        self.model.default = ''
 | 
			
		||||
        self.version.choices = [(x, x) for x in service_manifest['versions']]
 | 
			
		||||
        self.version.data = version
 | 
			
		||||
        self.version.default = SERVICES['file-setup']['latest_version']
 | 
			
		||||
        self.version.default = service_manifest['latest_version']
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
AddJobForms = {
 | 
			
		||||
    'file-setup': AddFileSetupJobForm,
 | 
			
		||||
    'tesseract-ocr': AddTesseractOCRJobForm,
 | 
			
		||||
    'spacy-nlp': AddSpacyNLPJobForm
 | 
			
		||||
}
 | 
			
		||||
class AddSpacyNLPPipelineJobForm(AddJobForm):
    """Job form for the ``spacy-nlp-pipeline`` service.

    Takes a single uploaded text file, a model selection and an optional
    encoding detection flag. Model and version choices are filled in from
    the service's entry in SERVICES when the form is instantiated.
    """

    encoding_detection = BooleanField('Encoding detection')
    txt = FileField('File', validators=[FileRequired()])
    model = SelectField('Model', validators=[DataRequired()])

    def __init__(self, *args, **kwargs):
        """Build the form for one service version.

        Args:
            version: optional keyword argument naming the service version;
                defaults to the manifest's ``latest_version``. It is removed
                from ``kwargs`` before they are passed on to the parent.
        """
        manifest = SERVICES['spacy-nlp-pipeline']
        version = kwargs.pop('version', manifest['latest_version'])
        super().__init__(*args, **kwargs)
        version_info = manifest['versions'][version]
        # Grey out the checkbox when this version lacks the method.
        if 'encoding_detection' not in version_info['methods']:
            self.encoding_detection.render_kw = {'disabled': True}
        # Placeholder entry first, then one (key, label) pair per model.
        self.model.choices = [
            ('', 'Choose your option'),
            *version_info['models'].items()
        ]
        self.model.default = ''
        self.version.choices = [(v, v) for v in manifest['versions']]
        self.version.data = version
        self.version.default = version

    def validate_encoding_detection(self, field):
        """Refuse encoding detection if the selected version lacks it."""
        versions = SERVICES['spacy-nlp-pipeline']['versions']
        available_methods = versions[self.version.data]['methods']
        if not field.data:
            return
        if 'encoding_detection' not in available_methods:
            raise ValidationError('Encoding detection is not available!')

    def validate_txt(form, field):
        """Accept only uploads whose mimetype is ``text/plain``."""
        is_plain_text = field.data.mimetype == 'text/plain'
        if not is_plain_text:
            raise ValidationError('Plain text files only!')
 | 
			
		||||
 
 | 
			
		||||
@@ -13,47 +13,33 @@ from flask_login import current_user, login_required
 | 
			
		||||
from werkzeug.utils import secure_filename
 | 
			
		||||
from . import bp
 | 
			
		||||
from . import SERVICES
 | 
			
		||||
from .forms import AddJobForms
 | 
			
		||||
from .forms import (
 | 
			
		||||
    AddFileSetupPipelineJobForm,
 | 
			
		||||
    AddTesseractOCRPipelineJobForm,
 | 
			
		||||
    AddTranskribusHTRPipelineJobForm,
 | 
			
		||||
    AddSpacyNLPPipelineJobForm
 | 
			
		||||
)
 | 
			
		||||
import json
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@bp.route('/corpus-analysis')
 | 
			
		||||
@bp.route('/file-setup-pipeline', methods=['GET', 'POST'])
 | 
			
		||||
@login_required
 | 
			
		||||
def corpus_analysis():
 | 
			
		||||
    return render_template(
 | 
			
		||||
        'services/corpus_analysis.html.j2',
 | 
			
		||||
        title='Corpus analysis'
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@bp.route('/<service>', methods=['GET', 'POST'])
 | 
			
		||||
@login_required
 | 
			
		||||
def service(service):
 | 
			
		||||
    # Check if the requested service exists
 | 
			
		||||
    if service not in SERVICES or service not in AddJobForms:
 | 
			
		||||
def file_setup_pipeline():
 | 
			
		||||
    service = 'file-setup-pipeline'
 | 
			
		||||
    service_manifest = SERVICES[service]
 | 
			
		||||
    version = request.args.get('version', service_manifest['latest_version'])
 | 
			
		||||
    if version not in service_manifest['versions']:
 | 
			
		||||
        abort(404)
 | 
			
		||||
    version = request.args.get('version', SERVICES[service]['latest_version'])
 | 
			
		||||
    if version not in SERVICES[service]['versions']:
 | 
			
		||||
        abort(404)
 | 
			
		||||
    form = AddJobForms[service](prefix='add-job-form', version=version)
 | 
			
		||||
    title = SERVICES[service]['name']
 | 
			
		||||
    form = AddFileSetupPipelineJobForm(prefix='add-job-form', version=version)
 | 
			
		||||
    if form.is_submitted():
 | 
			
		||||
        if not form.validate():
 | 
			
		||||
            return make_response(form.errors, 400)
 | 
			
		||||
        service_args = {}
 | 
			
		||||
        if service == 'spacy-nlp':
 | 
			
		||||
            service_args['model'] = form.model.data
 | 
			
		||||
            if form.encoding_detection.data:
 | 
			
		||||
                service_args['encoding_detection'] = True
 | 
			
		||||
        if service == 'tesseract-ocr':
 | 
			
		||||
            service_args['model'] = hashids.decode(form.model.data)
 | 
			
		||||
            if form.binarization.data:
 | 
			
		||||
                service_args['binarization'] = True
 | 
			
		||||
        job = Job(
 | 
			
		||||
            user=current_user,
 | 
			
		||||
            description=form.description.data,
 | 
			
		||||
            service=service,
 | 
			
		||||
            service_args=json.dumps(service_args),
 | 
			
		||||
            service_args=service_args,
 | 
			
		||||
            service_version=form.version.data,
 | 
			
		||||
            title=form.title.data
 | 
			
		||||
        )
 | 
			
		||||
@@ -67,18 +53,17 @@ def service(service):
 | 
			
		||||
            db.session.rollback()
 | 
			
		||||
            flash('Internal Server Error', 'error')
 | 
			
		||||
            return make_response({'redirect_url': url_for('.service', service=service)}, 500)  # noqa
 | 
			
		||||
        for file in form.files.data:
 | 
			
		||||
            filename = secure_filename(file.filename)
 | 
			
		||||
        for image_file in form.images.data:
 | 
			
		||||
            job_input = JobInput(
 | 
			
		||||
                filename=filename,
 | 
			
		||||
                filename=secure_filename(image_file.filename),
 | 
			
		||||
                job=job,
 | 
			
		||||
                mimetype=file.mimetype
 | 
			
		||||
                mimetype=image_file.mimetype
 | 
			
		||||
            )
 | 
			
		||||
            db.session.add(job_input)
 | 
			
		||||
            db.session.flush(objects=[job_input])
 | 
			
		||||
            db.session.refresh(job_input)
 | 
			
		||||
            try:
 | 
			
		||||
                file.save(job_input.path)
 | 
			
		||||
                image_file.save(job_input.path)
 | 
			
		||||
            except OSError as e:
 | 
			
		||||
                current_app.logger.error(e)
 | 
			
		||||
                db.session.rollback()
 | 
			
		||||
@@ -91,5 +76,196 @@ def service(service):
 | 
			
		||||
    return render_template(
 | 
			
		||||
        f'services/{service.replace("-", "_")}.html.j2',
 | 
			
		||||
        form=form,
 | 
			
		||||
        title=title
 | 
			
		||||
        title=service_manifest['name']
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@bp.route('/tesseract-ocr-pipeline', methods=['GET', 'POST'])
@login_required
def tesseract_ocr_pipeline():
    """Show the Tesseract OCR pipeline form and create a job on submit.

    The service version comes from the ``version`` query argument and
    defaults to the manifest's latest version; an unknown version aborts
    with 404.

    Returns:
        The rendered service template when the form was not submitted.
        On submit: the form errors with status 400, a ``redirect_url``
        mapping back to this page with status 500 when the job directories
        or the uploaded file cannot be written, or a ``redirect_url``
        mapping to the new job with status 201.
    """
    service = 'tesseract-ocr-pipeline'
    service_manifest = SERVICES[service]
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddTesseractOCRPipelineJobForm(prefix='add-job-form', version=version)
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        service_args = {}
        service_args['model'] = hashids.decode(form.model.data)
        if form.binarization.data:
            service_args['binarization'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            # Redirect to this view's own endpoint; the generic '.service'
            # endpoint is no longer registered on the blueprint.
            return make_response({'redirect_url': url_for('.tesseract_ocr_pipeline')}, 500)  # noqa
        job_input = JobInput(
            filename=secure_filename(form.pdf.data.filename),
            job=job,
            mimetype=form.pdf.data.mimetype
        )
        db.session.add(job_input)
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.pdf.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': url_for('.tesseract_ocr_pipeline')}, 500)  # noqa
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@bp.route('/transkribus-htr-pipeline', methods=['GET', 'POST'])
@login_required
def transkribus_htr_pipeline():
    """Show the Transkribus HTR pipeline form and create a job on submit.

    The view aborts with 404 unless the ``NOPAQUE_TRANSKRIBUS_ENABLED``
    config value is truthy. The service version comes from the ``version``
    query argument and defaults to the manifest's latest version; an
    unknown version also aborts with 404.

    Returns:
        The rendered service template when the form was not submitted.
        On submit: the form errors with status 400, a ``redirect_url``
        mapping back to this page with status 500 when the job directories
        or the uploaded file cannot be written, or a ``redirect_url``
        mapping to the new job with status 201.
    """
    if not current_app.config.get('NOPAQUE_TRANSKRIBUS_ENABLED'):
        abort(404)
    service = 'transkribus-htr-pipeline'
    service_manifest = SERVICES[service]
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddTranskribusHTRPipelineJobForm(prefix='add-job-form', version=version)
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        service_args = {}
        service_args['model'] = form.model.data
        if form.binarization.data:
            service_args['binarization'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            # Redirect to this view's own endpoint; the generic '.service'
            # endpoint is no longer registered on the blueprint.
            return make_response({'redirect_url': url_for('.transkribus_htr_pipeline')}, 500)  # noqa
        job_input = JobInput(
            filename=secure_filename(form.pdf.data.filename),
            job=job,
            mimetype=form.pdf.data.mimetype
        )
        db.session.add(job_input)
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.pdf.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': url_for('.transkribus_htr_pipeline')}, 500)  # noqa
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@bp.route('/spacy-nlp-pipeline', methods=['GET', 'POST'])
@login_required
def spacy_nlp_pipeline():
    """Show the spaCy NLP pipeline form and create a job on submit.

    The service version comes from the ``version`` query argument and
    defaults to the manifest's latest version; an unknown version aborts
    with 404.

    Returns:
        The rendered service template when the form was not submitted.
        On submit: the form errors with status 400, a ``redirect_url``
        mapping back to this page with status 500 when the job directories
        or the uploaded file cannot be written, or a ``redirect_url``
        mapping to the new job with status 201.
    """
    service = 'spacy-nlp-pipeline'
    service_manifest = SERVICES[service]
    # Use the manifest bound above, like the sibling pipeline views do.
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddSpacyNLPPipelineJobForm(prefix='add-job-form', version=version)
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        service_args = {}
        service_args['model'] = form.model.data
        if form.encoding_detection.data:
            service_args['encoding_detection'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            # Redirect to this view's own endpoint; the generic '.service'
            # endpoint is no longer registered on the blueprint.
            return make_response({'redirect_url': url_for('.spacy_nlp_pipeline')}, 500)  # noqa
        job_input = JobInput(
            filename=secure_filename(form.txt.data.filename),
            job=job,
            mimetype=form.txt.data.mimetype
        )
        db.session.add(job_input)
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.txt.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': url_for('.spacy_nlp_pipeline')}, 500)  # noqa
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@bp.route('/corpus-analysis')
@login_required
def corpus_analysis():
    """Render the corpus analysis service page."""
    template_name = 'services/corpus_analysis.html.j2'
    return render_template(template_name, title='Corpus analysis')
 | 
			
		||||
@@ -1,38 +1,70 @@
 | 
			
		||||
# TODO: This could also be done via GitLab/GitHub APIs
 | 
			
		||||
#file-setup-pipeline:
 | 
			
		||||
file-setup:
 | 
			
		||||
file-setup-pipeline:
 | 
			
		||||
  name: 'File setup pipeline'
 | 
			
		||||
  publisher: 'Bielefeld University - CRC 1288 - INF'
 | 
			
		||||
  latest_version: '0.1.0'
 | 
			
		||||
  versions:
 | 
			
		||||
    0.1.0:
 | 
			
		||||
      publisher: 'Bielefeld University - CRC 1288 - INF'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0'
 | 
			
		||||
#spacy-nlp-pipeline:
 | 
			
		||||
spacy-nlp:
 | 
			
		||||
  name: 'spaCy NLP'
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline/-/releases/v0.1.0'
 | 
			
		||||
tesseract-ocr-pipeline:
 | 
			
		||||
  name: 'Tesseract OCR Pipeline'
 | 
			
		||||
  publisher: 'Bielefeld University - CRC 1288 - INF'
 | 
			
		||||
  latest_version: '0.1.4'
 | 
			
		||||
  versions:
 | 
			
		||||
    0.1.0:
 | 
			
		||||
      methods:
 | 
			
		||||
        - 'binarization'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.0'
 | 
			
		||||
    0.1.1:
 | 
			
		||||
      methods:
 | 
			
		||||
        - 'binarization'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.1'
 | 
			
		||||
    0.1.2:
 | 
			
		||||
      methods:
 | 
			
		||||
        - 'binarization'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.2'
 | 
			
		||||
    0.1.3:
 | 
			
		||||
      methods:
 | 
			
		||||
        - 'binarization'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.3'
 | 
			
		||||
    0.1.4:
 | 
			
		||||
      methods:
 | 
			
		||||
        - 'binarization'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.4'
 | 
			
		||||
transkribus-htr-pipeline:
 | 
			
		||||
  name: 'Transkribus HTR Pipeline'
 | 
			
		||||
  publisher: 'Bielefeld University - CRC 1288 - INF'
 | 
			
		||||
  latest_version: '0.1.0'
 | 
			
		||||
  versions:
 | 
			
		||||
    0.1.0:
 | 
			
		||||
      methods:
 | 
			
		||||
        - 'binarization'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/transkribus-htr-pipeline/-/releases/v0.1.0'
 | 
			
		||||
spacy-nlp-pipeline:
 | 
			
		||||
  name: 'spaCy NLP Pipeline'
 | 
			
		||||
  publisher: 'Bielefeld University - CRC 1288 - INF'
 | 
			
		||||
  latest_version: '0.1.0'
 | 
			
		||||
  versions:
 | 
			
		||||
    0.1.0:
 | 
			
		||||
      methods:
 | 
			
		||||
        - 'encoding_detection'
 | 
			
		||||
      models:
 | 
			
		||||
        ca: 'Catalan'
 | 
			
		||||
        de: 'German'
 | 
			
		||||
        el: 'Greek'
 | 
			
		||||
        en: 'English'
 | 
			
		||||
        es: 'Spanish'
 | 
			
		||||
        fr: 'French'
 | 
			
		||||
        it: 'Italian'
 | 
			
		||||
        pl: 'Polish'
 | 
			
		||||
        ru: 'Russian'
 | 
			
		||||
        zh: 'Chinese'
 | 
			
		||||
      publisher: 'Bielefeld University - CRC 1288 - INF'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0'
 | 
			
		||||
#tesseract-ocr-pipeline:
 | 
			
		||||
tesseract-ocr:
 | 
			
		||||
  name: 'Tesseract OCR'
 | 
			
		||||
  latest_version: '0.1.0'
 | 
			
		||||
  versions:
 | 
			
		||||
    0.1.0:
 | 
			
		||||
      methods:
 | 
			
		||||
        - 'binarization'
 | 
			
		||||
      publisher: 'Bielefeld University - CRC 1288 - INF'
 | 
			
		||||
      publishing_year: 2022
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0'
 | 
			
		||||
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.0'
 | 
			
		||||
		Reference in New Issue
	
	Block a user