mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-06-12 00:50:40 +00:00
Big update, corpus analysis reworked, versioned services, preliminary work for contributions
This commit is contained in:
@ -1,77 +1,13 @@
|
||||
from flask import Blueprint
|
||||
import os
|
||||
import yaml
|
||||
|
||||
|
||||
SERVICES = {
|
||||
'file-setup': {
|
||||
'name': 'File setup',
|
||||
'versions': {
|
||||
'latest': '1.0.0b',
|
||||
'1.0.0b': {
|
||||
'publishing_data': {
|
||||
'date': None,
|
||||
'title': 'nopaque File setup service',
|
||||
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/tree/1.0.0b', # noqa
|
||||
'version': '1.0.0'
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
'nlp': {
|
||||
'name': 'Natural Language Processing',
|
||||
'versions': {
|
||||
'latest': '1.0.0b',
|
||||
'1.0.0b': {
|
||||
'check_encoding': True,
|
||||
'models': {
|
||||
'de': 'German',
|
||||
'en': 'English',
|
||||
'it': 'Italian',
|
||||
'nl': 'Dutch',
|
||||
'pl': 'Polish',
|
||||
'zh': 'Chinese'
|
||||
},
|
||||
'publishing_data': {
|
||||
'date': None,
|
||||
'title': 'nopaque NLP service',
|
||||
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/tree/1.0.0b', # noqa
|
||||
'version': '1.0.0'
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
'ocr': {
|
||||
'name': 'Optical Character Recognition',
|
||||
'versions': {
|
||||
'latest': '1.0.0b',
|
||||
'1.0.0b': {
|
||||
'binarization': True,
|
||||
'models': {
|
||||
'ara': 'Arabic',
|
||||
'chi_tra': 'Chinese - Traditional',
|
||||
'dan': 'Danish',
|
||||
'eng': 'English',
|
||||
'enm': 'English, Middle 1100-1500',
|
||||
'fra': 'French',
|
||||
'frm': 'French, Middle ca. 1400-1600',
|
||||
'deu': 'German',
|
||||
'frk': 'German Fraktur',
|
||||
'ell': 'Greek, Modern (1453-)',
|
||||
'ita': 'Italian',
|
||||
'por': 'Portuguese',
|
||||
'rus': 'Russian',
|
||||
'spa': 'Spanish; Castilian',
|
||||
},
|
||||
'publishing_data': {
|
||||
'date': None,
|
||||
'title': 'nopaque OCR service',
|
||||
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/tree/1.0.0b', # noqa
|
||||
'version': '1.0.0'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
# Load the service manifest from the services.yml file that sits next to
# this module. The result is bound to SERVICES.
services_file = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'services.yml'
)
# Decode explicitly as UTF-8 so the parsed manifest does not depend on the
# locale of the host the application happens to run on.
with open(services_file, 'r', encoding='utf-8') as f:
    SERVICES = yaml.safe_load(f)
|
||||
|
||||
|
||||
bp = Blueprint('services', __name__)
|
||||
from . import routes
|
||||
from . import routes # noqa
|
||||
|
@ -1,3 +1,4 @@
|
||||
from app.models import TesseractOCRModel
|
||||
from flask_wtf import FlaskForm
|
||||
from wtforms import (BooleanField, MultipleFileField, SelectField, StringField,
|
||||
SubmitField, ValidationError)
|
||||
@ -6,85 +7,105 @@ from . import SERVICES
|
||||
|
||||
|
||||
class AddJobForm(FlaskForm):
    """Base form declaring the fields shared by the service job forms."""

    description = StringField(
        'Description',
        validators=[DataRequired(), Length(1, 255)]
    )
    submit = SubmitField()
    title = StringField(
        'Title',
        validators=[DataRequired(), Length(1, 32)]
    )
    # No choices are declared here; the subclasses in this module assign
    # them in their constructors.
    version = SelectField(
        'Version',
        validators=[DataRequired()]
    )
|
||||
|
||||
|
||||
class AddSpacyNLPJobForm(AddJobForm):
    """Job form for the 'spacy-nlp' service.

    The constructor accepts an optional ``version`` keyword argument, which
    is popped before the remaining arguments are handed to the parent
    constructor. It defaults to the service's ``latest_version`` entry in
    ``SERVICES``.
    """

    encoding_detection = BooleanField('Encoding detection')
    files = MultipleFileField('Files', validators=[DataRequired()])
    model = SelectField(
        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )

    def validate_encoding_detection(self, field):
        """Reject encoding detection if the selected version lacks it.

        Raises:
            ValidationError: the box is ticked but 'encoding_detection' is
                not listed in the version's ``methods``.
        """
        service_info = SERVICES['spacy-nlp']['versions'][self.version.data]
        # Optional features are listed under the version's 'methods' key;
        # they are not keys of the version mapping itself.
        methods = service_info.get('methods') or ()
        if field.data and 'encoding_detection' not in methods:
            raise ValidationError('Encoding detection is not available')

    def validate_files(form, field):
        """Ensure every uploaded file name ends with an approved extension.

        The comparison is done on the lower-cased file name.

        Raises:
            ValidationError: a file name has a different extension.
        """
        valid_extensions = ['.txt']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # Concatenate explicitly: adjacent string literals would be
                # merged first and the message prefix would end up being
                # used as the join separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Build the form for one version of the spaCy NLP service.

        Args:
            version: optional keyword argument naming the service version;
                defaults to ``SERVICES['spacy-nlp']['latest_version']``.
        """
        latest_version = SERVICES['spacy-nlp']['latest_version']
        version = kwargs.pop('version', latest_version)
        super().__init__(*args, **kwargs)
        service_info = SERVICES['spacy-nlp']['versions'][version]
        # The manifest names this method 'encoding_detection'.
        if 'encoding_detection' not in (service_info.get('methods') or ()):
            self.encoding_detection.render_kw = {'disabled': True}
        # Assign a new list instead of extending in place so a list object
        # that might be shared with the field declaration is never mutated.
        self.model.choices = list(self.model.choices) + [
            (x, y) for x, y in service_info['models'].items()
        ]
        self.version.choices = [
            (x, x) for x in SERVICES['spacy-nlp']['versions']
        ]
        # Same handling as the other service forms in this module: the
        # requested version becomes the field data, the latest version the
        # field default.
        self.version.data = version
        self.version.default = latest_version
|
||||
|
||||
|
||||
class AddTesseractOCRJobForm(AddJobForm):
    """Job form for the 'tesseract-ocr' service.

    The constructor accepts an optional ``version`` keyword argument, which
    is popped before the remaining arguments are handed to the parent
    constructor. It defaults to the service's ``latest_version`` entry in
    ``SERVICES``.
    """

    binarization = BooleanField('Binarization')
    files = MultipleFileField('Files', validators=[DataRequired()])
    model = SelectField(
        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )

    def validate_binarization(self, field):
        """Reject binarization if the selected version lacks it.

        Raises:
            ValidationError: the box is ticked but 'binarization' is not
                listed in the version's ``methods``.
        """
        service_info = SERVICES['tesseract-ocr']['versions'][self.version.data]  # noqa
        # Optional features are listed under the version's 'methods' key;
        # they are not keys of the version mapping itself.
        methods = service_info.get('methods') or ()
        if field.data and 'binarization' not in methods:
            raise ValidationError('Binarization is not available')

    def validate_files(self, field):
        """Ensure every uploaded file name ends with an approved extension.

        The comparison is done on the lower-cased file name.

        Raises:
            ValidationError: a file name has a different extension.
        """
        valid_extensions = ['.pdf']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # Concatenate explicitly: adjacent string literals would be
                # merged first and the message prefix would end up being
                # used as the join separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Build the form for one version of the Tesseract OCR service.

        Args:
            version: optional keyword argument naming the service version;
                defaults to ``SERVICES['tesseract-ocr']['latest_version']``.
        """
        latest_version = SERVICES['tesseract-ocr']['latest_version']
        version = kwargs.pop('version', latest_version)
        super().__init__(*args, **kwargs)
        service_info = SERVICES['tesseract-ocr']['versions'][version]
        if 'binarization' not in (service_info.get('methods') or ()):
            self.binarization.render_kw = {'disabled': True}
        # Model choices come from the TesseractOCRModel table. Assign a new
        # list instead of extending in place so a list object that might be
        # shared with the field declaration is never mutated.
        self.model.choices = list(self.model.choices) + [
            (x.hashid, x.title) for x in TesseractOCRModel.query.all()
        ]
        self.version.choices = [
            (x, x) for x in SERVICES['tesseract-ocr']['versions']
        ]
        self.version.data = version
        self.version.default = latest_version
|
||||
|
||||
|
||||
class AddFileSetupJobForm(AddJobForm):
    """Job form for the 'file-setup' service.

    The constructor accepts an optional ``version`` keyword argument, which
    is popped before the remaining arguments are handed to the parent
    constructor. It defaults to the service's ``latest_version`` entry in
    ``SERVICES``.
    """

    files = MultipleFileField('Files', validators=[DataRequired()])

    def validate_files(form, field):
        """Ensure every uploaded file name ends with an approved extension.

        The comparison is done on the lower-cased file name.

        Raises:
            ValidationError: a file name has a different extension.
        """
        valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # Concatenate explicitly: adjacent string literals would be
                # merged first and the message prefix would be repeated
                # between the extensions as the join separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Build the form for one version of the file setup service.

        Args:
            version: optional keyword argument naming the service version;
                defaults to ``SERVICES['file-setup']['latest_version']``.
        """
        latest_version = SERVICES['file-setup']['latest_version']
        version = kwargs.pop('version', latest_version)
        super().__init__(*args, **kwargs)
        self.version.choices = [
            (x, x) for x in SERVICES['file-setup']['versions']
        ]
        self.version.data = version
        self.version.default = latest_version
|
||||
|
||||
|
||||
AddJobForms = {
|
||||
'file-setup': AddFileSetupJobForm,
|
||||
'ocr': AddOCRJobForm,
|
||||
'nlp': AddNLPJobForm
|
||||
'tesseract-ocr': AddTesseractOCRJobForm,
|
||||
'spacy-nlp': AddSpacyNLPJobForm
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
from app import hashids
|
||||
from flask import (abort, current_app, flash, make_response, render_template,
|
||||
request, url_for)
|
||||
from flask_login import current_user, login_required
|
||||
@ -8,7 +9,6 @@ from .. import db
|
||||
from .forms import AddJobForms
|
||||
from ..models import Job, JobInput
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
@bp.route('/corpus-analysis')
|
||||
@ -24,57 +24,65 @@ def service(service):
|
||||
# Check if the requested service exist
|
||||
if service not in SERVICES or service not in AddJobForms:
|
||||
abort(404)
|
||||
version = request.args.get(
|
||||
'version', SERVICES[service]['versions']['latest'])
|
||||
version = request.args.get('version', SERVICES[service]['latest_version'])
|
||||
if version not in SERVICES[service]['versions']:
|
||||
abort(404)
|
||||
form = AddJobForms[service](prefix='add-job-form', version=version)
|
||||
form.version.data = version
|
||||
title = SERVICES[service]['name']
|
||||
versions = SERVICES[service]['versions']
|
||||
if form.is_submitted():
|
||||
if not form.validate():
|
||||
return make_response(form.errors, 400)
|
||||
service_args = []
|
||||
if service == 'nlp':
|
||||
service_args.append(f'-l {form.language.data}')
|
||||
if form.check_encoding.data:
|
||||
service_args.append('--check-encoding')
|
||||
if service == 'ocr':
|
||||
service_args.append(f'-l {form.language.data}')
|
||||
service_args = {}
|
||||
if service == 'spacy-nlp':
|
||||
service_args['model'] = form.model.data
|
||||
if form.encoding_detection.data:
|
||||
service_args['encoding_detection'] = True
|
||||
if service == 'tesseract-ocr':
|
||||
service_args['model'] = hashids.decode(form.model.data)
|
||||
if form.binarization.data:
|
||||
service_args.append('--binarize')
|
||||
job = Job(user=current_user,
|
||||
description=form.description.data,
|
||||
service=service, service_args=json.dumps(service_args),
|
||||
service_version=form.version.data,
|
||||
status='preparing', title=form.title.data)
|
||||
service_args['binarization'] = True
|
||||
job = Job(
|
||||
user=current_user,
|
||||
description=form.description.data,
|
||||
service=service,
|
||||
service_args=json.dumps(service_args),
|
||||
service_version=form.version.data,
|
||||
status='preparing',
|
||||
title=form.title.data
|
||||
)
|
||||
db.session.add(job)
|
||||
db.session.flush()
|
||||
db.session.flush(objects=[job])
|
||||
db.session.refresh(job)
|
||||
try:
|
||||
os.makedirs(job.path)
|
||||
except OSError:
|
||||
current_app.logger.error(f'Make dir {job.path} led to an OSError!')
|
||||
job.makedirs()
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
flash('Internal Server Error', 'error')
|
||||
return make_response(
|
||||
{'redirect_url': url_for('.service', service=service)}, 500)
|
||||
else:
|
||||
for file in form.files.data:
|
||||
filename = secure_filename(file.filename)
|
||||
job_input = JobInput(
|
||||
filename=filename, job=job, mimetype=file.mimetype)
|
||||
return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa
|
||||
for file in form.files.data:
|
||||
filename = secure_filename(file.filename)
|
||||
job_input = JobInput(
|
||||
filename=filename,
|
||||
job=job,
|
||||
mimetype=file.mimetype
|
||||
)
|
||||
db.session.add(job_input)
|
||||
db.session.flush(objects=[job_input])
|
||||
db.session.refresh(job_input)
|
||||
try:
|
||||
file.save(job_input.path)
|
||||
db.session.add(job_input)
|
||||
job.status = 'submitted'
|
||||
db.session.commit()
|
||||
flash(f'Job "{job.title}" added', 'job')
|
||||
return make_response(
|
||||
{'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
flash('Internal Server Error', 'error')
|
||||
return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa
|
||||
job.status = 'submitted'
|
||||
db.session.commit()
|
||||
flash(f'Job "{job.title}" added', 'job')
|
||||
return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa
|
||||
return render_template(
|
||||
f'services/{service.replace("-", "_")}.html.j2',
|
||||
form=form,
|
||||
title=title,
|
||||
versions=versions
|
||||
title=title
|
||||
)
|
||||
|
38
app/services/services.yml
Normal file
38
app/services/services.yml
Normal file
@ -0,0 +1,38 @@
|
||||
# TODO: This could also be done via GitLab/GitHub APIs
|
||||
#file-setup-pipeline:
|
||||
file-setup:
|
||||
name: 'File setup pipeline'
|
||||
latest_version: '0.1.0'
|
||||
versions:
|
||||
0.1.0:
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0'
|
||||
#spacy-nlp-pipeline:
|
||||
spacy-nlp:
|
||||
name: 'spaCy NLP'
|
||||
latest_version: '0.1.0'
|
||||
versions:
|
||||
0.1.0:
|
||||
methods:
|
||||
- 'encoding_detection'
|
||||
models:
|
||||
de: 'German'
|
||||
en: 'English'
|
||||
it: 'Italian'
|
||||
pl: 'Polish'
|
||||
zh: 'Chinese'
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0'
|
||||
#tesseract-ocr-pipeline:
|
||||
tesseract-ocr:
|
||||
name: 'Tesseract OCR'
|
||||
latest_version: '0.1.0'
|
||||
versions:
|
||||
0.1.0:
|
||||
methods:
|
||||
- 'binarization'
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0'
|
Reference in New Issue
Block a user