Rename all services, use scss, cleanup, add sandpaper conversion script

This commit is contained in:
Patrick Jentsch
2022-04-04 13:31:09 +02:00
parent 8fd59f8078
commit ce997e69ea
31 changed files with 1361 additions and 303 deletions

View File

@ -1,5 +1,7 @@
from app.models import TesseractOCRModel
from app.models import Job, TesseractOCRModel
from flask_login import current_user
from flask_wtf import FlaskForm
from flask_wtf.file import FileField, FileAllowed, FileRequired
from wtforms import (
BooleanField,
MultipleFileField,
@ -8,110 +10,143 @@ from wtforms import (
SubmitField,
ValidationError
)
from wtforms.validators import DataRequired, Length
from wtforms.validators import DataRequired, InputRequired, Length
from . import SERVICES
class AddJobForm(FlaskForm):
description = StringField('Description', validators=[DataRequired(), Length(1, 255)]) # noqa
description = StringField('Description', validators=[InputRequired()]) # noqa
submit = SubmitField()
title = StringField('Title', validators=[DataRequired(), Length(1, 32)])
title = StringField('Title', validators=[InputRequired()])
version = SelectField('Version', validators=[DataRequired()])
def validate_description(self, field):
    """Reject a description that is longer than the database column allows.

    The limit is read from the ``length`` of the first column type behind
    ``Job.description``.

    Raises:
        ValidationError: If ``field.data`` has more characters than the limit.
    """
    limit = Job.description.property.columns[0].type.length
    if len(field.data) <= limit:
        return
    # NOTE(review): the message says "less than" although a value of exactly
    # ``limit`` characters is accepted by the check above.
    message = f'Description must be less than {limit} characters'
    raise ValidationError(message)
class AddSpacyNLPJobForm(AddJobForm):
encoding_detection = BooleanField('Encoding detection')
files = MultipleFileField('Files', validators=[DataRequired()])
model = SelectField(
'Model',
choices=[('', 'Choose your option')],
default='',
validators=[DataRequired()]
)
def validate_title(self, field):
    """Reject a title that is longer than the database column allows.

    The limit is read from the ``length`` of the first column type behind
    ``Job.title``.

    Raises:
        ValidationError: If ``field.data`` has more characters than the limit.
    """
    limit = Job.title.property.columns[0].type.length
    if len(field.data) <= limit:
        return
    # NOTE(review): the message says "less than" although a value of exactly
    # ``limit`` characters is accepted by the check above.
    message = f'Title must be less than {limit} characters'
    raise ValidationError(message)
def validate_encoding_detection(self, field):
    """Refuse encoding detection when the selected version lacks it.

    The selected version is looked up in the ``spacy-nlp`` entry of
    ``SERVICES``; its ``methods`` list is only consulted when the box is
    ticked.

    Raises:
        ValidationError: If the box is ticked but ``encoding_detection`` is
            not listed among the version's methods.
    """
    version_info = SERVICES['spacy-nlp']['versions'][self.version.data]
    if not field.data:
        return
    if 'encoding_detection' in version_info['methods']:
        return
    raise ValidationError('Encoding detection is not available')
def validate_files(form, field):
    """Ensure every uploaded file has an approved ``.txt`` extension.

    The comparison uses the lower-cased filename, so it is case-insensitive.

    Raises:
        ValidationError: If a filename does not end with an approved
            extension. The message lists the approved extensions.
    """
    valid_extensions = ('.txt',)
    for file in field.data:
        if not file.filename.lower().endswith(valid_extensions):
            # The prefix and the joined list must be combined with ``+``.
            # Adjacent string literals are merged *before* ``.join`` is
            # applied, which turned the whole prefix into the separator.
            raise ValidationError(
                'File does not have an approved extension: '
                + '/'.join(valid_extensions)
            )
class AddFileSetupPipelineJobForm(AddJobForm):
images = MultipleFileField('File(s)', validators=[DataRequired()])
def validate_images(form, field):
    """Accept only uploads whose MIME type is JPEG, PNG or TIFF.

    Raises:
        ValidationError: If any uploaded item reports another MIME type.
    """
    allowed = ['image/jpeg', 'image/png', 'image/tiff']
    if any(upload.mimetype not in allowed for upload in field.data):
        raise ValidationError('JPEG, PNG and TIFF files only!')
def __init__(self, *args, **kwargs):
version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version']) # noqa
service_manifest = SERVICES['file-setup-pipeline']
version = kwargs.pop('version', service_manifest['latest_version'])
super().__init__(*args, **kwargs)
service_info = SERVICES['spacy-nlp']['versions'][version]
if 'encoding_detection' not in service_info['methods']:
self.encoding_detection.render_kw = {'disabled': True}
self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa
self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']] # noqa
self.version.default = version
self.version.choices = [(x, x) for x in service_manifest['versions']]
self.version.data = version
self.version.default = service_manifest['latest_version']
class AddTesseractOCRJobForm(AddJobForm):
class AddTesseractOCRPipelineJobForm(AddJobForm):
binarization = BooleanField('Binarization')
files = MultipleFileField('Files', validators=[DataRequired()])
model = SelectField(
'Model',
choices=[('', 'Choose your option')],
default='',
validators=[DataRequired()]
)
pdf = FileField('File', validators=[FileRequired()])
model = SelectField('Model', validators=[DataRequired()])
def validate_binarization(self, field):
service_info = SERVICES['tesseract-ocr']['versions'][self.version.data]
service_info = SERVICES['tesseract-ocr-pipeline']['versions'][self.version.data]
if field.data and 'binarization' not in service_info['methods']:
raise ValidationError('Binarization is not available')
def validate_files(self, field):
    """Ensure every uploaded file has an approved ``.pdf`` extension.

    The comparison uses the lower-cased filename, so it is case-insensitive.

    Raises:
        ValidationError: If a filename does not end with an approved
            extension. The message lists the approved extensions.
    """
    valid_extensions = ('.pdf',)
    for file in field.data:
        if not file.filename.lower().endswith(valid_extensions):
            # The prefix and the joined list must be combined with ``+``.
            # Adjacent string literals are merged *before* ``.join`` is
            # applied, which turned the whole prefix into the separator.
            raise ValidationError(
                'File does not have an approved extension: '
                + '/'.join(valid_extensions)
            )
def validate_pdf(self, field):
    """Accept the upload only when its MIME type is ``application/pdf``.

    Raises:
        ValidationError: If the upload reports any other MIME type.
    """
    is_pdf = field.data.mimetype == 'application/pdf'
    if not is_pdf:
        raise ValidationError('PDF files only!')
def __init__(self, *args, **kwargs):
version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version']) # noqa
service_manifest = SERVICES['tesseract-ocr-pipeline']
version = kwargs.pop('version', service_manifest['latest_version'])
super().__init__(*args, **kwargs)
service_info = SERVICES['tesseract-ocr']['versions'][version]
service_info = service_manifest['versions'][version]
if 'binarization' not in service_info['methods']:
self.binarization.render_kw = {'disabled': True}
self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()] # noqa
self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']] # noqa
compatible_models = [
x for x in TesseractOCRModel.query.filter_by(shared=True).all()
if version in x.compatible_service_versions
]
compatible_models += [
x for x in TesseractOCRModel.query.filter_by(shared=False, user=current_user).all()
if version in x.compatible_service_versions
]
self.model.choices = [('', 'Choose your option')]
self.model.choices += [(x.hashid, x.title) for x in compatible_models]
self.model.default = ''
self.version.choices = [(x, x) for x in service_manifest['versions']]
self.version.data = version
self.version.default = SERVICES['tesseract-ocr']['latest_version']
self.version.default = service_manifest['latest_version']
class AddFileSetupJobForm(AddJobForm):
files = MultipleFileField('Files', validators=[DataRequired()])
class AddTranskribusHTRPipelineJobForm(AddJobForm):
binarization = BooleanField('Binarization')
pdf = FileField('File', validators=[FileRequired()])
model = SelectField('Model', validators=[DataRequired()])
def validate_files(form, field):
    """Ensure every uploaded file has an approved image extension.

    The comparison uses the lower-cased filename, so it is case-insensitive.

    Raises:
        ValidationError: If a filename does not end with an approved
            extension. The message lists the approved extensions.
    """
    valid_extensions = ('.jpeg', '.jpg', '.png', '.tiff', '.tif')
    for file in field.data:
        if not file.filename.lower().endswith(valid_extensions):
            # The prefix and the joined list must be combined with ``+``.
            # Adjacent string literals are merged *before* ``.join`` is
            # applied, which turned the whole prefix into the separator.
            raise ValidationError(
                'File does not have an approved extension: '
                + '/'.join(valid_extensions)
            )
def validate_binarization(self, field):
    """Refuse binarization when the selected version lacks it.

    The selected version is looked up in the ``transkribus-htr-pipeline``
    entry of ``SERVICES``; its ``methods`` list is only consulted when the
    box is ticked.

    Raises:
        ValidationError: If the box is ticked but ``binarization`` is not
            listed among the version's methods.
    """
    version_info = SERVICES['transkribus-htr-pipeline']['versions'][self.version.data]
    if not field.data:
        return
    if 'binarization' in version_info['methods']:
        return
    raise ValidationError('Binarization is not available')
def validate_pdf(self, field):
    """Accept the upload only when its MIME type is ``application/pdf``.

    Raises:
        ValidationError: If the upload reports any other MIME type.
    """
    is_pdf = field.data.mimetype == 'application/pdf'
    if not is_pdf:
        raise ValidationError('PDF files only!')
def __init__(self, *args, **kwargs):
version = kwargs.pop('version', SERVICES['file-setup']['latest_version']) # noqa
service_manifest = SERVICES['transkribus-htr-pipeline']
version = kwargs.pop('version', service_manifest['latest_version'])
super().__init__(*args, **kwargs)
self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']] # noqa
service_info = service_manifest['versions'][version]
if 'binarization' not in service_info['methods']:
self.binarization.render_kw = {'disabled': True}
self.model.choices = [('', 'Choose your option')]
self.model.choices += [
('37569', 'Tim Model'),
('29539', 'UCLUniversity of Toronto #7')
]
self.model.default = ''
self.version.choices = [(x, x) for x in service_manifest['versions']]
self.version.data = version
self.version.default = SERVICES['file-setup']['latest_version']
self.version.default = service_manifest['latest_version']
# Lookup table from a service identifier to the form class used to submit a
# job for that service.
AddJobForms = {
'file-setup': AddFileSetupJobForm,
'tesseract-ocr': AddTesseractOCRJobForm,
'spacy-nlp': AddSpacyNLPJobForm
}
class AddSpacyNLPPipelineJobForm(AddJobForm):
    """Job submission form for the ``spacy-nlp-pipeline`` service.

    Adds an encoding detection switch, a single text upload and a model
    selector to the base job form. Model and version choices are filled in
    from the service manifest when the form is instantiated.
    """

    encoding_detection = BooleanField('Encoding detection')
    txt = FileField('File', validators=[FileRequired()])
    model = SelectField('Model', validators=[DataRequired()])

    def validate_encoding_detection(self, field):
        """Refuse encoding detection when the selected version lacks it.

        Raises:
            ValidationError: If the box is ticked but ``encoding_detection``
                is not listed among the selected version's methods.
        """
        manifest = SERVICES['spacy-nlp-pipeline']
        version_info = manifest['versions'][self.version.data]
        if not field.data:
            return
        if 'encoding_detection' in version_info['methods']:
            return
        raise ValidationError('Encoding detection is not available!')

    def validate_txt(form, field):
        """Accept the upload only when its MIME type is ``text/plain``.

        Raises:
            ValidationError: If the upload reports any other MIME type.
        """
        is_plain_text = field.data.mimetype == 'text/plain'
        if not is_plain_text:
            raise ValidationError('Plain text files only!')

    def __init__(self, *args, **kwargs):
        """Build the form for one service version.

        Args:
            *args: Passed through to the parent form.
            **kwargs: Passed through to the parent form, except ``version``,
                which selects the service version and defaults to the
                manifest's ``latest_version``.
        """
        manifest = SERVICES['spacy-nlp-pipeline']
        # ``version`` must be removed before the parent constructor sees the
        # keyword arguments.
        selected = kwargs.pop('version', manifest['latest_version'])
        super().__init__(*args, **kwargs)
        version_info = manifest['versions'][selected]
        supports_detection = 'encoding_detection' in version_info['methods']
        if not supports_detection:
            self.encoding_detection.render_kw = {'disabled': True}
        # Placeholder entry first, then one choice per model of this version.
        self.model.choices = [
            ('', 'Choose your option'),
            *version_info['models'].items()
        ]
        self.model.default = ''
        self.version.choices = [(v, v) for v in manifest['versions']]
        self.version.data = selected
        # NOTE(review): the sibling pipeline forms set the default to the
        # manifest's latest version rather than the selected one - confirm
        # which is intended.
        self.version.default = selected

View File

@ -13,47 +13,33 @@ from flask_login import current_user, login_required
from werkzeug.utils import secure_filename
from . import bp
from . import SERVICES
from .forms import AddJobForms
from .forms import (
AddFileSetupPipelineJobForm,
AddTesseractOCRPipelineJobForm,
AddTranskribusHTRPipelineJobForm,
AddSpacyNLPPipelineJobForm
)
import json
@bp.route('/corpus-analysis')
@bp.route('/file-setup-pipeline', methods=['GET', 'POST'])
@login_required
def corpus_analysis():
return render_template(
'services/corpus_analysis.html.j2',
title='Corpus analysis'
)
@bp.route('/<service>', methods=['GET', 'POST'])
@login_required
def service(service):
# Check if the requested service exist
if service not in SERVICES or service not in AddJobForms:
def file_setup_pipeline():
service = 'file-setup-pipeline'
service_manifest = SERVICES[service]
version = request.args.get('version', service_manifest['latest_version'])
if version not in service_manifest['versions']:
abort(404)
version = request.args.get('version', SERVICES[service]['latest_version'])
if version not in SERVICES[service]['versions']:
abort(404)
form = AddJobForms[service](prefix='add-job-form', version=version)
title = SERVICES[service]['name']
form = AddFileSetupPipelineJobForm(prefix='add-job-form', version=version)
if form.is_submitted():
if not form.validate():
return make_response(form.errors, 400)
service_args = {}
if service == 'spacy-nlp':
service_args['model'] = form.model.data
if form.encoding_detection.data:
service_args['encoding_detection'] = True
if service == 'tesseract-ocr':
service_args['model'] = hashids.decode(form.model.data)
if form.binarization.data:
service_args['binarization'] = True
job = Job(
user=current_user,
description=form.description.data,
service=service,
service_args=json.dumps(service_args),
service_args=service_args,
service_version=form.version.data,
title=form.title.data
)
@ -67,18 +53,17 @@ def service(service):
db.session.rollback()
flash('Internal Server Error', 'error')
return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa
for file in form.files.data:
filename = secure_filename(file.filename)
for image_file in form.images.data:
job_input = JobInput(
filename=filename,
filename=secure_filename(image_file.filename),
job=job,
mimetype=file.mimetype
mimetype=image_file.mimetype
)
db.session.add(job_input)
db.session.flush(objects=[job_input])
db.session.refresh(job_input)
try:
file.save(job_input.path)
image_file.save(job_input.path)
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
@ -91,5 +76,196 @@ def service(service):
return render_template(
f'services/{service.replace("-", "_")}.html.j2',
form=form,
title=title
title=service_manifest['name']
)
@bp.route('/tesseract-ocr-pipeline', methods=['GET', 'POST'])
@login_required
def tesseract_ocr_pipeline():
    """Show the Tesseract OCR pipeline page and create a job on submission.

    The service version comes from the ``version`` query argument and
    defaults to the manifest's latest version.

    Responses:
        404: The requested version is not listed in the service manifest.
        400: The submitted form failed validation; the body holds the errors.
        500: Creating the job directory or saving the upload raised
            ``OSError``; the session is rolled back and the body holds a
            ``redirect_url`` back to this page.
        201: The job was stored and marked as submitted; the body holds a
            ``redirect_url`` to the job page.
        Otherwise the service template is rendered with the form.
    """
    service = 'tesseract-ocr-pipeline'
    service_manifest = SERVICES[service]
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddTesseractOCRPipelineJobForm(prefix='add-job-form', version=version)
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        service_args = {'model': hashids.decode(form.model.data)}
        if form.binarization.data:
            service_args['binarization'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            # Redirect to this view's own endpoint; the generic ``.service``
            # endpoint is no longer registered.
            return make_response({'redirect_url': url_for('.tesseract_ocr_pipeline')}, 500)  # noqa
        job_input = JobInput(
            filename=secure_filename(form.pdf.data.filename),
            job=job,
            mimetype=form.pdf.data.mimetype
        )
        db.session.add(job_input)
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.pdf.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': url_for('.tesseract_ocr_pipeline')}, 500)  # noqa
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
@bp.route('/transkribus-htr-pipeline', methods=['GET', 'POST'])
@login_required
def transkribus_htr_pipeline():
    """Show the Transkribus HTR pipeline page and create a job on submission.

    The view is only available when the ``NOPAQUE_TRANSKRIBUS_ENABLED``
    configuration value is truthy. The service version comes from the
    ``version`` query argument and defaults to the manifest's latest version.

    Responses:
        404: The service is disabled, or the requested version is not listed
            in the service manifest.
        400: The submitted form failed validation; the body holds the errors.
        500: Creating the job directory or saving the upload raised
            ``OSError``; the session is rolled back and the body holds a
            ``redirect_url`` back to this page.
        201: The job was stored and marked as submitted; the body holds a
            ``redirect_url`` to the job page.
        Otherwise the service template is rendered with the form.
    """
    if not current_app.config.get('NOPAQUE_TRANSKRIBUS_ENABLED'):
        abort(404)
    service = 'transkribus-htr-pipeline'
    service_manifest = SERVICES[service]
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddTranskribusHTRPipelineJobForm(prefix='add-job-form', version=version)
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        service_args = {'model': form.model.data}
        if form.binarization.data:
            service_args['binarization'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            # Redirect to this view's own endpoint; the generic ``.service``
            # endpoint is no longer registered.
            return make_response({'redirect_url': url_for('.transkribus_htr_pipeline')}, 500)  # noqa
        job_input = JobInput(
            filename=secure_filename(form.pdf.data.filename),
            job=job,
            mimetype=form.pdf.data.mimetype
        )
        db.session.add(job_input)
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.pdf.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': url_for('.transkribus_htr_pipeline')}, 500)  # noqa
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
@bp.route('/spacy-nlp-pipeline', methods=['GET', 'POST'])
@login_required
def spacy_nlp_pipeline():
    """Show the spaCy NLP pipeline page and create a job on submission.

    The service version comes from the ``version`` query argument and
    defaults to the manifest's latest version.

    Responses:
        404: The requested version is not listed in the service manifest.
        400: The submitted form failed validation; the body holds the errors.
        500: Creating the job directory or saving the upload raised
            ``OSError``; the session is rolled back and the body holds a
            ``redirect_url`` back to this page.
        201: The job was stored and marked as submitted; the body holds a
            ``redirect_url`` to the job page.
        Otherwise the service template is rendered with the form.
    """
    service = 'spacy-nlp-pipeline'
    service_manifest = SERVICES[service]
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddSpacyNLPPipelineJobForm(prefix='add-job-form', version=version)
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        service_args = {'model': form.model.data}
        if form.encoding_detection.data:
            service_args['encoding_detection'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            # Redirect to this view's own endpoint; the generic ``.service``
            # endpoint is no longer registered.
            return make_response({'redirect_url': url_for('.spacy_nlp_pipeline')}, 500)  # noqa
        job_input = JobInput(
            filename=secure_filename(form.txt.data.filename),
            job=job,
            mimetype=form.txt.data.mimetype
        )
        db.session.add(job_input)
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.txt.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': url_for('.spacy_nlp_pipeline')}, 500)  # noqa
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
@bp.route('/corpus-analysis')
@login_required
def corpus_analysis():
    """Render the corpus analysis page for a logged-in user."""
    template_name = 'services/corpus_analysis.html.j2'
    return render_template(template_name, title='Corpus analysis')

View File

@ -1,38 +1,70 @@
# TODO: This could also be done via GitLab/GitHub APIs
#file-setup-pipeline:
file-setup:
file-setup-pipeline:
name: 'File setup pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF'
latest_version: '0.1.0'
versions:
0.1.0:
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0'
#spacy-nlp-pipeline:
spacy-nlp:
name: 'spaCy NLP'
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline/-/releases/v0.1.0'
tesseract-ocr-pipeline:
name: 'Tesseract OCR Pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF'
latest_version: '0.1.4'
versions:
0.1.0:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.0'
0.1.1:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.1'
0.1.2:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.2'
0.1.3:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.3'
0.1.4:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.4'
transkribus-htr-pipeline:
name: 'Transkribus HTR Pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF'
latest_version: '0.1.0'
versions:
0.1.0:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/transkribus-htr-pipeline/-/releases/v0.1.0'
spacy-nlp-pipeline:
name: 'spaCy NLP Pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF'
latest_version: '0.1.0'
versions:
0.1.0:
methods:
- 'encoding_detection'
models:
ca: 'Catalan'
de: 'German'
el: 'Greek'
en: 'English'
es: 'Spanish'
fr: 'French'
it: 'Italian'
pl: 'Polish'
ru: 'Russian'
zh: 'Chinese'
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0'
#tesseract-ocr-pipeline:
tesseract-ocr:
name: 'Tesseract OCR'
latest_version: '0.1.0'
versions:
0.1.0:
methods:
- 'binarization'
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0'
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.0'