2022-02-03 11:39:16 +00:00
|
|
|
from app.models import TesseractOCRModel
|
2021-02-19 12:00:52 +00:00
|
|
|
from flask_wtf import FlaskForm
|
|
|
|
from wtforms import (BooleanField, MultipleFileField, SelectField, StringField,
|
|
|
|
SubmitField, ValidationError)
|
|
|
|
from wtforms.validators import DataRequired, Length
|
|
|
|
from . import SERVICES
|
|
|
|
|
|
|
|
|
|
|
|
class AddJobForm(FlaskForm):
    """Base form holding the fields shared by every job submission form.

    Subclasses add their service specific fields and are expected to fill
    in the choices of ``version``, which is declared without any here.
    """

    # NOTE: The declaration order is kept as is, WTForms orders the fields
    # of a form by the order in which they are created.
    description = StringField(
        label='Description',
        validators=[DataRequired(), Length(min=1, max=255)]
    )
    submit = SubmitField()
    title = StringField(
        label='Title',
        validators=[DataRequired(), Length(min=1, max=32)]
    )
    version = SelectField(label='Version', validators=[DataRequired()])
|
2021-02-19 12:00:52 +00:00
|
|
|
|
|
|
|
|
2022-02-03 11:39:16 +00:00
|
|
|
class AddSpacyNLPJobForm(AddJobForm):
    """Form for submitting a spaCy NLP job.

    Extends AddJobForm with ``.txt`` file uploads, a model selection and an
    encoding detection switch. Selectable models and versions are read from
    ``SERVICES['spacy-nlp']``.
    """

    encoding_detection = BooleanField('Encoding detection')
    files = MultipleFileField('Files', validators=[DataRequired()])
    model = SelectField(
        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )

    def validate_encoding_detection(self, field):
        """Reject encoding detection if the selected version lacks it.

        Uses the same availability check as ``__init__`` uses to disable
        the checkbox, so a disabled option can not be submitted anyway.

        Raises:
            ValidationError: If the box is ticked but 'check_encoding' is
                not listed in the methods of the selected version.
        """
        versions = SERVICES['spacy-nlp']['versions']
        service_info = versions.get(self.version.data)
        if service_info is None:
            # Unknown version: do not fail with a KeyError here.
            # presumably the version field's own choice validation reports
            # this case - verify against the WTForms version in use.
            return
        if field.data and 'check_encoding' not in service_info['methods']:
            raise ValidationError('Encoding detection is not available')

    def validate_files(form, field):
        """Ensure every uploaded file has a ``.txt`` extension.

        Raises:
            ValidationError: On the first file with another extension.
        """
        valid_extensions = ['.txt']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # The explicit "+" is required: adjacent string literals
                # are merged before ".join" is applied, which would turn
                # the whole sentence into the join separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Set up the version dependent parts of the form.

        Args:
            *args: Passed through to the parent form.
            **kwargs: Passed through to the parent form, except for the
                optional ``version`` which selects the service version the
                form is built for (defaults to the latest version).
        """
        service = SERVICES['spacy-nlp']
        version = kwargs.pop('version', service['latest_version'])
        super().__init__(*args, **kwargs)
        service_info = service['versions'][version]
        if 'check_encoding' not in service_info['methods']:
            self.encoding_detection.render_kw = {'disabled': True}
        # Rebind instead of extending in place, so the list given in the
        # class level field declaration is never mutated.
        self.model.choices = [
            *self.model.choices,
            *service_info['models'].items()
        ]
        self.version.choices = [(x, x) for x in service['versions']]
        # NOTE(review): the other job forms in this module also assign
        # version.data at this point, this one only sets the default -
        # confirm that this is intended.
        self.version.default = version
|
|
|
|
|
2021-02-19 12:00:52 +00:00
|
|
|
|
2022-02-03 11:39:16 +00:00
|
|
|
class AddTesseractOCRJobForm(AddJobForm):
    """Form for submitting a Tesseract OCR job.

    Extends AddJobForm with ``.pdf`` file uploads, a model selection and a
    binarization switch. Versions are read from
    ``SERVICES['tesseract-ocr']``, models from ``TesseractOCRModel``.
    """

    binarization = BooleanField('Binarization')
    files = MultipleFileField('Files', validators=[DataRequired()])
    model = SelectField(
        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )

    def validate_binarization(self, field):
        """Reject binarization if the selected version lacks it.

        Uses the same availability check as ``__init__`` uses to disable
        the checkbox.

        Raises:
            ValidationError: If the box is ticked but 'binarization' is
                not listed in the methods of the selected version.
        """
        versions = SERVICES['tesseract-ocr']['versions']
        service_info = versions.get(self.version.data)
        if service_info is None:
            # Unknown version: do not fail with a KeyError here.
            # presumably the version field's own choice validation reports
            # this case - verify against the WTForms version in use.
            return
        if field.data and 'binarization' not in service_info['methods']:
            raise ValidationError('Binarization is not available')

    def validate_files(self, field):
        """Ensure every uploaded file has a ``.pdf`` extension.

        Raises:
            ValidationError: On the first file with another extension.
        """
        valid_extensions = ['.pdf']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # The explicit "+" is required: adjacent string literals
                # are merged before ".join" is applied, which would turn
                # the whole sentence into the join separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Set up the version dependent parts of the form.

        Args:
            *args: Passed through to the parent form.
            **kwargs: Passed through to the parent form, except for the
                optional ``version`` which selects the service version the
                form is built for (defaults to the latest version).
        """
        service = SERVICES['tesseract-ocr']
        version = kwargs.pop('version', service['latest_version'])
        super().__init__(*args, **kwargs)
        service_info = service['versions'][version]
        if 'binarization' not in service_info['methods']:
            self.binarization.render_kw = {'disabled': True}
        # Rebind instead of extending in place, so the list given in the
        # class level field declaration is never mutated.
        self.model.choices = [
            *self.model.choices,
            *((x.hashid, x.title) for x in TesseractOCRModel.query.all())
        ]
        self.version.choices = [(x, x) for x in service['versions']]
        self.version.data = version
        self.version.default = service['latest_version']
|
2021-02-19 12:00:52 +00:00
|
|
|
|
|
|
|
|
|
|
|
class AddFileSetupJobForm(AddJobForm):
    """Form for submitting a file setup job.

    Extends AddJobForm with image file uploads. Versions are read from
    ``SERVICES['file-setup']``.
    """

    files = MultipleFileField('Files', validators=[DataRequired()])

    def validate_files(form, field):
        """Ensure every uploaded file has an approved image extension.

        Raises:
            ValidationError: On the first file whose name does not end in
                one of the approved extensions (case-insensitive).
        """
        valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # The explicit "+" is required: adjacent string literals
                # are merged before ".join" is applied, which would splice
                # the sentence in between the extensions.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Set up the version selection of the form.

        Args:
            *args: Passed through to the parent form.
            **kwargs: Passed through to the parent form, except for the
                optional ``version`` which is used as the data of the
                version field (defaults to the latest version).
        """
        service = SERVICES['file-setup']
        version = kwargs.pop('version', service['latest_version'])
        super().__init__(*args, **kwargs)
        self.version.choices = [(x, x) for x in service['versions']]
        self.version.data = version
        self.version.default = service['latest_version']
|
2021-08-04 10:26:49 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Maps a service name to the form class used to submit a job for it.
AddJobForms = {
    'file-setup': AddFileSetupJobForm,
    'tesseract-ocr': AddTesseractOCRJobForm,
    'spacy-nlp': AddSpacyNLPJobForm,
}
|