Disable binarization for old OCR service versions. Add new OCR service version (including binarization).

This commit is contained in:
Patrick Jentsch
2022-04-13 16:39:51 +02:00
parent 43ea898394
commit d35ca7c261
3 changed files with 148 additions and 25 deletions

View File

@ -46,8 +46,12 @@ class AddTesseractOCRPipelineJobForm(AddJobForm):
def validate_binarization(self, field):
service_info = SERVICES['tesseract-ocr-pipeline']['versions'][self.version.data]
if field.data and 'binarization' not in service_info['methods']:
raise ValidationError('Binarization is not available')
if field.data:
if(
'methods' not in service_info
or 'binarization' not in service_info['methods']
):
raise ValidationError('Binarization is not available')
def validate_pdf(self, field):
if field.data.mimetype != 'application/pdf':
@ -58,8 +62,13 @@ class AddTesseractOCRPipelineJobForm(AddJobForm):
version = kwargs.pop('version', service_manifest['latest_version'])
super().__init__(*args, **kwargs)
service_info = service_manifest['versions'][version]
if 'binarization' not in service_info['methods']:
self.binarization.render_kw = {'disabled': True}
if self.binarization.render_kw is None:
self.binarization.render_kw = {}
self.binarization.render_kw['disabled'] = True
if 'methods' in service_info:
if 'binarization' in service_info['methods']:
if 'disabled' in self.binarization.render_kw:
del self.binarization.render_kw['disabled']
compatible_models = [
x for x in TesseractOCRModel.query.filter_by(shared=True).all()
if version in x.compatible_service_versions
@ -83,8 +92,12 @@ class AddTranskribusHTRPipelineJobForm(AddJobForm):
def validate_binarization(self, field):
service_info = SERVICES['transkribus-htr-pipeline']['versions'][self.version.data]
if field.data and 'binarization' not in service_info['methods']:
raise ValidationError('Binarization is not available')
if field.data:
if(
'methods' not in service_info
or 'binarization' not in service_info['methods']
):
raise ValidationError('Binarization is not available')
def validate_pdf(self, field):
if field.data.mimetype != 'application/pdf':
@ -95,8 +108,13 @@ class AddTranskribusHTRPipelineJobForm(AddJobForm):
version = kwargs.pop('version', service_manifest['latest_version'])
super().__init__(*args, **kwargs)
service_info = service_manifest['versions'][version]
if 'binarization' not in service_info['methods']:
self.binarization.render_kw = {'disabled': True}
if self.binarization.render_kw is None:
self.binarization.render_kw = {}
self.binarization.render_kw['disabled'] = True
if 'methods' in service_info:
if 'binarization' in service_info['methods']:
if 'disabled' in self.binarization.render_kw:
del self.binarization.render_kw['disabled']
self.model.choices = [('', 'Choose your option')]
self.model.choices += [
('37569', 'Tim Model'),
@ -109,15 +127,18 @@ class AddTranskribusHTRPipelineJobForm(AddJobForm):
class AddSpacyNLPPipelineJobForm(AddJobForm):
encoding_detection = BooleanField('Encoding detection')
encoding_detection = BooleanField('Encoding detection', render_kw={'disabled': True})
txt = FileField('File', validators=[FileRequired()])
model = SelectField('Model', validators=[DataRequired()])
def validate_encoding_detection(self, field):
service_manifest = SERVICES['spacy-nlp-pipeline']
service_info = service_manifest['versions'][self.version.data]
if field.data and 'encoding_detection' not in service_info['methods']:
raise ValidationError('Encoding detection is not available!')
service_info = SERVICES['spacy-nlp-pipeline']['versions'][self.version.data]
if field.data:
if(
'methods' not in service_info
or 'encoding_detection' not in service_info['methods']
):
raise ValidationError('Encoding detection is not available')
def validate_txt(form, field):
if field.data.mimetype != 'text/plain':
@ -128,8 +149,13 @@ class AddSpacyNLPPipelineJobForm(AddJobForm):
version = kwargs.pop('version', service_manifest['latest_version'])
super().__init__(*args, **kwargs)
service_info = service_manifest['versions'][version]
if 'encoding_detection' not in service_info['methods']:
self.encoding_detection.render_kw = {'disabled': True}
if self.encoding_detection.render_kw is None:
self.encoding_detection.render_kw = {}
self.encoding_detection.render_kw['disabled'] = True
if 'methods' in service_info:
if 'encoding_detection' in service_info['methods']:
if 'disabled' in self.encoding_detection.render_kw:
del self.encoding_detection.render_kw['disabled']
self.model.choices = [('', 'Choose your option')]
self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa
self.model.default = ''

View File

@ -10,33 +10,28 @@ file-setup-pipeline:
tesseract-ocr-pipeline:
name: 'Tesseract OCR Pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF'
latest_version: '0.1.4'
latest_version: '0.1.5'
versions:
0.1.0:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.0'
0.1.1:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.1'
0.1.2:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.2'
0.1.3:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.3'
0.1.4:
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.4'
0.1.5:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.4'
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.5'
transkribus-htr-pipeline:
name: 'Transkribus HTR Pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF'