From f4f0628b60b5d88dd5cd5d42aa355c99ec073294 Mon Sep 17 00:00:00 2001 From: Inga Kirschnick Date: Thu, 10 Nov 2022 12:14:03 +0100 Subject: [PATCH 1/3] Ocropus nlbin threshold extension --- app/services/forms.py | 4 ++++ app/services/routes.py | 3 ++- app/templates/services/tesseract_ocr_pipeline.html.j2 | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/app/services/forms.py b/app/services/forms.py index 5c0af906..9d7edf80 100644 --- a/app/services/forms.py +++ b/app/services/forms.py @@ -3,6 +3,7 @@ from flask_wtf import FlaskForm from flask_wtf.file import FileField, FileRequired from wtforms import ( BooleanField, + IntegerRangeField, MultipleFileField, SelectField, StringField, @@ -49,6 +50,9 @@ class CreateTesseractOCRPipelineJobForm(CreateJobBaseForm): binarization = BooleanField('Binarization') pdf = FileField('File', validators=[FileRequired()]) model = SelectField('Model', validators=[InputRequired()]) + ocropus_nlbin_threshold = IntegerRangeField( + render_kw={'tooltips':'false', 'min': 0, 'max': 1, 'step': 0.1, 'start': [0.5]} + ) def validate_binarization(self, field): service_info = SERVICES['tesseract-ocr-pipeline']['versions'][self.version.data] diff --git a/app/services/routes.py b/app/services/routes.py index b34d0619..7fe7a0eb 100644 --- a/app/services/routes.py +++ b/app/services/routes.py @@ -78,7 +78,8 @@ def tesseract_ocr_pipeline(): service=service_name, service_args={ 'binarization': form.binarization.data, - 'model': hashids.decode(form.model.data) + 'model': hashids.decode(form.model.data), + 'ocropus_nlbin_threshold': form.ocropus_nlbin_threshold.data }, service_version=form.version.data, user=current_user diff --git a/app/templates/services/tesseract_ocr_pipeline.html.j2 b/app/templates/services/tesseract_ocr_pipeline.html.j2 index 982265bc..31bf155c 100644 --- a/app/templates/services/tesseract_ocr_pipeline.html.j2 +++ b/app/templates/services/tesseract_ocr_pipeline.html.j2 @@ -83,6 +83,11 @@ +

 

+
+

Intensity

+

{{ form.ocropus_nlbin_threshold() }}

+

 

 

From 79d76f158f754ed5ec1b4937b910d9ed88995624 Mon Sep 17 00:00:00 2001 From: Inga Kirschnick Date: Thu, 10 Nov 2022 16:19:58 +0100 Subject: [PATCH 2/3] update binarization threshold --- app/daemon/job_utils.py | 2 ++ app/services/forms.py | 6 +++--- app/services/routes.py | 2 +- app/templates/services/tesseract_ocr_pipeline.html.j2 | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/app/daemon/job_utils.py b/app/daemon/job_utils.py index 32def73d..99a6ee75 100644 --- a/app/daemon/job_utils.py +++ b/app/daemon/job_utils.py @@ -59,6 +59,8 @@ def _create_job_service(job): command += f' -m {job.service_args["model"]}' if 'binarization' in job.service_args and job.service_args['binarization']: command += ' --binarize' + value = job.service_args['ocropus_nlbin_threshold'] + command += f' --ocropus-nlbin-threshold {value}' elif job.service == 'transkribus-htr-pipeline': transkribus_htr_pipeline_model_id = job.service_args['model'] command += f' -m {transkribus_htr_pipeline_model_id}' diff --git a/app/services/forms.py b/app/services/forms.py index 9d7edf80..97218ddb 100644 --- a/app/services/forms.py +++ b/app/services/forms.py @@ -3,7 +3,7 @@ from flask_wtf import FlaskForm from flask_wtf.file import FileField, FileRequired from wtforms import ( BooleanField, - IntegerRangeField, + DecimalRangeField, MultipleFileField, SelectField, StringField, @@ -50,8 +50,8 @@ class CreateTesseractOCRPipelineJobForm(CreateJobBaseForm): binarization = BooleanField('Binarization') pdf = FileField('File', validators=[FileRequired()]) model = SelectField('Model', validators=[InputRequired()]) - ocropus_nlbin_threshold = IntegerRangeField( - render_kw={'tooltips':'false', 'min': 0, 'max': 1, 'step': 0.1, 'start': [0.5]} + ocropus_nlbin_threshold = DecimalRangeField( + render_kw={'min': 0, 'max': 1, 'step': 0.1, 'start': [0.5]} ) def validate_binarization(self, field): diff --git a/app/services/routes.py b/app/services/routes.py index 7fe7a0eb..4e0266d8 100644 --- a/app/services/routes.py +++ b/app/services/routes.py @@ -79,7 +79,7 @@ def tesseract_ocr_pipeline(): service_args={ 'binarization': form.binarization.data, 'model': hashids.decode(form.model.data), - 'ocropus_nlbin_threshold': form.ocropus_nlbin_threshold.data + 'ocropus_nlbin_threshold': float(form.ocropus_nlbin_threshold.data) }, service_version=form.version.data, user=current_user diff --git a/app/templates/services/tesseract_ocr_pipeline.html.j2 b/app/templates/services/tesseract_ocr_pipeline.html.j2 index 31bf155c..8f4f2cc4 100644 --- a/app/templates/services/tesseract_ocr_pipeline.html.j2 +++ b/app/templates/services/tesseract_ocr_pipeline.html.j2 @@ -85,7 +85,7 @@

 

-

Intensity

+

Intensity (between 0 and 1)

{{ form.ocropus_nlbin_threshold() }}

 

From 176a67757aa1dcdab8cebc713cdeb131a94aea12 Mon Sep 17 00:00:00 2001 From: Inga Kirschnick Date: Mon, 14 Nov 2022 12:25:26 +0100 Subject: [PATCH 3/3] small fixes --- app/daemon/job_utils.py | 3 +-- app/services/forms.py | 25 +++++++++++++------------ app/services/services.yml | 1 + 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/app/daemon/job_utils.py b/app/daemon/job_utils.py index ce43a563..cfb362db 100644 --- a/app/daemon/job_utils.py +++ b/app/daemon/job_utils.py @@ -65,6 +65,7 @@ def _create_job_service(job): command += f' -m {job.service_args["model"]}' if 'binarization' in job.service_args and job.service_args['binarization']: command += ' --binarize' + if 'ocropus_nlbin_threshold' in job.service_args and job.service_args['ocropus_nlbin_threshold']: value = job.service_args['ocropus_nlbin_threshold'] command += f' --ocropus-nlbin-threshold {value}' elif job.service == 'transkribus-htr-pipeline': @@ -146,8 +147,6 @@ def _create_job_service(job): ) ''' ## Restart policy ## ''' restart_policy = docker.types.RestartPolicy() - print(command) - print(mounts) try: docker_client.services.create( image, diff --git a/app/services/forms.py b/app/services/forms.py index 60ac9a50..96caecc4 100644 --- a/app/services/forms.py +++ b/app/services/forms.py @@ -1,17 +1,12 @@ from flask_login import current_user from flask_wtf import FlaskForm from flask_wtf.file import FileField, FileRequired -from wtforms import ( - BooleanField, - DecimalRangeField, - MultipleFileField, - SelectField, - StringField, - SubmitField, - ValidationError -) +from wtforms import (BooleanField, DecimalRangeField, MultipleFileField, + SelectField, StringField, SubmitField, ValidationError) from wtforms.validators import InputRequired, Length -from app.models import TesseractOCRPipelineModel, SpaCyNLPPipelineModel + +from app.models import SpaCyNLPPipelineModel, TesseractOCRPipelineModel + from . import SERVICES @@ -51,7 +46,7 @@ class CreateTesseractOCRPipelineJobForm(CreateJobBaseForm): pdf = FileField('File', validators=[FileRequired()]) model = SelectField('Model', validators=[InputRequired()]) ocropus_nlbin_threshold = DecimalRangeField( - render_kw={'min': 0, 'max': 1, 'step': 0.1, 'start': [0.5]} + render_kw={'min': 0, 'max': 1, 'step': 0.1, 'start': [0.5], 'disabled': True} ) def validate_binarization(self, field): @@ -59,7 +54,7 @@ class CreateTesseractOCRPipelineJobForm(CreateJobBaseForm): if field.data: if not('methods' in service_info and 'binarization' in service_info['methods']): raise ValidationError('Binarization is not available') - + def validate_pdf(self, field): if field.data.mimetype != 'application/pdf': raise ValidationError('PDF files only!') @@ -72,10 +67,16 @@ class CreateTesseractOCRPipelineJobForm(CreateJobBaseForm): if self.binarization.render_kw is None: self.binarization.render_kw = {} self.binarization.render_kw['disabled'] = True + if self.ocropus_nlbin_threshold.render_kw is None: + self.ocropus_nlbin_threshold.render_kw = {} + self.ocropus_nlbin_threshold.render_kw['disabled'] = True if 'methods' in service_info: if 'binarization' in service_info['methods']: if 'disabled' in self.binarization.render_kw: del self.binarization.render_kw['disabled'] + if 'ocropus_nlbin_threshold' in service_info['methods']: + if 'disabled' in self.ocropus_nlbin_threshold.render_kw: + del self.ocropus_nlbin_threshold.render_kw['disabled'] models = [ x for x in TesseractOCRPipelineModel.query.order_by(TesseractOCRPipelineModel.title).all() if version in x.compatible_service_versions and (x.shared == True or x.user == current_user) diff --git a/app/services/services.yml b/app/services/services.yml index c9d61e08..8a8377d5 100644 --- a/app/services/services.yml +++ b/app/services/services.yml @@ -20,6 +20,7 @@ tesseract-ocr-pipeline: 0.1.1: methods: - 'binarization' + - 'ocropus_nlbin_threshold' publishing_year: 2022 url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.1' transkribus-htr-pipeline: