Contribution Package Tesseract OCR

This commit is contained in:
Inga Kirschnick 2022-11-03 15:38:35 +01:00
parent 024eeaa063
commit 46ba14b923
11 changed files with 495 additions and 25 deletions

View File

@ -1,3 +1,4 @@
from xml.dom import ValidationErr
from flask_wtf import FlaskForm from flask_wtf import FlaskForm
from flask_wtf.file import FileField, FileRequired from flask_wtf.file import FileField, FileRequired
from wtforms import ( from wtforms import (
@ -5,13 +6,13 @@ from wtforms import (
StringField, StringField,
SubmitField, SubmitField,
SelectMultipleField, SelectMultipleField,
IntegerField IntegerField,
ValidationError
) )
from wtforms.validators import InputRequired, Length from wtforms.validators import InputRequired, Length
from app.services import SERVICES from app.services import SERVICES
class CreateContributionBaseForm(FlaskForm):
class TesseractOCRModelContributionForm(FlaskForm):
title = StringField( title = StringField(
'Title', 'Title',
validators=[InputRequired(), Length(max=64)] validators=[InputRequired(), Length(max=64)]
@ -24,9 +25,6 @@ class TesseractOCRModelContributionForm(FlaskForm):
'Version', 'Version',
validators=[InputRequired(), Length(max=16)] validators=[InputRequired(), Length(max=16)]
) )
compatible_service_versions = SelectMultipleField(
'Compatible service versions'
)
publisher = StringField( publisher = StringField(
'Publisher', 'Publisher',
validators=[InputRequired(), Length(max=128)] validators=[InputRequired(), Length(max=128)]
@ -43,10 +41,22 @@ class TesseractOCRModelContributionForm(FlaskForm):
'Publishing year', 'Publishing year',
validators=[InputRequired()] validators=[InputRequired()]
) )
shared = BooleanField('Shared', validators=[InputRequired()]) shared = BooleanField(
model_file = FileField('File',validators=[FileRequired()]) 'Shared'
)
submit = SubmitField() submit = SubmitField()
class TesseractOCRModelContributionForm(CreateContributionBaseForm):
tesseract_model_file = FileField(
'File',
validators=[FileRequired()]
)
compatible_service_versions = SelectMultipleField(
'Compatible service versions'
)
def validate_tesseract_model_file(self, field):
    """Reject uploads whose filename does not end in ``.traineddata``.

    WTForms only invokes inline validators named ``validate_<field name>``;
    the upload field of this form is ``tesseract_model_file``, so the
    validator has to carry that name to be executed at all.

    Args:
        field: The file field being validated. ``field.data`` is expected
            to expose the uploaded file's ``filename``.

    Raises:
        ValidationError: If the filename lacks the ``.traineddata`` suffix.
    """
    # A MIME type never equals a file extension, so inspect the filename.
    filename = getattr(field.data, 'filename', None) or ''
    if not filename.endswith('.traineddata'):
        raise ValidationError('traineddata files only!')

# Backward-compatible alias for the original (never auto-invoked) name.
validate_traineddata = validate_tesseract_model_file
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
service_manifest = SERVICES['tesseract-ocr-pipeline'] service_manifest = SERVICES['tesseract-ocr-pipeline']
@ -56,3 +66,17 @@ class TesseractOCRModelContributionForm(FlaskForm):
(x, x) for x in service_manifest['versions'].keys() (x, x) for x in service_manifest['versions'].keys()
] ]
self.compatible_service_versions.default = '' self.compatible_service_versions.default = ''
class TesseractOCRModelEditForm(CreateContributionBaseForm):
    """Form for editing the metadata of an existing model."""

    def prefill(self, model_file):
        """Copy the metadata stored on ``model_file`` into the form fields."""
        field_names = (
            'title',
            'description',
            'publisher',
            'publishing_year',
            'publisher_url',
            'publishing_url',
            'version',
            'shared',
        )
        for field_name in field_names:
            getattr(self, field_name).data = getattr(model_file, field_name)

View File

@ -1,10 +1,11 @@
from flask import abort, flash, Markup, render_template, url_for from flask import abort, current_app, flash, Markup, redirect, render_template, url_for
from flask_login import login_required from flask_login import login_required, current_user
from threading import Thread
from app import db from app import db
from app.decorators import permission_required from app.decorators import admin_required, permission_required
from app.models import TesseractOCRPipelineModel, Permission from app.models import TesseractOCRPipelineModel, Permission
from . import bp from . import bp
from .forms import TesseractOCRModelContributionForm from .forms import TesseractOCRModelContributionForm, TesseractOCRModelEditForm
@bp.before_request @bp.before_request
@ -14,13 +15,77 @@ def before_request():
pass pass
@bp.route('') @bp.route('/')
@login_required
@admin_required
def contributions(): def contributions():
pass tesseract_ocr_user_models = [
x for x in current_user.tesseract_ocr_pipeline_models
]
return render_template(
'contributions/contribution_overview.html.j2',
tesseractOCRUserModels=tesseract_ocr_user_models,
userId = current_user.hashid,
title='Contribution Overview'
)
@bp.route('/<hashid:tesseract_ocr_pipeline_model_id>', methods=['GET', 'POST'])
@login_required
def tesseract_ocr_pipeline_model(tesseract_ocr_pipeline_model_id):
    """Render and process the edit form for one Tesseract OCR model.

    On a valid submission the changed metadata is committed, a success
    message is flashed and an empty ``201`` response whose ``Location``
    header points at the contribution overview is returned. Otherwise the
    form is pre-filled from the stored model and the edit page is rendered.

    Aborts with 404 for an unknown id and with 403 if the current user is
    neither the model's owner nor an administrator.
    """
    tesseract_ocr_pipeline_model = TesseractOCRPipelineModel.query.get_or_404(
        tesseract_ocr_pipeline_model_id
    )
    # Apply the same ownership rule as the DELETE route so that a logged-in
    # user cannot edit somebody else's model.
    if not (
        tesseract_ocr_pipeline_model.user == current_user
        or current_user.is_administrator()
    ):
        abort(403)
    form = TesseractOCRModelEditForm(prefix='tesseract-ocr-model-edit-form')
    if form.validate_on_submit():
        editable_fields = (
            'title',
            'description',
            'publisher',
            'publishing_year',
            'publisher_url',
            'publishing_url',
            'version',
            'shared',
        )
        # Only touch attributes whose submitted value actually differs.
        for field_name in editable_fields:
            submitted_value = getattr(form, field_name).data
            if getattr(tesseract_ocr_pipeline_model, field_name) != submitted_value:
                setattr(tesseract_ocr_pipeline_model, field_name, submitted_value)
        db.session.commit()
        # The overview lives at the blueprint root and the edit page directly
        # below it, so the model link is the overview URL plus the hashid.
        overview_url = url_for('contributions.contributions')
        model_url = f'{overview_url}{tesseract_ocr_pipeline_model.hashid}'
        # Markup.format() escapes its arguments, so a title containing HTML
        # cannot inject markup into the flashed message.
        message = Markup('Model "<a href="{}">{}</a>" updated').format(
            model_url,
            tesseract_ocr_pipeline_model.title
        )
        flash(message, category='corpus')
        return {}, 201, {'Location': overview_url}
    form.prefill(tesseract_ocr_pipeline_model)
    return render_template(
        'contributions/tesseract_ocr_pipeline_model.html.j2',
        tesseract_ocr_pipeline_model=tesseract_ocr_pipeline_model,
        form=form,
        title='Edit your Tesseract OCR model'
    )
@bp.route('/tesseract-ocr-pipeline-models', methods=['GET', 'POST']) @bp.route('/<hashid:tesseract_ocr_pipeline_model_id>', methods=['DELETE'])
def tesseract_ocr_pipeline_models(): @login_required
def delete_tesseract_model(tesseract_ocr_pipeline_model_id):
    """Start the deletion of a Tesseract OCR model in a background thread.

    Aborts with 404 for an unknown id and with 403 if the current user is
    neither the model's owner nor an administrator. Otherwise the deletion
    is handed to a thread and an empty ``202`` response is returned
    immediately.
    """
    def _delete_tesseract_model(app, tesseract_ocr_pipeline_model_id):
        # The thread runs outside the request, so it needs its own app
        # context to use the database session.
        with app.app_context():
            model = TesseractOCRPipelineModel.query.get(
                tesseract_ocr_pipeline_model_id
            )
            if model is None:
                # Already removed, e.g. by a duplicate DELETE request that
                # passed the existence check before this thread ran.
                return
            model.delete()
            db.session.commit()

    model = TesseractOCRPipelineModel.query.get_or_404(
        tesseract_ocr_pipeline_model_id
    )
    if not (model.user == current_user or current_user.is_administrator()):
        abort(403)
    thread = Thread(
        target=_delete_tesseract_model,
        args=(current_app._get_current_object(), tesseract_ocr_pipeline_model_id)
    )
    thread.start()
    return {}, 202
@bp.route('/add-tesseract-ocr-pipeline-model', methods=['GET', 'POST'])
def add_tesseract_ocr_pipeline_model():
form = TesseractOCRModelContributionForm( form = TesseractOCRModelContributionForm(
prefix='contribute-tesseract-ocr-pipeline-model-form' prefix='contribute-tesseract-ocr-pipeline-model-form'
) )
@ -30,7 +95,7 @@ def tesseract_ocr_pipeline_models():
return response, 400 return response, 400
try: try:
tesseract_ocr_model = TesseractOCRPipelineModel.create( tesseract_ocr_model = TesseractOCRPipelineModel.create(
form.file.data, form.tesseract_model_file.data,
compatible_service_versions=form.compatible_service_versions.data, compatible_service_versions=form.compatible_service_versions.data,
description=form.description.data, description=form.description.data,
publisher=form.publisher.data, publisher=form.publisher.data,
@ -39,7 +104,8 @@ def tesseract_ocr_pipeline_models():
publishing_year=form.publishing_year.data, publishing_year=form.publishing_year.data,
shared=form.shared.data, shared=form.shared.data,
title=form.title.data, title=form.title.data,
version=form.version.data version=form.version.data,
user=current_user
) )
except OSError: except OSError:
abort(500) abort(500)
@ -47,8 +113,13 @@ def tesseract_ocr_pipeline_models():
message = Markup(f'Model "{tesseract_ocr_model.title}" created') message = Markup(f'Model "{tesseract_ocr_model.title}" created')
flash(message) flash(message)
return {}, 201, {'Location': url_for('contributions.contributions')} return {}, 201, {'Location': url_for('contributions.contributions')}
tesseract_ocr_pipeline_models = [
x for x in TesseractOCRPipelineModel.query.all()
]
return render_template( return render_template(
'contributions/contribute.html.j2', 'contributions/contribute_tesseract_ocr_models.html.j2',
form=form, form=form,
title='Contribution' tesseract_ocr_pipeline_models=tesseract_ocr_pipeline_models,
title='Tesseract OCR Model Contribution'
) )

View File

@ -603,6 +603,13 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model):
pbar.close() pbar.close()
db.session.commit() db.session.commit()
def delete(self):
    """Remove the model file from disk and mark the row for deletion.

    A failing file removal is logged and does not prevent the database
    delete; committing the session is left to the caller.
    """
    try:
        os.remove(self.path)
    except OSError as removal_error:
        current_app.logger.error(removal_error)
    db.session.delete(self)
def to_json(self, backrefs=False, relationships=False): def to_json(self, backrefs=False, relationships=False):
_json = { _json = {
'id': self.hashid, 'id': self.hashid,
@ -1023,11 +1030,8 @@ class CorpusFile(FileMixin, HashidMixin, db.Model):
def delete(self): def delete(self):
try: try:
os.remove(self.path) os.remove(self.path)
except OSError: except OSError as e:
current_app.logger.error( current_app.logger.error(e)
f'Removing {self.path} led to an OSError!'
)
pass
db.session.delete(self) db.session.delete(self)
self.corpus.status = CorpusStatus.UNPREPARED self.corpus.status = CorpusStatus.UNPREPARED

View File

@ -0,0 +1,18 @@
/**
 * Form handler for contribution forms: follows the Location header once
 * the server answers a submission with status 201.
 */
class CreateContributionForm extends Form {
  /** Attach a handler to every element with class "create-contribution-form". */
  static autoInit() {
    document.querySelectorAll('.create-contribution-form').forEach((formElement) => {
      new CreateContributionForm(formElement);
    });
  }

  constructor(formElement) {
    super(formElement);

    this.addEventListener('requestLoad', (event) => {
      let request = event.target;
      if (request.status !== 201) {return;}
      window.location.href = request.getResponseHeader('Location');
    });
  }
}

View File

@ -1,5 +1,6 @@
class Form { class Form {
static autoInit() { static autoInit() {
CreateContributionForm.autoInit();
CreateCorpusFileForm.autoInit(); CreateCorpusFileForm.autoInit();
CreateJobForm.autoInit(); CreateJobForm.autoInit();
} }

View File

@ -0,0 +1,77 @@
/**
 * Wires up the delete and edit buttons of the Tesseract OCR model table.
 */
class TesseractOCRModelList {
  constructor () {
    this.elements = {
      tesseractOCRModelList: document.querySelector('#tesseract-ocr-model-list'),
      deleteButtons: document.querySelectorAll('.delete-button'),
      editButtons: document.querySelectorAll('.edit-button'),
    }
  }

  /** Register the click handlers for all delete and edit buttons. */
  init () {
    let userId = this.elements.tesseractOCRModelList.dataset.userId;

    for (let deleteButton of this.elements.deleteButtons) {
      deleteButton.addEventListener('click', () => {this.deleteModel(deleteButton, userId);});
    }

    for (let editButton of this.elements.editButtons) {
      editButton.addEventListener('click', () => {this.editModel(editButton);});
    }
  }

  /**
   * Ask for confirmation in a modal and, if confirmed, send a DELETE
   * request for the model referenced by the button.
   *
   * The returned promise resolves with the response on success and rejects
   * with the response on an HTTP error or with the error on a network
   * failure. It stays pending if the dialog is cancelled.
   */
  deleteModel(deleteButton, userId) {
    return new Promise((resolve, reject) => {
      let modelId = deleteButton.dataset.modelId;
      let model = app.data.users[userId].tesseract_ocr_pipeline_models[modelId];
      let modalElement = Utils.elementFromString(
        `
          <div class="modal">
            <div class="modal-content">
              <h4>Confirm model deletion</h4>
              <p>Do you really want to delete? All files will be permanently deleted!</p>
            </div>
            <div class="modal-footer">
              <a class="action-button btn modal-close waves-effect waves-light" data-action="cancel">Cancel</a>
              <a class="action-button btn modal-close red waves-effect waves-light" data-action="confirm">Delete</a>
            </div>
          </div>
        `
      );
      document.querySelector('#modals').appendChild(modalElement);
      let modal = M.Modal.init(
        modalElement,
        {
          dismissible: false,
          onCloseEnd: () => {
            modal.destroy();
            modalElement.remove();
          }
        }
      );

      let confirmElement = modalElement.querySelector('.action-button[data-action="confirm"]');
      confirmElement.addEventListener('click', (event) => {
        let modelTitle = model.title;
        fetch(`/contributions/${modelId}`, {method: 'DELETE'})
          .then(
            (response) => {
              // fetch() also fulfils for HTTP error statuses, so failures
              // have to be detected via response.ok here.
              if (!response.ok) {
                if (response.status === 403) {app.flash('Forbidden', 'error');}
                if (response.status === 404) {app.flash('Not Found', 'error');}
                reject(response);
                return;
              }
              app.flash(`Model "${modelTitle}" marked for deletion`, 'corpus');
              resolve(response);
            },
            (error) => {
              // Only reached on network failures; there is no HTTP status.
              reject(error);
            }
          );
      });
      modal.open();
    });
  }

  /** Navigate to the edit page of the model referenced by the button. */
  editModel(editButton) {
    window.location.href = `/contributions/${editButton.dataset.modelId}`;
  }
}

View File

@ -9,6 +9,7 @@
'js/Forms/Form.js', 'js/Forms/Form.js',
'js/Forms/CreateCorpusFileForm.js', 'js/Forms/CreateCorpusFileForm.js',
'js/Forms/CreateJobForm.js', 'js/Forms/CreateJobForm.js',
'js/Forms/CreateContributionForm.js',
'js/CorpusAnalysis/CQiClient.js', 'js/CorpusAnalysis/CQiClient.js',
'js/CorpusAnalysis/CorpusAnalysisApp.js', 'js/CorpusAnalysis/CorpusAnalysisApp.js',
'js/CorpusAnalysis/CorpusAnalysisConcordance.js', 'js/CorpusAnalysis/CorpusAnalysisConcordance.js',
@ -24,6 +25,7 @@
'js/RessourceLists/JobInputList.js', 'js/RessourceLists/JobInputList.js',
'js/RessourceLists/JobResultList.js', 'js/RessourceLists/JobResultList.js',
'js/RessourceLists/QueryResultList.js', 'js/RessourceLists/QueryResultList.js',
'js/RessourceLists/TesseractOCRModelList.js',
'js/RessourceLists/UserList.js' 'js/RessourceLists/UserList.js'
%} %}
<script src="{{ ASSET_URL }}"></script> <script src="{{ ASSET_URL }}"></script>

View File

@ -0,0 +1,18 @@
{# Breadcrumb tabs for the contribution pages: overview, edit model, add model. #}
{% set breadcrumbs %}
<li class="tab disabled"><i class="material-icons">navigate_next</i></li>
{% if request.path == url_for('.contributions') %}
<li class="tab"><a class="active" href="{{ url_for('.contributions') }}" target="_self">Contributions Overview</a></li>
{# Guard with "is defined": only the edit page passes a model into the context. #}
{# NOTE(review): the comparison below builds the URL from .id while the link uses .hashid; confirm which one the hashid converter expects. #}
{% elif tesseract_ocr_pipeline_model is defined and request.path == url_for('.tesseract_ocr_pipeline_model', tesseract_ocr_pipeline_model_id=tesseract_ocr_pipeline_model.id) %}
<li class="tab"><a href="{{ url_for('.contributions') }}" target="_self">Contributions Overview</a></li>
<li class="tab disabled"><i class="material-icons">navigate_next</i></li>
<li class="tab">
  <a class="active" href="{{ url_for('.tesseract_ocr_pipeline_model', tesseract_ocr_pipeline_model_id=tesseract_ocr_pipeline_model.hashid) }}" target="_self">
    Edit {{ tesseract_ocr_pipeline_model.title }}
  </a>
</li>
{% elif request.path == url_for('.add_tesseract_ocr_pipeline_model') %}
<li class="tab"><a href="{{ url_for('.contributions') }}" target="_self">Contributions Overview</a></li>
<li class="tab disabled"><i class="material-icons">navigate_next</i></li>
<li class="tab"><a class="active" href="{{ url_for('.add_tesseract_ocr_pipeline_model') }}" target="_self">{{ title }}</a></li>
{% endif %}
{% endset %}

View File

@ -0,0 +1,124 @@
{% extends "base.html.j2" %}
{% import "materialize/wtf.html.j2" as wtf %}
{# {% from "contributions/_breadcrumbs.html.j2" import breadcrumbs with context %} #}
{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr-pipeline"{% endblock main_attribs %}
{% block page_content %}
<div class="container">
<div class="row">
<div class="col s12">
<h1 id="title">{{ title }}</h1>
</div>
<div class="col s12 m3 push-m9">
<div class="center-align">
<p class="hide-on-small-only">&nbsp;</p>
<p class="hide-on-small-only">&nbsp;</p>
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr-pipeline"></i>
</a>
</div>
</div>
<div class="col s12 m9 pull-m3">
<div class="card service-color-border border-darken" data-service="tesseract-ocr-pipeline" style="border-top: 10px solid;">
<div class="card-content">
<div class="row">
<div class="col s12">
<div class="card-panel z-depth-0">
<span class="card-title"><i class="left material-icons">layers</i>Tesseract OCR Models</span>
<p>You can add more Tesseract OCR models using the form below. They will automatically appear in the list of usable models.</p>
<p><a class="modal-trigger" href="#models-modal">Information about the already existing models.</a></p>
<p><a href="{{ url_for('.contributions') }}">Edit already uploaded models</a></p>
</div>
</div>
</div>
</div>
</div>
</div>
<div class="col s12">
<h2>Add a model</h2>
<div class="card">
<form class="create-contribution-form" enctype="multipart/form-data" method="POST">
<div class="card-content">
{{ form.hidden_tag() }}
<div class="row">
<div class="col s12 l5">
{{ wtf.render_field(form.tesseract_model_file, accept='.traineddata', placeholder='Choose a .traineddata file') }}
</div>
<div class="col s12 l7">
{{ wtf.render_field(form.title, material_icon='title') }}
</div>
<div class="col s12">
{{ wtf.render_field(form.description, material_icon='description') }}
</div>
<div class="col s12 l6">
{{ wtf.render_field(form.publisher, material_icon='account_balance') }}
</div>
<div class="col s12 l6">
{{ wtf.render_field(form.publishing_year, material_icon='calendar_month') }}
</div>
<div class="col s12">
{{ wtf.render_field(form.publisher_url, material_icon='link') }}
</div>
<div class="col s12">
{{ wtf.render_field(form.publishing_url, material_icon='link') }}
</div>
<div class="col s12 l10">
{{ wtf.render_field(form.version, material_icon='apps') }}
</div>
<div class="col s12 l6">
{{ wtf.render_field(form.compatible_service_versions) }}
</div>
<div class="col s12 l6 right-align" style="padding-right:20px;">
<p></p>
<br>
{{ wtf.render_field(form.shared) }}
</div>
</div>
</div>
<div class="card-action right-align">
{{ wtf.render_field(form.submit, material_icon='send') }}
</div>
</form>
</div>
</div>
</div>
</div>
{% endblock page_content %}
{% block modals %}
{{ super() }}
<div id="models-modal" class="modal">
<div class="modal-content">
<h4>Tesseract OCR Pipeline models</h4>
<table>
<thead>
<tr>
<th>Title</th>
<th>Description</th>
<th>Biblio</th>
</tr>
</thead>
<tbody>
{% for m in tesseract_ocr_pipeline_models %}
<tr id="tesseract-ocr-pipeline-model-{{ m.hashid }}">
<td>{{ m.title }}</td>
{% if m.description == '' %}
<td>Description is not available.</td>
{% else %}
<td>{{ m.description }}</td>
{% endif %}
<td><a href="{{ m.publisher_url }}">{{ m.publisher }}</a> ({{ m.publishing_year }}), {{ m.title }} {{ m.version}}, <a href="{{ m.publishing_url }}">{{ m.publishing_url }}</a></td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<div class="modal-footer">
<a href="#!" class="modal-close waves-effect waves-light btn">Close</a>
</div>
</div>
{% endblock modals %}

View File

@ -0,0 +1,75 @@
{% extends "base.html.j2" %}
{% import "materialize/wtf.html.j2" as wtf %}
{% from "contributions/_breadcrumbs.html.j2" import breadcrumbs with context %}
{% block page_content %}
<div class="container">
<div class="row">
<div class="col s12">
<h1 id="title">{{ title }}</h1>
{# Tesseract OCR Models #}
<div>
<h3>My Tesseract OCR Pipeline Models</h3>
<p>Here you can see and edit the models that you have created. You can also create new models.</p>
<div class="row">
<div class="col s12">
<div class="card">
<div class="card-content">
<div id="tesseract-ocr-model-list" data-user-id="{{ userId }}">
<table>
<thead>
<tr>
<th>Title</th>
<th>Description</th>
<th>Biblio</th>
<th></th>
</tr>
</thead>
<tbody>
{% if tesseractOCRUserModels|length > 0 %}
{% for m in tesseractOCRUserModels %}
<tr id="tesseract-ocr-pipeline-model-{{ m.hashid }}">
<td>{{ m.title }}</td>
{% if m.description == '' %}
<td>Description is not available.</td>
{% else %}
<td>{{ m.description }}</td>
{% endif %}
<td><a href="{{ m.publisher_url }}">{{ m.publisher }}</a> ({{ m.publishing_year }}), {{ m.title }} {{ m.version}}, <a href="{{ m.publishing_url }}">{{ m.publishing_url }}</a></td>
<td class="right-align">
<a class="delete-button btn-floating red waves-effect waves-light" data-model-id="{{ m.hashid }}"><i class="material-icons">delete</i></a>
<a class="edit-button btn-floating service-color darken waves-effect waves-light" data-model-id="{{ m.hashid }}"><i class="material-icons">edit</i></a>
</td>
</tr>
{% endfor %}
{% else %}
<tr>
<td colspan="4">No models available.</td>
</tr>
{% endif %}
</tbody>
</table>
</div>
</div>
<div class="card-action right-align">
<a href="{{ url_for('contributions.add_tesseract_ocr_pipeline_model') }}" class="btn waves-effect waves-light"><i class="material-icons left">add</i>Add model file</a>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
{% endblock page_content %}
{% block scripts %}
{{ super() }}
<script>
const tesseractOCRModelList = new TesseractOCRModelList();
tesseractOCRModelList.init();
</script>
{% endblock scripts %}

View File

@ -0,0 +1,56 @@
{% extends "base.html.j2" %}
{% import "materialize/wtf.html.j2" as wtf %}
{% from "contributions/_breadcrumbs.html.j2" import breadcrumbs with context %}
{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr-pipeline"{% endblock main_attribs %}
{% block page_content %}
<div class="container">
<div class="row">
<div class="col s12">
<h1 id="title">{{ title }}</h1>
</div>
<div class="col s12">
<div class="card">
<form class="create-contribution-form" enctype="multipart/form-data" method="POST">
<div class="card-content">
{{ form.hidden_tag() }}
<div class="row">
<div class="col s12 l7">
{{ wtf.render_field(form.title, material_icon='title') }}
</div>
<div class="col s12">
{{ wtf.render_field(form.description, material_icon='description') }}
</div>
<div class="col s12 l6">
{{ wtf.render_field(form.publisher, material_icon='account_balance') }}
</div>
<div class="col s12 l6">
{{ wtf.render_field(form.publishing_year, material_icon='calendar_month') }}
</div>
<div class="col s12">
{{ wtf.render_field(form.publisher_url, material_icon='link') }}
</div>
<div class="col s12">
{{ wtf.render_field(form.publishing_url, material_icon='link') }}
</div>
<div class="col s12 l10">
{{ wtf.render_field(form.version, material_icon='apps') }}
</div>
<div class="col s12 l6 right-align" style="padding-right:20px;">
<p></p>
<br>
{{ wtf.render_field(form.shared) }}
</div>
</div>
</div>
<div class="card-action right-align">
{{ wtf.render_field(form.submit, material_icon='send') }}
</div>
</form>
</div>
</div>
</div>
</div>
{% endblock page_content %}