Rename all services, use scss, cleanup, add sandpaper conversion script

This commit is contained in:
Patrick Jentsch 2022-04-04 13:31:09 +02:00
parent 8fd59f8078
commit ce997e69ea
31 changed files with 1361 additions and 303 deletions

View File

@ -168,3 +168,11 @@ NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI=
# DEFAULT: 0 # DEFAULT: 0
# Number of values to trust for X-Forwarded-Proto # Number of values to trust for X-Forwarded-Proto
# NOPAQUE_PROXY_FIX_X_PROTO= # NOPAQUE_PROXY_FIX_X_PROTO=
# CHOOSE ONE: False, True
# DEFAULT: False
# NOPAQUE_TRANSKRIBUS_ENABLED=
# READ-COOP account data: https://readcoop.eu/
# NOPAQUE_READCOOP_USERNAME=
# NOPAQUE_READCOOP_PASSWORD=

3
.gitignore vendored
View File

@ -18,7 +18,8 @@ data/**
pip-log.txt pip-log.txt
# Logs in log folder # Logs in log folder
logs/*.log logs/*
!logs/dummy
# Packages # Packages
*.egg *.egg

View File

@ -1,7 +1,7 @@
FROM python:3.9.8-slim-buster FROM python:3.8.13-slim-buster
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>" LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"
ARG DOCKER_GID ARG DOCKER_GID

File diff suppressed because it is too large Load Diff

View File

View File

@ -1,5 +1,6 @@
from config import Config from config import Config
from flask import Flask from flask import Flask
from flask_apscheduler import APScheduler
from flask_login import LoginManager from flask_login import LoginManager
from flask_mail import Mail from flask_mail import Mail
from flask_migrate import Migrate from flask_migrate import Migrate
@ -20,6 +21,7 @@ mail: Mail = Mail()
migrate: Migrate = Migrate() migrate: Migrate = Migrate()
paranoid: Paranoid = Paranoid() paranoid: Paranoid = Paranoid()
paranoid.redirect_view = '/' paranoid.redirect_view = '/'
scheduler: APScheduler = APScheduler() # TODO: Use this!
socketio: SocketIO = SocketIO() socketio: SocketIO = SocketIO()

View File

@ -2,9 +2,8 @@ from flask import current_app
from flask_migrate import upgrade from flask_migrate import upgrade
from . import db from . import db
from .models import Corpus, Job, Role, User, TesseractOCRModel from .models import Corpus, Job, Role, User, TesseractOCRModel
import json import click
import os import os
import re
def _make_default_dirs(): def _make_default_dirs():
@ -56,6 +55,19 @@ def register(app):
daemon: Daemon = Daemon() daemon: Daemon = Daemon()
daemon.run() daemon.run()
@app.cli.group()
def converter():
    ''' Converter commands. '''
    # Intentionally empty: this function only acts as the parent group that
    # the converter sub-commands attach to. The docstring above doubles as
    # the CLI help text, so it must stay as is.
@converter.command()
@click.argument('json_db')
@click.argument('data_dir')
def sandpaper(json_db, data_dir):
    ''' Sandpaper converter '''
    # The converter module is imported only when the command is actually
    # invoked. It is aliased so it cannot be confused with this command
    # function.
    from app.converters.sandpaper import convert as run_sandpaper_conversion
    run_sandpaper_conversion(json_db_file=json_db, data_dir=data_dir)
@app.cli.group() @app.cli.group()
def test(): def test():
''' Test commands. ''' ''' Test commands. '''
@ -68,55 +80,3 @@ def register(app):
from unittest.suite import TestSuite from unittest.suite import TestSuite
tests: TestSuite = TestLoader().discover('tests') tests: TestSuite = TestLoader().discover('tests')
TextTestRunner(verbosity=2).run(tests) TextTestRunner(verbosity=2).run(tests)
@app.cli.group()
def convert():
''' Datebase convert commands. '''
@convert.command()
def nlp_jobs():
for job in Job.query.filter_by(service='nlp').all():
job.service = 'spacy-nlp'
service_args = json.loads(job.service_args)
new_service_args = {}
for service_arg in service_args:
if service_arg == '--check-encoding':
new_service_args['encoding_detection'] = True
elif re.match(r'-l ([a-z]{2})', service_arg):
language_code = re.search(r'-l ([a-z]{2})', service_arg).group(1) # noqa
new_service_args['language'] = language_code
job.service_args = json.dumps(new_service_args)
db.session.commit()
@convert.command()
def ocr_jobs():
# Language code to TesseractOCRModel.title lookup
language_code_lookup = {
'ara': 'Arabic',
'chi_tra': 'Chinese - Traditional',
'dan': 'Danish',
'eng': 'English',
'enm': 'English, Middle (1100-1500)',
'fra': 'French',
'frm': 'French, Middle (ca. 1400-1600)',
'deu': 'German',
'frk': 'German Fraktur',
'ell': 'Greek, Modern (1453-)',
'ita': 'Italian',
'por': 'Portuguese',
'rus': 'Russian',
'spa': 'Spanish; Castilian'
}
for job in Job.query.filter_by(service='ocr').all():
job.service = 'tesseract-ocr'
service_args = json.loads(job.service_args)
new_service_args = {}
for service_arg in service_args:
if service_arg == '--binarize':
new_service_args['binarization'] = True
elif re.match(r'-l ([a-z]{3})', service_arg):
language_code = re.search(r'-l ([a-z]{3})', service_arg).group(1) # noqa
tesseract_ocr_model = TesseractOCRModel.query.filter_by(title=language_code_lookup[language_code]).first() # noqa
new_service_args['model'] = tesseract_ocr_model.id
job.service_args = json.dumps(new_service_args)
db.session.commit()

View File

215
app/converters/sandpaper.py Normal file
View File

@ -0,0 +1,215 @@
from flask import current_app
from app import db
from app.models import User, Corpus, CorpusFile
from datetime import datetime
import json
import os
def convert(json_db_file, data_dir):
    '''
    Import the users of a Sandpaper JSON database dump.

    The dump is iterated as a sequence of user records. Records whose
    "confirmed" value is falsy are skipped (and logged); every other record
    is handed to convert_user() together with the user's directory below
    data_dir. One commit is issued after all users have been processed.

    :param json_db_file: Path to the JSON dump file.
    :param data_dir: Directory that contains one sub directory per user id.
    '''
    with open(json_db_file, 'r') as f:
        json_db = json.load(f)
    for json_user in json_db:
        if not json_user['confirmed']:
            current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
            continue
        # JSON ids may be numbers, but os.path.join() only accepts str, bytes
        # or os.PathLike components, so coerce the id explicitly.
        user_dir = os.path.join(data_dir, str(json_user['id']))
        convert_user(json_user, user_dir)
    db.session.commit()
def convert_user(json_user, user_dir):
    '''
    Create a User from one record of the JSON dump and convert its corpora.

    The user is added to the session, flushed and refreshed before
    user.makedirs() is called. Afterwards every corpus of the record that
    contains at least one file is passed to convert_corpus().

    :param json_user: User record taken from the JSON dump.
    :param user_dir: Directory of this user inside the old data directory.
    :raises Exception: If user.makedirs() fails with an OSError. The session
        is rolled back and the OSError is attached as the cause.
    '''
    current_app.logger.info(f'Create User {json_user["username"]}...')
    # NOTE(review): datetime.fromtimestamp() returns naive *local* time.
    # Confirm that this matches how the models store their datetimes.
    user = User(
        confirmed=json_user['confirmed'],
        email=json_user['email'],
        last_seen=datetime.fromtimestamp(json_user['last_seen']),
        member_since=datetime.fromtimestamp(json_user['member_since']),
        password_hash=json_user['password_hash'],  # TODO: Needs to be added manually
        username=json_user['username']
    )
    db.session.add(user)
    db.session.flush(objects=[user])
    db.session.refresh(user)
    try:
        user.makedirs()
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        raise Exception('Internal Server Error') from e
    for json_corpus in json_user['corpora'].values():
        if not json_corpus['files']:
            current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
            continue
        # Coerce the id: os.path.join() rejects non-string path components.
        corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id']))
        convert_corpus(json_corpus, user, corpus_dir)
    current_app.logger.info('Done')
def convert_corpus(json_corpus, user, corpus_dir):
    '''
    Create a Corpus for the given user and convert all of its files.

    The corpus is added to the session, flushed and refreshed before
    corpus.makedirs() is called. Each entry of the record's "files" mapping
    is then passed to convert_corpus_file().

    :param json_corpus: Corpus record taken from the JSON dump.
    :param user: The already created User that owns the corpus.
    :param corpus_dir: Directory of this corpus inside the old data directory.
    :raises Exception: If corpus.makedirs() fails with an OSError. The session
        is rolled back and the OSError is attached as the cause.
    '''
    current_app.logger.info(f'Create Corpus {json_corpus["title"]}...')
    # NOTE(review): datetime.fromtimestamp() returns naive *local* time.
    # Confirm that this matches how the models store their datetimes.
    corpus = Corpus(
        user=user,
        creation_date=datetime.fromtimestamp(json_corpus['creation_date']),
        description=json_corpus['description'],
        last_edited_date=datetime.fromtimestamp(json_corpus['last_edited_date']),
        title=json_corpus['title']
    )
    db.session.add(corpus)
    db.session.flush(objects=[corpus])
    db.session.refresh(corpus)
    try:
        corpus.makedirs()
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        raise Exception('Internal Server Error') from e
    for json_corpus_file in json_corpus['files'].values():
        # Coerce the id: os.path.join() rejects non-string path components.
        corpus_file_dir = os.path.join(corpus_dir, 'files', str(json_corpus_file['id']))
        convert_corpus_file(json_corpus_file, corpus, corpus_file_dir)
    current_app.logger.info('Done')
def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
    '''
    Create a CorpusFile for the given corpus and convert its VRT file.

    The bibliographic fields are copied verbatim from the JSON record, the
    mimetype is always set to 'application/vrt+xml'. After flush and refresh
    the old VRT file is converted with convert_vrt() and written to
    corpus_file.path.

    :param json_corpus_file: Corpus file record taken from the JSON dump.
    :param corpus: The already created Corpus the file belongs to.
    :param corpus_file_dir: Directory that contains the old VRT file.
    :raises Exception: If convert_vrt() fails with an OSError. The session
        is rolled back and the OSError is attached as the cause.
    '''
    current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
    # Fields that are taken over from the JSON record without modification
    copied_fields = (
        'address', 'author', 'booktitle', 'chapter', 'editor', 'filename',
        'institution', 'journal', 'pages', 'publisher', 'publishing_year',
        'school', 'title'
    )
    corpus_file = CorpusFile(
        corpus=corpus,
        mimetype='application/vrt+xml',
        **{field: json_corpus_file[field] for field in copied_fields}
    )
    db.session.add(corpus_file)
    db.session.flush(objects=[corpus_file])
    db.session.refresh(corpus_file)
    try:
        convert_vrt(
            os.path.join(corpus_file_dir, json_corpus_file['filename']),
            corpus_file.path
        )
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        raise Exception('Internal Server Error') from e
    current_app.logger.info('Done')
def convert_vrt(input_file, output_file):
    '''
    Rewrite an old VRT file into the layout 'word,pos,lemma,simple_pos'.

    What is done:
    - The column order of the token lines is detected and normalised.
    - <corpus ...>, </corpus> and <nlp ...> lines are dropped.
    - <text ...> and <s ...> start tags are reduced to '<text>' and '<s>'.
    - Blank lines are dropped.
    - If the input has no <ent> tags but carries a ner column, <ent type="...">
      / </ent> tags are generated around runs of tokens that share the same
      ner value ('null' and 'none', case-insensitive, mean "no entity").

    :param input_file: Path of the VRT file to read.
    :param output_file: Path the converted VRT file is written to.
    :raises ValueError: If the column order can not be detected. The message
        is 'Can not handle format'.
    '''
    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # This function tries to find out which order we have by looking at
        # the number of attributes and the position of the simple_pos
        # attribute. It returns None if no token line allows a decision.
        SIMPLE_POS_LABELS = {
            'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
            'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
            'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
            'VERB', 'X'
        }
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            num_pos_attrs = len(pos_attrs)
            if num_pos_attrs == 4:
                if pos_attrs[3] in SIMPLE_POS_LABELS:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif num_pos_attrs == 5:
                if pos_attrs[2] in SIMPLE_POS_LABELS:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in SIMPLE_POS_LABELS:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    def check_has_ent_as_s_attr(vrt_lines):
        # True if entities are already encoded as <ent> structural attributes
        return any(line.startswith('<ent') for line in vrt_lines)

    def pos_attrs_to_string_1(pos_attrs):
        # 'word,lemma,simple_pos,pos[,ner]' -> 'word,pos,lemma,simple_pos'
        return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'

    def pos_attrs_to_string_2(pos_attrs):
        # 'word,pos,lemma,simple_pos[,ner]' -> 'word,pos,lemma,simple_pos'
        return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    # Fail with the intended message *before* the order is used anywhere;
    # joining None for the status output would raise a misleading TypeError.
    if pos_attr_order is None:
        raise ValueError('Can not handle format')
    has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines)
    print(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    print(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        pos_attrs_to_string_function = pos_attrs_to_string_1
    else:
        # Both remaining orders start with 'word,pos,lemma,simple_pos'
        pos_attrs_to_string_function = pos_attrs_to_string_2

    # <ent> tags can only be derived if they are not present already and the
    # token lines actually have a ner column (index 4). Without this guard a
    # 4-column file without <ent> tags would fail with an IndexError.
    derive_ents = not has_ent_as_s_attr and 'ner' in pos_attr_order

    current_ent = None
    output_vrt_lines = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # Any tag ends a currently open, generated entity
            if current_ent is not None:
                output_vrt_lines.append('</ent>\n')
                current_ent = None
            if line.startswith(('<corpus', '</corpus', '<nlp')):
                continue
            if line.startswith('<text'):
                output_vrt_lines.append('<text>\n')
            elif line.startswith('<s'):
                output_vrt_lines.append('<s>\n')
            else:
                output_vrt_lines.append(line)
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if derive_ents:
            ner = pos_attrs[4]
            if ner.lower() in ('null', 'none'):
                if current_ent is not None:
                    output_vrt_lines.append('</ent>\n')
                    current_ent = None
            elif current_ent != ner:
                # Entity type changed: close the old one, open the new one
                if current_ent is not None:
                    output_vrt_lines.append('</ent>\n')
                output_vrt_lines.append(f'<ent type="{ner}">\n')
                current_ent = ner
        output_vrt_lines.append(pos_attrs_to_string_function(pos_attrs))

    with open(output_file, 'w') as f:
        f.writelines(output_vrt_lines)

View File

@ -319,7 +319,7 @@ def corpus_file(corpus_id, corpus_file_id):
form.title.data = corpus_file.title form.title.data = corpus_file.title
return render_template( return render_template(
'corpora/corpus_file.html.j2', 'corpora/corpus_file.html.j2',
corpus=corpus, corpus=corpus_file.corpus,
corpus_file=corpus_file, corpus_file=corpus_file,
form=form, form=form,
title='Edit corpus file' title='Edit corpus file'

View File

@ -22,34 +22,46 @@ class CheckJobsMixin:
def create_job_service(self, job): def create_job_service(self, job):
''' # Docker service settings # ''' ''' # Docker service settings # '''
''' ## Service specific settings ## ''' ''' ## Service specific settings ## '''
if job.service == 'file-setup': if job.service == 'file-setup-pipeline':
mem_mb = 512 mem_mb = 512
n_cores = 2 n_cores = 2
executable = 'file-setup' executable = 'file-setup-pipeline'
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:v{job.service_version}' # noqa image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup-pipeline:v{job.service_version}' # noqa
elif job.service == 'tesseract-ocr': elif job.service == 'tesseract-ocr-pipeline':
mem_mb = 2048 mem_mb = 1024
n_cores = 4 n_cores = 4
executable = 'ocr' executable = 'tesseract-ocr-pipeline'
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:v{job.service_version}' # noqa image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}tesseract-ocr-pipeline:v{job.service_version}' # noqa
elif job.service == 'spacy-nlp': elif job.service == 'transkribus-htr-pipeline':
mem_mb = 1024
n_cores = 4
executable = 'transkribus-htr-pipeline'
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}transkribus-htr-pipeline:v{job.service_version}' # noqa
elif job.service == 'spacy-nlp-pipeline':
mem_mb = 1024 mem_mb = 1024
n_cores = 1 n_cores = 1
executable = 'nlp' executable = 'spacy-nlp-pipeline'
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:v{job.service_version}' # noqa image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}spacy-nlp-pipeline:v{job.service_version}' # noqa
''' ## Command ## ''' ''' ## Command ## '''
command = f'{executable} -i /input -o /output' command = f'{executable} -i /input -o /output'
command += ' --log-dir /logs' command += ' --log-dir /logs'
command += f' --mem-mb {mem_mb}' command += f' --mem-mb {mem_mb}'
command += f' --n-cores {n_cores}' command += f' --n-cores {n_cores}'
service_args = json.loads(job.service_args) if job.service == 'spacy-nlp-pipeline':
if job.service == 'spacy-nlp': command += f' -m {job.service_args["model"]}'
command += f' -m {service_args["model"]}' if 'encoding_detection' in job.service_args and job.service_args['encoding_detection']: # noqa
if 'encoding_detection' in service_args and service_args['encoding_detection']: # noqa
command += ' --check-encoding' command += ' --check-encoding'
elif job.service == 'tesseract-ocr': elif job.service == 'tesseract-ocr-pipeline':
command += f' -m {service_args["model"]}' command += f' -m {job.service_args["model"]}'
if 'binarization' in service_args and service_args['binarization']: if 'binarization' in job.service_args and job.service_args['binarization']:
command += ' --binarize'
elif job.service == 'transkribus-htr-pipeline':
command += f' -m {job.service_args["model"]}'
readcoop_username = current_app.config.get('NOPAQUE_READCOOP_USERNAME')
command += f' --readcoop-username "{readcoop_username}"'
readcoop_password = current_app.config.get('NOPAQUE_READCOOP_PASSWORD')
command += f' --readcoop-password "{readcoop_password}"'
if 'binarization' in job.service_args and job.service_args['binarization']:
command += ' --binarize' command += ' --binarize'
''' ## Constraints ## ''' ''' ## Constraints ## '''
constraints = ['node.role==worker'] constraints = ['node.role==worker']
@ -63,16 +75,15 @@ class CheckJobsMixin:
mounts = [] mounts = []
''' ### Input mount(s) ### ''' ''' ### Input mount(s) ### '''
input_mount_target_base = '/input' input_mount_target_base = '/input'
if job.service == 'file-setup': if job.service == 'file-setup-pipeline':
input_mount_target_base += f'/{secure_filename(job.title)}' input_mount_target_base += f'/{secure_filename(job.title)}'
for job_input in job.inputs: for job_input in job.inputs:
input_mount_source = job_input.path input_mount_source = job_input.path
input_mount_target = f'/{input_mount_target_base}/{job_input.filename}' # noqa input_mount_target = f'{input_mount_target_base}/{job_input.filename}' # noqa
input_mount = f'{input_mount_source}:{input_mount_target}:ro' input_mount = f'{input_mount_source}:{input_mount_target}:ro'
mounts.append(input_mount) mounts.append(input_mount)
if job.service == 'tesseract-ocr': if job.service == 'tesseract-ocr-pipeline':
service_args = json.loads(job.service_args) model = TesseractOCRModel.query.get(job.service_args['model'])
model = TesseractOCRModel.query.get(service_args['model'])
if model is None: if model is None:
job.status = JobStatus.FAILED job.status = JobStatus.FAILED
return return
@ -114,7 +125,8 @@ class CheckJobsMixin:
mounts=mounts, mounts=mounts,
name=name, name=name,
resources=resources, resources=resources,
restart_policy=restart_policy restart_policy=restart_policy,
user='1000:1000'
) )
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(

View File

@ -36,14 +36,23 @@ class IntEnumColumn(db.TypeDecorator):
return self.enum_type(value) return self.enum_type(value)
class Permission(IntEnum): class ContainerColumn(db.TypeDecorator):
''' impl = db.String
Defines User permissions as integers by the power of 2. User permission
can be evaluated using the bitwise operator &. def __init__(self, container_type, *args, **kwargs):
''' super().__init__(*args, **kwargs)
ADMINISTRATE = 4 self.container_type = container_type
CONTRIBUTE = 2
def process_bind_param(self, value, dialect):
    '''
    Convert a Python value into the JSON string that is stored in the column.

    :param value: None, an instance of self.container_type or a JSON string
        that decodes to an instance of self.container_type.
    :param dialect: The dialect in use (unused).
    :returns: None for None, otherwise the JSON encoded string.
    :raises TypeError: If value is neither of the accepted kinds.
    :raises ValueError: If value is a string that is not valid JSON
        (raised by json.loads).
    '''
    # None is passed through so that it is bound as None and not rejected
    if value is None:
        return None
    if isinstance(value, self.container_type):
        return json.dumps(value)
    if isinstance(value, str) and isinstance(json.loads(value), self.container_type):  # noqa
        return value
    # The error must be raised; returning the exception instance would hand
    # it on as the value to bind.
    raise TypeError(
        f'Expected {self.container_type.__name__} or a JSON string encoding '
        f'one, got {type(value).__name__}'
    )
def process_result_value(self, value, dialect):
    '''
    Convert the stored JSON string back into a Python object.

    :param value: The JSON string read from the database or None.
    :param dialect: The dialect in use (unused).
    :returns: The decoded object, or None if value is None.
    '''
    # json.loads(None) raises a TypeError, so a None value has to be
    # passed through untouched.
    if value is None:
        return None
    return json.loads(value)
class FileMixin: class FileMixin:
@ -61,6 +70,16 @@ class FileMixin:
} }
class Permission(IntEnum):
    '''
    User permissions, each one being a distinct power of 2. Because of that
    they can be combined into a single integer and checked with the bitwise
    operator &.
    '''
    ADMINISTRATE = 1 << 0
    CONTRIBUTE = 1 << 1
    USE_API = 1 << 2
class Role(HashidMixin, db.Model): class Role(HashidMixin, db.Model):
__tablename__ = 'roles' __tablename__ = 'roles'
# Primary key # Primary key
@ -102,7 +121,7 @@ class Role(HashidMixin, db.Model):
'permissions': self.permissions 'permissions': self.permissions
} }
if relationships: if relationships:
dict_role['users']: { dict_role['users'] = {
x.to_dict(backrefs=False, relationships=True) x.to_dict(backrefs=False, relationships=True)
for x in self.users for x in self.users
} }
@ -339,10 +358,11 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
# Foreign keys # Foreign keys
user_id = db.Column(db.Integer, db.ForeignKey('users.id')) user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
# Fields # Fields
compatible_service_versions = db.Column(db.String(255)) compatible_service_versions = db.Column(ContainerColumn(list, 255))
description = db.Column(db.String(255)) description = db.Column(db.String(255))
publisher = db.Column(db.String(128)) publisher = db.Column(db.String(128))
publishing_year = db.Column(db.Integer) publishing_year = db.Column(db.Integer)
shared = db.Column(db.Boolean, default=False)
title = db.Column(db.String(64)) title = db.Column(db.String(64))
version = db.Column(db.String(16)) version = db.Column(db.String(16))
# Backrefs: user: User # Backrefs: user: User
@ -356,11 +376,10 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
) )
def to_dict(self, backrefs=False, relationships=False): def to_dict(self, backrefs=False, relationships=False):
compatible_service_versions = json.loads(self.compatible_service_versions) # noqa
dict_tesseract_ocr_model = { dict_tesseract_ocr_model = {
'id': self.hashid, 'id': self.hashid,
'user_id': self.user.hashid, 'user_id': self.user.hashid,
'compatible_service_versions': compatible_service_versions, 'compatible_service_versions': self.compatible_service_versions,
'description': self.description, 'description': self.description,
'publisher': self.publisher, 'publisher': self.publisher,
'publishing_year': self.publishing_year, 'publishing_year': self.publishing_year,
@ -384,31 +403,39 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
with open(defaults_file, 'r') as f: with open(defaults_file, 'r') as f:
defaults = yaml.safe_load(f) defaults = yaml.safe_load(f)
for m in defaults: for m in defaults:
if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None: # noqa model = TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() # noqa
if model is not None:
model.compatible_service_versions = m['compatible_service_versions']
model.description = m['description']
model.publisher = m['publisher']
model.publishing_year = m['publishing_year']
model.title = m['title']
model.version = m['version']
continue continue
tesseract_ocr_model = TesseractOCRModel( model = TesseractOCRModel(
compatible_service_versions=json.dumps(m['compatible_service_versions']), # noqa compatible_service_versions=m['compatible_service_versions'],
description=m['description'], description=m['description'],
publisher=m['publisher'], publisher=m['publisher'],
publishing_year=m['publishing_year'], publishing_year=m['publishing_year'],
shared=True,
title=m['title'], title=m['title'],
user=user, user=user,
version=m['version'] version=m['version']
) )
db.session.add(tesseract_ocr_model) db.session.add(model)
db.session.flush(objects=[tesseract_ocr_model]) db.session.flush(objects=[model])
db.session.refresh(tesseract_ocr_model) db.session.refresh(model)
tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata' # noqa model.filename = f'{model.id}.traineddata'
r = requests.get(m['url'], stream=True) r = requests.get(m['url'], stream=True)
pbar = tqdm( pbar = tqdm(
desc=f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})', # noqa desc=f'{model.title} ({model.filename})',
unit="B", unit="B",
unit_scale=True, unit_scale=True,
unit_divisor=1024, unit_divisor=1024,
total=int(r.headers['Content-Length']) total=int(r.headers['Content-Length'])
) )
pbar.clear() pbar.clear()
with open(tesseract_ocr_model.path, 'wb') as f: with open(model.path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024): for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks if chunk: # filter out keep-alive new chunks
pbar.update(len(chunk)) pbar.update(len(chunk))
@ -560,11 +587,7 @@ class Job(HashidMixin, db.Model):
description = db.Column(db.String(255)) description = db.Column(db.String(255))
end_date = db.Column(db.DateTime()) end_date = db.Column(db.DateTime())
service = db.Column(db.String(64)) service = db.Column(db.String(64))
''' service_args = db.Column(ContainerColumn(dict, 255))
' Dictionary as JSON formatted string.
' Example: {"binarization": True}
'''
service_args = db.Column(db.String(255))
service_version = db.Column(db.String(16)) service_version = db.Column(db.String(16))
status = db.Column( status = db.Column(
IntEnumColumn(JobStatus), IntEnumColumn(JobStatus),
@ -643,10 +666,6 @@ class Job(HashidMixin, db.Model):
self.status = JobStatus.SUBMITTED self.status = JobStatus.SUBMITTED
def to_dict(self, backrefs=False, relationships=False): def to_dict(self, backrefs=False, relationships=False):
service_args = json.loads(self.service_args)
if self.service == 'tesseract-ocr' and 'model' in service_args:
tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model']) # noqa
service_args['model'] = tesseract_ocr_pipeline_model.title
dict_job = { dict_job = {
'id': self.hashid, 'id': self.hashid,
'user_id': self.user.hashid, 'user_id': self.user.hashid,
@ -654,7 +673,7 @@ class Job(HashidMixin, db.Model):
'description': self.description, 'description': self.description,
'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z', # noqa 'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z', # noqa
'service': self.service, 'service': self.service,
'service_args': service_args, 'service_args': self.service_args,
'service_version': self.service_version, 'service_version': self.service_version,
'status': self.status.name, 'status': self.status.name,
'title': self.title, 'title': self.title,
@ -798,7 +817,6 @@ class Corpus(HashidMixin, db.Model):
title = db.Column(db.String(32)) title = db.Column(db.String(32))
num_analysis_sessions = db.Column(db.Integer, default=0) num_analysis_sessions = db.Column(db.Integer, default=0)
num_tokens = db.Column(db.Integer, default=0) num_tokens = db.Column(db.Integer, default=0)
archive_file = db.Column(db.String(255))
# Backrefs: user: User # Backrefs: user: User
# Relationships # Relationships
files = db.relationship( files = db.relationship(

View File

@ -1,5 +1,7 @@
from app.models import TesseractOCRModel from app.models import Job, TesseractOCRModel
from flask_login import current_user
from flask_wtf import FlaskForm from flask_wtf import FlaskForm
from flask_wtf.file import FileField, FileAllowed, FileRequired
from wtforms import ( from wtforms import (
BooleanField, BooleanField,
MultipleFileField, MultipleFileField,
@ -8,110 +10,143 @@ from wtforms import (
SubmitField, SubmitField,
ValidationError ValidationError
) )
from wtforms.validators import DataRequired, Length from wtforms.validators import DataRequired, InputRequired, Length
from . import SERVICES from . import SERVICES
class AddJobForm(FlaskForm): class AddJobForm(FlaskForm):
description = StringField('Description', validators=[DataRequired(), Length(1, 255)]) # noqa description = StringField('Description', validators=[InputRequired()]) # noqa
submit = SubmitField() submit = SubmitField()
title = StringField('Title', validators=[DataRequired(), Length(1, 32)]) title = StringField('Title', validators=[InputRequired()])
version = SelectField('Version', validators=[DataRequired()]) version = SelectField('Version', validators=[DataRequired()])
def validate_description(self, field):
    ''' Reject descriptions longer than the Job.description column. '''
    # The limit is read from the column type so that form and model can not
    # drift apart.
    max_length = Job.description.property.columns[0].type.length
    if len(field.data) <= max_length:
        return
    raise ValidationError(
        f'Description must be less than {max_length} characters'
    )
class AddSpacyNLPJobForm(AddJobForm): def validate_title(self, field):
encoding_detection = BooleanField('Encoding detection') max_length = Job.title.property.columns[0].type.length
files = MultipleFileField('Files', validators=[DataRequired()]) if len(field.data) > max_length:
model = SelectField( raise ValidationError(
'Model', f'Title must be less than {max_length} characters'
choices=[('', 'Choose your option')], )
default='',
validators=[DataRequired()]
)
def validate_encoding_detection(self, field):
service_info = SERVICES['spacy-nlp']['versions'][self.version.data]
if field.data and 'encoding_detection' not in service_info['methods']:
raise ValidationError('Encoding detection is not available')
def validate_files(form, field): class AddFileSetupPipelineJobForm(AddJobForm):
valid_extensions = ['.txt'] images = MultipleFileField('File(s)', validators=[DataRequired()])
for file in field.data:
if not file.filename.lower().endswith(tuple(valid_extensions)): def validate_images(form, field):
raise ValidationError( valid_mimetypes = ['image/jpeg', 'image/png', 'image/tiff']
'File does not have an approved extension: ' for image in field.data:
'/'.join(valid_extensions) if image.mimetype not in valid_mimetypes:
) raise ValidationError('JPEG, PNG and TIFF files only!')
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version']) # noqa service_manifest = SERVICES['file-setup-pipeline']
version = kwargs.pop('version', service_manifest['latest_version'])
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
service_info = SERVICES['spacy-nlp']['versions'][version] self.version.choices = [(x, x) for x in service_manifest['versions']]
if 'encoding_detection' not in service_info['methods']: self.version.data = version
self.encoding_detection.render_kw = {'disabled': True} self.version.default = service_manifest['latest_version']
self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa
self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']] # noqa
self.version.default = version
class AddTesseractOCRJobForm(AddJobForm): class AddTesseractOCRPipelineJobForm(AddJobForm):
binarization = BooleanField('Binarization') binarization = BooleanField('Binarization')
files = MultipleFileField('Files', validators=[DataRequired()]) pdf = FileField('File', validators=[FileRequired()])
model = SelectField( model = SelectField('Model', validators=[DataRequired()])
'Model',
choices=[('', 'Choose your option')],
default='',
validators=[DataRequired()]
)
def validate_binarization(self, field): def validate_binarization(self, field):
service_info = SERVICES['tesseract-ocr']['versions'][self.version.data] service_info = SERVICES['tesseract-ocr-pipeline']['versions'][self.version.data]
if field.data and 'binarization' not in service_info['methods']: if field.data and 'binarization' not in service_info['methods']:
raise ValidationError('Binarization is not available') raise ValidationError('Binarization is not available')
def validate_files(self, field): def validate_pdf(self, field):
valid_extensions = ['.pdf'] if field.data.mimetype != 'application/pdf':
for file in field.data: raise ValidationError('PDF files only!')
if not file.filename.lower().endswith(tuple(valid_extensions)):
raise ValidationError(
'File does not have an approved extension: '
'/'.join(valid_extensions)
)
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version']) # noqa service_manifest = SERVICES['tesseract-ocr-pipeline']
version = kwargs.pop('version', service_manifest['latest_version'])
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
service_info = SERVICES['tesseract-ocr']['versions'][version] service_info = service_manifest['versions'][version]
if 'binarization' not in service_info['methods']: if 'binarization' not in service_info['methods']:
self.binarization.render_kw = {'disabled': True} self.binarization.render_kw = {'disabled': True}
self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()] # noqa compatible_models = [
self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']] # noqa x for x in TesseractOCRModel.query.filter_by(shared=True).all()
if version in x.compatible_service_versions
]
compatible_models += [
x for x in TesseractOCRModel.query.filter_by(shared=False, user=current_user).all()
if version in x.compatible_service_versions
]
self.model.choices = [('', 'Choose your option')]
self.model.choices += [(x.hashid, x.title) for x in compatible_models]
self.model.default = ''
self.version.choices = [(x, x) for x in service_manifest['versions']]
self.version.data = version self.version.data = version
self.version.default = SERVICES['tesseract-ocr']['latest_version'] self.version.default = service_manifest['latest_version']
class AddFileSetupJobForm(AddJobForm): class AddTranskribusHTRPipelineJobForm(AddJobForm):
files = MultipleFileField('Files', validators=[DataRequired()]) binarization = BooleanField('Binarization')
pdf = FileField('File', validators=[FileRequired()])
model = SelectField('Model', validators=[DataRequired()])
def validate_files(form, field): def validate_binarization(self, field):
valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif'] service_info = SERVICES['transkribus-htr-pipeline']['versions'][self.version.data]
for file in field.data: if field.data and 'binarization' not in service_info['methods']:
if not file.filename.lower().endswith(tuple(valid_extensions)): raise ValidationError('Binarization is not available')
raise ValidationError(
'File does not have an approved extension: ' def validate_pdf(self, field):
'/'.join(valid_extensions) if field.data.mimetype != 'application/pdf':
) raise ValidationError('PDF files only!')
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
version = kwargs.pop('version', SERVICES['file-setup']['latest_version']) # noqa service_manifest = SERVICES['transkribus-htr-pipeline']
version = kwargs.pop('version', service_manifest['latest_version'])
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']] # noqa service_info = service_manifest['versions'][version]
if 'binarization' not in service_info['methods']:
self.binarization.render_kw = {'disabled': True}
self.model.choices = [('', 'Choose your option')]
self.model.choices += [
('37569', 'Tim Model'),
('29539', 'UCLUniversity of Toronto #7')
]
self.model.default = ''
self.version.choices = [(x, x) for x in service_manifest['versions']]
self.version.data = version self.version.data = version
self.version.default = SERVICES['file-setup']['latest_version'] self.version.default = service_manifest['latest_version']
AddJobForms = { class AddSpacyNLPPipelineJobForm(AddJobForm):
'file-setup': AddFileSetupJobForm, encoding_detection = BooleanField('Encoding detection')
'tesseract-ocr': AddTesseractOCRJobForm, txt = FileField('File', validators=[FileRequired()])
'spacy-nlp': AddSpacyNLPJobForm model = SelectField('Model', validators=[DataRequired()])
}
def validate_encoding_detection(self, field):
    """Reject encoding detection when the chosen version lacks it.

    Looks up the methods listed for the currently selected version of
    the 'spacy-nlp-pipeline' service and raises a ValidationError if
    the checkbox is ticked although 'encoding_detection' is not among
    them.
    """
    # The manifest lookup is done first, before inspecting the field.
    version_info = SERVICES['spacy-nlp-pipeline']['versions'][self.version.data]
    available_methods = version_info['methods']
    if not field.data:
        return
    if 'encoding_detection' in available_methods:
        return
    raise ValidationError('Encoding detection is not available!')
def validate_txt(form, field):
    """Accept only uploads whose reported mimetype is 'text/plain'.

    Raises a ValidationError for any other mimetype.
    """
    is_plain_text = field.data.mimetype == 'text/plain'
    if not is_plain_text:
        raise ValidationError('Plain text files only!')
def __init__(self, *args, **kwargs):
    """Build the form for one version of the spaCy NLP pipeline.

    The optional 'version' keyword argument selects the service version
    and falls back to the manifest's latest version. It is removed from
    kwargs before the remaining arguments are passed to the parent
    constructor.
    """
    manifest = SERVICES['spacy-nlp-pipeline']
    version = kwargs.pop('version', manifest['latest_version'])
    super().__init__(*args, **kwargs)
    version_info = manifest['versions'][version]
    # Grey out the checkbox if this version does not list the method.
    encoding_detection_supported = 'encoding_detection' in version_info['methods']
    if not encoding_detection_supported:
        self.encoding_detection.render_kw = {'disabled': True}
    # Empty placeholder first, then every model listed for this version.
    model_choices = [('', 'Choose your option')]
    for model_key, model_label in version_info['models'].items():
        model_choices.append((model_key, model_label))
    self.model.choices = model_choices
    self.model.default = ''
    self.version.choices = [(v, v) for v in manifest['versions']]
    self.version.data = version
    self.version.default = version

View File

@ -13,47 +13,33 @@ from flask_login import current_user, login_required
from werkzeug.utils import secure_filename from werkzeug.utils import secure_filename
from . import bp from . import bp
from . import SERVICES from . import SERVICES
from .forms import AddJobForms from .forms import (
AddFileSetupPipelineJobForm,
AddTesseractOCRPipelineJobForm,
AddTranskribusHTRPipelineJobForm,
AddSpacyNLPPipelineJobForm
)
import json import json
@bp.route('/corpus-analysis') @bp.route('/file-setup-pipeline', methods=['GET', 'POST'])
@login_required @login_required
def corpus_analysis(): def file_setup_pipeline():
return render_template( service = 'file-setup-pipeline'
'services/corpus_analysis.html.j2', service_manifest = SERVICES[service]
title='Corpus analysis' version = request.args.get('version', service_manifest['latest_version'])
) if version not in service_manifest['versions']:
@bp.route('/<service>', methods=['GET', 'POST'])
@login_required
def service(service):
# Check if the requested service exist
if service not in SERVICES or service not in AddJobForms:
abort(404) abort(404)
version = request.args.get('version', SERVICES[service]['latest_version']) form = AddFileSetupPipelineJobForm(prefix='add-job-form', version=version)
if version not in SERVICES[service]['versions']:
abort(404)
form = AddJobForms[service](prefix='add-job-form', version=version)
title = SERVICES[service]['name']
if form.is_submitted(): if form.is_submitted():
if not form.validate(): if not form.validate():
return make_response(form.errors, 400) return make_response(form.errors, 400)
service_args = {} service_args = {}
if service == 'spacy-nlp':
service_args['model'] = form.model.data
if form.encoding_detection.data:
service_args['encoding_detection'] = True
if service == 'tesseract-ocr':
service_args['model'] = hashids.decode(form.model.data)
if form.binarization.data:
service_args['binarization'] = True
job = Job( job = Job(
user=current_user, user=current_user,
description=form.description.data, description=form.description.data,
service=service, service=service,
service_args=json.dumps(service_args), service_args=service_args,
service_version=form.version.data, service_version=form.version.data,
title=form.title.data title=form.title.data
) )
@ -67,18 +53,17 @@ def service(service):
db.session.rollback() db.session.rollback()
flash('Internal Server Error', 'error') flash('Internal Server Error', 'error')
return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa
for file in form.files.data: for image_file in form.images.data:
filename = secure_filename(file.filename)
job_input = JobInput( job_input = JobInput(
filename=filename, filename=secure_filename(image_file.filename),
job=job, job=job,
mimetype=file.mimetype mimetype=image_file.mimetype
) )
db.session.add(job_input) db.session.add(job_input)
db.session.flush(objects=[job_input]) db.session.flush(objects=[job_input])
db.session.refresh(job_input) db.session.refresh(job_input)
try: try:
file.save(job_input.path) image_file.save(job_input.path)
except OSError as e: except OSError as e:
current_app.logger.error(e) current_app.logger.error(e)
db.session.rollback() db.session.rollback()
@ -91,5 +76,196 @@ def service(service):
return render_template( return render_template(
f'services/{service.replace("-", "_")}.html.j2', f'services/{service.replace("-", "_")}.html.j2',
form=form, form=form,
title=title title=service_manifest['name']
)
@bp.route('/tesseract-ocr-pipeline', methods=['GET', 'POST'])
@login_required
def tesseract_ocr_pipeline():
    """Show the Tesseract OCR pipeline form and create jobs from it.

    The service version is taken from the 'version' query argument and
    defaults to the manifest's latest version; an unknown version
    aborts with 404.

    On a submitted form:
      * validation errors are returned with status 400,
      * a Job and one JobInput (the uploaded PDF) are created and the
        file is saved to the JobInput's path,
      * an OSError while creating the job directories or saving the
        file rolls the session back and answers with status 500 and a
        redirect URL back to this page,
      * on success the job is marked SUBMITTED, committed, and the
        response (status 201) carries a redirect URL to the job page.

    Otherwise the service template is rendered.
    """
    service = 'tesseract-ocr-pipeline'
    service_manifest = SERVICES[service]
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddTesseractOCRPipelineJobForm(prefix='add-job-form', version=version)  # noqa
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        service_args = {}
        service_args['model'] = hashids.decode(form.model.data)
        if form.binarization.data:
            service_args['binarization'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        # Flush and refresh so the job is populated from the database
        # before its directories are created.
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            # Redirect back to this view; the generic '.service'
            # endpoint no longer exists.
            return make_response({'redirect_url': url_for('.tesseract_ocr_pipeline')}, 500)  # noqa
        job_input = JobInput(
            filename=secure_filename(form.pdf.data.filename),
            job=job,
            mimetype=form.pdf.data.mimetype
        )
        db.session.add(job_input)
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.pdf.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': url_for('.tesseract_ocr_pipeline')}, 500)  # noqa
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
@bp.route('/transkribus-htr-pipeline', methods=['GET', 'POST'])
@login_required
def transkribus_htr_pipeline():
    """Show the Transkribus HTR pipeline form and create jobs from it.

    Aborts with 404 unless the NOPAQUE_TRANSKRIBUS_ENABLED config value
    is truthy. The service version is taken from the 'version' query
    argument and defaults to the manifest's latest version; an unknown
    version also aborts with 404.

    On a submitted form:
      * validation errors are returned with status 400,
      * a Job and one JobInput (the uploaded PDF) are created and the
        file is saved to the JobInput's path,
      * an OSError while creating the job directories or saving the
        file rolls the session back and answers with status 500 and a
        redirect URL back to this page,
      * on success the job is marked SUBMITTED, committed, and the
        response (status 201) carries a redirect URL to the job page.

    Otherwise the service template is rendered.
    """
    if not current_app.config.get('NOPAQUE_TRANSKRIBUS_ENABLED'):
        abort(404)
    service = 'transkribus-htr-pipeline'
    service_manifest = SERVICES[service]
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddTranskribusHTRPipelineJobForm(prefix='add-job-form', version=version)  # noqa
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        service_args = {}
        service_args['model'] = form.model.data
        if form.binarization.data:
            service_args['binarization'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        # Flush and refresh so the job is populated from the database
        # before its directories are created.
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            # Redirect back to this view; the generic '.service'
            # endpoint no longer exists.
            return make_response({'redirect_url': url_for('.transkribus_htr_pipeline')}, 500)  # noqa
        job_input = JobInput(
            filename=secure_filename(form.pdf.data.filename),
            job=job,
            mimetype=form.pdf.data.mimetype
        )
        db.session.add(job_input)
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.pdf.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': url_for('.transkribus_htr_pipeline')}, 500)  # noqa
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
@bp.route('/spacy-nlp-pipeline', methods=['GET', 'POST'])
@login_required
def spacy_nlp_pipeline():
    """Show the spaCy NLP pipeline form and create jobs from it.

    The service version is taken from the 'version' query argument and
    defaults to the manifest's latest version; an unknown version
    aborts with 404.

    On a submitted form:
      * validation errors are returned with status 400,
      * a Job and one JobInput (the uploaded text file) are created and
        the file is saved to the JobInput's path,
      * an OSError while creating the job directories or saving the
        file rolls the session back and answers with status 500 and a
        redirect URL back to this page,
      * on success the job is marked SUBMITTED, committed, and the
        response (status 201) carries a redirect URL to the job page.

    Otherwise the service template is rendered.
    """
    service = 'spacy-nlp-pipeline'
    service_manifest = SERVICES[service]
    # Use the already-bound manifest, like the sibling pipeline views.
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddSpacyNLPPipelineJobForm(prefix='add-job-form', version=version)
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        service_args = {}
        service_args['model'] = form.model.data
        if form.encoding_detection.data:
            service_args['encoding_detection'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        # Flush and refresh so the job is populated from the database
        # before its directories are created.
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            # Redirect back to this view; the generic '.service'
            # endpoint no longer exists.
            return make_response({'redirect_url': url_for('.spacy_nlp_pipeline')}, 500)  # noqa
        job_input = JobInput(
            filename=secure_filename(form.txt.data.filename),
            job=job,
            mimetype=form.txt.data.mimetype
        )
        db.session.add(job_input)
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.txt.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': url_for('.spacy_nlp_pipeline')}, 500)  # noqa
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
@bp.route('/corpus-analysis')
@login_required
def corpus_analysis():
return render_template(
'services/corpus_analysis.html.j2',
title='Corpus analysis'
) )

View File

@ -1,38 +1,70 @@
# TODO: This could also be done via GitLab/GitHub APIs # TODO: This could also be done via GitLab/GitHub APIs
#file-setup-pipeline: file-setup-pipeline:
file-setup:
name: 'File setup pipeline' name: 'File setup pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF'
latest_version: '0.1.0' latest_version: '0.1.0'
versions: versions:
0.1.0: 0.1.0:
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022 publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0' url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline/-/releases/v0.1.0'
#spacy-nlp-pipeline: tesseract-ocr-pipeline:
spacy-nlp: name: 'Tesseract OCR Pipeline'
name: 'spaCy NLP' publisher: 'Bielefeld University - CRC 1288 - INF'
latest_version: '0.1.4'
versions:
0.1.0:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.0'
0.1.1:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.1'
0.1.2:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.2'
0.1.3:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.3'
0.1.4:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.4'
transkribus-htr-pipeline:
name: 'Transkribus HTR Pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF'
latest_version: '0.1.0'
versions:
0.1.0:
methods:
- 'binarization'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/transkribus-htr-pipeline/-/releases/v0.1.0'
spacy-nlp-pipeline:
name: 'spaCy NLP Pipeline'
publisher: 'Bielefeld University - CRC 1288 - INF'
latest_version: '0.1.0' latest_version: '0.1.0'
versions: versions:
0.1.0: 0.1.0:
methods: methods:
- 'encoding_detection' - 'encoding_detection'
models: models:
ca: 'Catalan'
de: 'German' de: 'German'
el: 'Greek'
en: 'English' en: 'English'
es: 'Spanish'
fr: 'French'
it: 'Italian' it: 'Italian'
pl: 'Polish' pl: 'Polish'
ru: 'Russian'
zh: 'Chinese' zh: 'Chinese'
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022 publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0' url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.0'
#tesseract-ocr-pipeline:
tesseract-ocr:
name: 'Tesseract OCR'
latest_version: '0.1.0'
versions:
0.1.0:
methods:
- 'binarization'
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0'

View File

@ -28,20 +28,25 @@ $color: (
"darken": #6b3f89, "darken": #6b3f89,
"lighten": #ebe8f6 "lighten": #ebe8f6
), ),
"file-setup": ( "file-setup-pipeline": (
"base": #d5dc95, "base": #d5dc95,
"darken": #a1b300, "darken": #a1b300,
"lighten": #f2f3e1 "lighten": #f2f3e1
), ),
"spacy-nlp": ( "spacy-nlp-pipeline": (
"base": #98acd2, "base": #98acd2,
"darken": #0064a3, "darken": #0064a3,
"lighten": #e5e8f5 "lighten": #e5e8f5
), ),
"tesseract-ocr": ( "tesseract-ocr-pipeline": (
"base": #a9d8c8, "base": #a9d8c8,
"darken": #00a58b, "darken": #00a58b,
"lighten": #e7f4f1 "lighten": #e7f4f1
),
"transkribus-htr-pipeline": (
"base": #607d8b,
"darken": #37474f,
"lighten": #cfd8dc
) )
), ),
"status": ( "status": (

View File

@ -43,9 +43,10 @@ h1 .nopaque-icons, h2 .nopaque-icons, h3 .nopaque-icons, h4 .nopaque-icons, .tab
.job-status-text {text-transform: lowercase;} .job-status-text {text-transform: lowercase;}
.job-status-text[data-job-status]:empty:before {content: attr(data-job-status);} .job-status-text[data-job-status]:empty:before {content: attr(data-job-status);}
.nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";} .nopaque-icons.service-icon[data-service="file-setup-pipeline"]:empty:before {content: "E";}
.nopaque-icons.service-icon[data-service="tesseract-ocr"]:empty:before {content: "F";} .nopaque-icons.service-icon[data-service="tesseract-ocr-pipeline"]:empty:before {content: "F";}
.nopaque-icons.service-icon[data-service="spacy-nlp"]:empty:before {content: "G";} .nopaque-icons.service-icon[data-service="transkribus-htr-pipeline"]:empty:before {content: "F";}
.nopaque-icons.service-icon[data-service="spacy-nlp-pipeline"]:empty:before {content: "G";}
.nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";} .nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";}
.hoverable {cursor: pointer;} .hoverable {cursor: pointer;}

View File

@ -3,11 +3,13 @@
<h2>Roadmap</h2> <h2>Roadmap</h2>
<p>The roadmap guides you through nopaque's workflow! If you have the necessary input file formats, you can directly jump into the corresponding process. If not, you can use the roadmap to jump right to the preceding process.</p> <p>The roadmap guides you through nopaque's workflow! If you have the necessary input file formats, you can directly jump into the corresponding process. If not, you can use the roadmap to jump right to the preceding process.</p>
<ul class="tabs tabs-fixed-width"> <ul class="tabs tabs-fixed-width">
<li class="tab"><a{%if request.path == url_for('services.service', service='file-setup') %} class="active"{% endif %} href="{{ url_for('services.service', service='file-setup') }}" target="_self">File setup</a></li> <li class="tab"><a{%if request.path == url_for('services.file_setup_pipeline') %} class="active"{% endif %} href="{{ url_for('services.file_setup_pipeline') }}" target="_self">File setup</a></li>
<li class="tab disabled"><i class="material-icons">navigate_next</i></li> <li class="tab disabled"><i class="material-icons">navigate_next</i></li>
<li class="tab"><a{%if request.path == url_for('services.service', service='tesseract-ocr') %} class="active"{% endif %} href="{{ url_for('services.service', service='tesseract-ocr') }}" target="_self">OCR</a></li> <li class="tab"><a{%if request.path == url_for('services.tesseract_ocr_pipeline') %} class="active"{% endif %} href="{{ url_for('services.tesseract_ocr_pipeline') }}" target="_self">OCR</a></li>
<li class="tab disabled"><i class="material-icons">more_vert</i></li>
<li class="tab"><a{%if request.path == url_for('services.transkribus_htr_pipeline') %} class="active"{% endif %} href="{{ url_for('services.transkribus_htr_pipeline') }}" target="_self">HTR</a></li>
<li class="tab disabled"><i class="material-icons">navigate_next</i></li> <li class="tab disabled"><i class="material-icons">navigate_next</i></li>
<li class="tab"><a{%if request.path == url_for('services.service', service='spacy-nlp') %} class="active"{% endif %} href="{{ url_for('services.service', service='spacy-nlp') }}" target="_self">NLP</a></li> <li class="tab"><a{%if request.path == url_for('services.spacy_nlp_pipeline') %} class="active"{% endif %} href="{{ url_for('services.spacy_nlp_pipeline') }}" target="_self">NLP</a></li>
<li class="tab disabled"><i class="material-icons">navigate_next</i></li> <li class="tab disabled"><i class="material-icons">navigate_next</i></li>
<li class="tab"><a{%if request.path == url_for('corpora.add_corpus') %} class="active"{% endif %} href="{{ url_for('corpora.add_corpus') }}" target="_self">Add corpus</a></li> <li class="tab"><a{%if request.path == url_for('corpora.add_corpus') %} class="active"{% endif %} href="{{ url_for('corpora.add_corpus') }}" target="_self">Add corpus</a></li>
<li class="tab disabled"><i class="material-icons">navigate_next</i></li> <li class="tab disabled"><i class="material-icons">navigate_next</i></li>

View File

@ -14,10 +14,13 @@
<li><a href="{{ url_for('main.dashboard', _anchor='jobs') }}" style="padding-left: 47px;"><i class="nopaque-icons">J</i>My Jobs</a></li> <li><a href="{{ url_for('main.dashboard', _anchor='jobs') }}" style="padding-left: 47px;"><i class="nopaque-icons">J</i>My Jobs</a></li>
<li><div class="divider"></div></li> <li><div class="divider"></div></li>
<li><a class="subheader">Processes & Services</a></li> <li><a class="subheader">Processes & Services</a></li>
<li class="service-color service-color-border border-darken" data-service="file-setup" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='file-setup') }}"><i class="nopaque-icons service-icon" data-service="file-setup"></i>File setup</a></li> <li class="service-color service-color-border border-darken" data-service="file-setup-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.file_setup_pipeline') }}"><i class="nopaque-icons service-icon" data-service="file-setup-pipeline"></i>File setup</a></li>
<li class="service-color service-color-border border-darken" data-service="tesseract-ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='tesseract-ocr') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr"></i>OCR</a></li> <li class="service-color service-color-border border-darken" data-service="tesseract-ocr-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.tesseract_ocr_pipeline') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr-pipeline"></i>OCR</a></li>
<li class="service-color service-color-border border-darken" data-service="spacy-nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='spacy-nlp') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp"></i>NLP</a></li> {% if config.NOPAQUE_TRANSKRIBUS_ENABLED %}
<li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='corpus-analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li> <li class="service-color service-color-border border-darken" data-service="transkribus-htr-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.transkribus_htr_pipeline') }}"><i class="nopaque-icons service-icon" data-service="transkribus-htr-pipeline"></i>HTR</a></li>
{% endif %}
<li class="service-color service-color-border border-darken" data-service="spacy-nlp-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.spacy_nlp_pipeline') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp-pipeline"></i>NLP</a></li>
<li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.corpus_analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li>
<li><div class="divider"></div></li> <li><div class="divider"></div></li>
<li><a class="subheader">Account</a></li> <li><a class="subheader">Account</a></li>
<li><a href="{{ url_for('settings.index') }}"><i class="material-icons">settings</i>Settings</a></li> <li><a href="{{ url_for('settings.index') }}"><i class="material-icons">settings</i>Settings</a></li>

View File

@ -115,37 +115,37 @@
<div class="col s12 m4"> <div class="col s12 m4">
<div class="card-panel center-align hoverable"> <div class="card-panel center-align hoverable">
<br> <br>
<a href="{{ url_for('services.service', service='file-setup') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> <a href="{{ url_for('services.file_setup_pipeline') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
<i class="nopaque-icons service-color darken service-icon" data-service="file-setup"></i> <i class="nopaque-icons service-color darken service-icon" data-service="file-setup-pipeline"></i>
</a> </a>
<br><br> <br><br>
<p class="service-color-text darken" data-service="file-setup"><b>File setup</b></p> <p class="service-color-text darken" data-service="file-setup-pipeline"><b>File setup</b></p>
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.</p> <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.</p>
<a href="{{ url_for('services.service', service='file-setup') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup">Create Job</a> <a href="{{ url_for('services.file_setup_pipeline') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup-pipeline">Create Job</a>
</div> </div>
</div> </div>
<div class="col s12 m4"> <div class="col s12 m4">
<div class="card-panel center-align hoverable"> <div class="card-panel center-align hoverable">
<br> <br>
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> <a href="{{ url_for('services.tesseract_ocr_pipeline') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr" style="font-size: 2.5rem;"></i> <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr-pipeline" style="font-size: 2.5rem;"></i>
</a> </a>
<br><br> <br><br>
<p class="service-color-text darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p> <p class="service-color-text darken" data-service="tesseract-ocr-pipeline"><b>Optical Character Recognition</b></p>
<p class="light">nopaque converts your image data like photos or scans into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.</p> <p class="light">nopaque converts your image data like photos or scans into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.</p>
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr">Create Job</a> <a href="{{ url_for('services.tesseract_ocr_pipeline') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr-pipeline">Create Job</a>
</div> </div>
</div> </div>
<div class="col s12 m4"> <div class="col s12 m4">
<div class="card-panel center-align hoverable"> <div class="card-panel center-align hoverable">
<br> <br>
<a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> <a href="{{ url_for('services.spacy_nlp_pipeline') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp" style="font-size: 2.5rem;"></i> <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp-pipeline" style="font-size: 2.5rem;"></i>
</a> </a>
<br><br> <br><br>
<p class="service-color-text darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p> <p class="service-color-text darken" data-service="spacy-nlp-pipeline"><b>Natural Language Processing</b></p>
<p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p> <p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p>
<a href="{{ url_for('services.service', service='spacy-nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="spacy-nlp">Create Job</a> <a href="{{ url_for('services.spacy_nlp_pipeline') }}" class="waves-effect waves-light btn service-color darken" data-service="spacy-nlp-pipeline">Create Job</a>
</div> </div>
</div> </div>
</div> </div>

View File

@ -35,9 +35,9 @@
<p>Our source code is spread over multiple Git repositories.</p> <p>Our source code is spread over multiple Git repositories.</p>
<ul> <ul>
<li>nopaque frontend: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque</a></li> <li>nopaque frontend: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque</a></li>
<li>File setup: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup</a></li> <li>File Setup Pipeline: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline</a></li>
<li>OCR: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr</a></li> <li>Tesseract OCR Pipeline: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline</a></li>
<li>NLP: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp</a></li> <li>spaCy NLP Pipeline: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline</a></li>
<li>Corpus analysis backend: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver</a></li> <li>Corpus analysis backend: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver</a></li>
<li>Corpus analysis backend connector: <a href="https://github.com/Pevtrick/cqi-py" target="_blank">https://github.com/Pevtrick/cqi-py</a></li> <li>Corpus analysis backend connector: <a href="https://github.com/Pevtrick/cqi-py" target="_blank">https://github.com/Pevtrick/cqi-py</a></li>
</ul> </ul>

View File

@ -76,31 +76,31 @@
<p class="hide-on-small-only">&nbsp;</p> <p class="hide-on-small-only">&nbsp;</p>
<div class="row"> <div class="row">
<div class="col s12 m6 l3 center-align"> <div class="col s12 m6 l3 center-align">
<a href="{{ url_for('services.service', service='file-setup') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> <a href="{{ url_for('services.file_setup_pipeline') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="file-setup"></i> <i class="nopaque-icons service-color darken service-icon" data-service="file-setup-pipeline"></i>
</a> </a>
<br><br> <br><br>
<p class="service-color-text text-darken" data-service="file-setup"><b>File setup</b></p> <p class="service-color-text text-darken" data-service="file-setup-pipeline"><b>File setup</b></p>
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p> <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p>
</div> </div>
<div class="col s12 m6 l3 center-align"> <div class="col s12 m6 l3 center-align">
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> <a href="{{ url_for('services.tesseract_ocr_pipeline') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i> <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr-pipeline"></i>
</a> </a>
<br><br> <br><br>
<p class="service-color-text text-darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p> <p class="service-color-text text-darken" data-service="tesseract-ocr-pipeline"><b>Optical Character Recognition</b></p>
<p class="light">nopaque converts your image data like photos or scans into text data through OCR making it machine readable. This step enables you to proceed with further computational analysis of your documents.</p> <p class="light">nopaque converts your image data like photos or scans into text data through OCR making it machine readable. This step enables you to proceed with further computational analysis of your documents.</p>
</div> </div>
<div class="col s12 m6 l3 center-align"> <div class="col s12 m6 l3 center-align">
<a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> <a href="{{ url_for('services.spacy_nlp_pipeline') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i> <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp-pipeline"></i>
</a> </a>
<br><br> <br><br>
<p class="service-color-text text-darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p> <p class="service-color-text text-darken" data-service="spacy-nlp-pipeline"><b>Natural Language Processing</b></p>
<p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p> <p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p>
</div> </div>
<div class="col s12 m6 l3 center-align"> <div class="col s12 m6 l3 center-align">
<a href="{{ url_for('services.service', service='corpus_analysis') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> <a href="{{ url_for('services.corpus_analysis') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="corpus-analysis"></i> <i class="nopaque-icons service-color darken service-icon" data-service="corpus-analysis"></i>
</a> </a>
<br><br> <br><br>

View File

@ -2,13 +2,15 @@
<li class="tab disabled"><i class="material-icons">navigate_next</i></li> <li class="tab disabled"><i class="material-icons">navigate_next</i></li>
<li class="tab"><a href="{{ url_for('main.index', _anchor='services') }}" target="_self">Processes & Services</a></li> <li class="tab"><a href="{{ url_for('main.index', _anchor='services') }}" target="_self">Processes & Services</a></li>
<li class="tab disabled"><i class="material-icons">navigate_next</i></li> <li class="tab disabled"><i class="material-icons">navigate_next</i></li>
{% if request.path == url_for('.service', service='corpus-analysis') %} {% if request.path == url_for('.corpus_analysis') %}
<li class="tab"><a class="active" href="{{ url_for('.service', service='corpus-analysis') }}" target="_self">{{ title }}</a></li> <li class="tab"><a class="active" href="{{ url_for('.corpus_analysis') }}" target="_self">{{ title }}</a></li>
{% elif request.path == url_for('.service', service='file-setup') %} {% elif request.path == url_for('.file_setup_pipeline') %}
<li class="tab"><a class="active" href="{{ url_for('.service', service='file-setup') }}" target="_self">{{ title }}</a></li> <li class="tab"><a class="active" href="{{ url_for('.file_setup_pipeline') }}" target="_self">{{ title }}</a></li>
{% elif request.path == url_for('.service', service='nlp') %} {% elif request.path == url_for('.spacy_nlp_pipeline') %}
<li class="tab"><a class="active" href="{{ url_for('.service', service='nlp') }}" target="_self">{{ title }}</a></li> <li class="tab"><a class="active" href="{{ url_for('.spacy_nlp_pipeline') }}" target="_self">{{ title }}</a></li>
{% elif request.path == url_for('.service', service='ocr') %} {% elif request.path == url_for('.tesseract_ocr_pipeline') %}
<li class="tab"><a class="active" href="{{ url_for('.service', service='ocr') }}" target="_self">{{ title }}</a></li> <li class="tab"><a class="active" href="{{ url_for('.tesseract_ocr_pipeline') }}" target="_self">{{ title }}</a></li>
{% elif request.path == url_for('.transkribus_htr_pipeline') %}
<li class="tab"><a class="active" href="{{ url_for('.transkribus_htr_pipeline') }}" target="_self">{{ title }}</a></li>
{% endif %} {% endif %}
{% endset %} {% endset %}

View File

@ -2,7 +2,7 @@
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
{% import "materialize/wtf.html.j2" as wtf %} {% import "materialize/wtf.html.j2" as wtf %}
{% block main_attribs %} class="service-scheme" data-service="file-setup"{% endblock main_attribs %} {% block main_attribs %} class="service-scheme" data-service="file-setup-pipeline"{% endblock main_attribs %}
{% block page_content %} {% block page_content %}
<div class="container"> <div class="container">
@ -16,13 +16,13 @@
<p class="hide-on-small-only">&nbsp;</p> <p class="hide-on-small-only">&nbsp;</p>
<p class="hide-on-small-only">&nbsp;</p> <p class="hide-on-small-only">&nbsp;</p>
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="file-setup"></i> <i class="nopaque-icons service-color darken service-icon" data-service="file-setup-pipeline"></i>
</a> </a>
</div> </div>
</div> </div>
<div class="col s12 m9 pull-m3"> <div class="col s12 m9 pull-m3">
<div class="card service-color-border border-darken" data-service="file-setup" style="border-top: 10px solid;"> <div class="card service-color-border border-darken" data-service="file-setup-pipeline" style="border-top: 10px solid;">
<div class="card-content"> <div class="card-content">
<div class="row"> <div class="row">
<div class="col s12"> <div class="col s12">
@ -50,7 +50,7 @@
{{ wtf.render_field(form.description, data_length='255', material_icon='description') }} {{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
</div> </div>
<div class="col s12 l9"> <div class="col s12 l9">
{{ wtf.render_field(form.files, accept='image/jpeg, image/png, image/tiff', placeholder='Choose your .jpeg, .png or .tiff files') }} {{ wtf.render_field(form.images, accept='image/jpeg, image/png, image/tiff', placeholder='Choose JPEG, PNG or TIFF files') }}
</div> </div>
<div class="col s12 l3"> <div class="col s12 l3">
{{ wtf.render_field(form.version, material_icon='apps') }} {{ wtf.render_field(form.version, material_icon='apps') }}

View File

@ -2,7 +2,7 @@
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
{% import "materialize/wtf.html.j2" as wtf %} {% import "materialize/wtf.html.j2" as wtf %}
{% block main_attribs %} class="service-scheme" data-service="spacy-nlp"{% endblock main_attribs %} {% block main_attribs %} class="service-scheme" data-service="spacy-nlp-pipeline"{% endblock main_attribs %}
{% block page_content %} {% block page_content %}
<div class="container"> <div class="container">
@ -16,13 +16,13 @@
<p class="hide-on-small-only">&nbsp;</p> <p class="hide-on-small-only">&nbsp;</p>
<p class="hide-on-small-only">&nbsp;</p> <p class="hide-on-small-only">&nbsp;</p>
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i> <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp-pipeline"></i>
</a> </a>
</div> </div>
</div> </div>
<div class="col s12 m9 pull-m3"> <div class="col s12 m9 pull-m3">
<div class="card service-color-border border-darken" data-service="spacy-nlp" style="border-top: 10px solid;"> <div class="card service-color-border border-darken" data-service="spacy-nlp-pipeline" style="border-top: 10px solid;">
<div class="card-content"> <div class="card-content">
<div class="row"> <div class="row">
<div class="col s12 m6"> <div class="col s12 m6">
@ -68,7 +68,7 @@
{{ wtf.render_field(form.description, data_length='255', material_icon='description') }} {{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
</div> </div>
<div class="col s12 l5"> <div class="col s12 l5">
{{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }} {{ wtf.render_field(form.txt, accept='text/plain', placeholder='Choose a plain text file') }}
</div> </div>
<div class="col s12 l4"> <div class="col s12 l4">
{{ wtf.render_field(form.model, material_icon='language') }} {{ wtf.render_field(form.model, material_icon='language') }}

View File

@ -2,7 +2,7 @@
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
{% import "materialize/wtf.html.j2" as wtf %} {% import "materialize/wtf.html.j2" as wtf %}
{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr"{% endblock main_attribs %} {% block main_attribs %} class="service-scheme" data-service="tesseract-ocr-pipeline"{% endblock main_attribs %}
{% block page_content %} {% block page_content %}
<div class="container"> <div class="container">
@ -16,13 +16,13 @@
<p class="hide-on-small-only">&nbsp;</p> <p class="hide-on-small-only">&nbsp;</p>
<p class="hide-on-small-only">&nbsp;</p> <p class="hide-on-small-only">&nbsp;</p>
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i> <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr-pipeline"></i>
</a> </a>
</div> </div>
</div> </div>
<div class="col s12 m9 pull-m3"> <div class="col s12 m9 pull-m3">
<div class="card service-color-border border-darken" data-service="tesseract-ocr" style="border-top: 10px solid;"> <div class="card service-color-border border-darken" data-service="tesseract-ocr-pipeline" style="border-top: 10px solid;">
<div class="card-content"> <div class="card-content">
<div class="row"> <div class="row">
<div class="col s12"> <div class="col s12">
@ -50,7 +50,7 @@
{{ wtf.render_field(form.description, data_length='255', material_icon='description') }} {{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
</div> </div>
<div class="col s12 l5"> <div class="col s12 l5">
{{ wtf.render_field(form.files, accept='application/pdf', placeholder='Choose your .pdf files') }} {{ wtf.render_field(form.pdf, accept='application/pdf', placeholder='Choose a PDF file') }}
</div> </div>
<div class="col s12 l4"> <div class="col s12 l4">
{{ wtf.render_field(form.model, material_icon='language') }} {{ wtf.render_field(form.model, material_icon='language') }}

View File

@ -0,0 +1,169 @@
{% extends "base.html.j2" %}
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
{% import "materialize/wtf.html.j2" as wtf %}
{% block main_attribs %} class="service-scheme" data-service="transkribus-htr-pipeline"{% endblock main_attribs %}
{% block page_content %}
<div class="container">
<div class="row">
<div class="col s12">
<h1 id="title">{{ title }}</h1>
</div>
<div class="col s12 m3 push-m9">
<div class="center-align">
<p class="hide-on-small-only">&nbsp;</p>
<p class="hide-on-small-only">&nbsp;</p>
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="transkribus-htr-pipeline"></i>
</a>
</div>
</div>
<div class="col s12 m9 pull-m3">
<div class="card service-color-border border-darken" data-service="transkribus-htr-pipeline" style="border-top: 10px solid;">
<div class="card-content">
<div class="row">
<div class="col s12">
<div class="card-panel z-depth-0">
<span class="card-title"><i class="left material-icons">layers</i>HTR</span>
<p>In this process, nopaque converts your image data like photos or scans into text data. This step enables you to proceed with the computational analysis of your documents.</p>
<p class="right-align">
<a href="https://readcoop.eu/de/transkribus/" target="_blank">
<img src="https://readcoop.eu/wp-content/uploads/2020/02/Logo_Transkribus_web.svg" title="Logo_Transkribus_web" alt="Logo_Transkribus_web" style="width: 30%;">
</a>
</p>
</div>
</div>
</div>
</div>
</div>
</div>
<div class="col s12">
<h2>Submit a job</h2>
<div class="card">
<form class="nopaque-upload-form" data-progress-modal="progress-modal">
<div class="card-content">
{{ form.hidden_tag() }}
<div class="row">
<div class="col s12 l4">
{{ wtf.render_field(form.title, data_length='32', material_icon='title') }}
</div>
<div class="col s12 l8">
{{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
</div>
<div class="col s12 l5">
{{ wtf.render_field(form.pdf, accept='application/pdf', placeholder='Choose a PDF file') }}
</div>
<div class="col s12 l4">
{{ wtf.render_field(form.model, material_icon='language') }}
</div>
<div class="col s12 l3">
{{ wtf.render_field(form.version, material_icon='apps') }}
</div>
<div class="col s12">
<span class="card-title">Preprocessing</span>
</div>
<div class="col s9">
<p>{{ form.binarization.label.text }}</p>
<p class="light">Based on a brightness threshold, pixels are converted into either black or white. It is useful to reduce noise in images. (<b>longer duration</b>)</p>
</div>
<div class="col s3 right-align">
<div class="switch">
<label>
{{ form.binarization() }}
<span class="lever"></span>
</label>
</div>
</div>
<div class="col s12"><p>&nbsp;</p></div>
<div class="col s12 divider"></div>
<div class="col s12"><p>&nbsp;</p></div>
<div class="col s9">
<p>Page range</p>
<p class="light"></p>
</div>
<div class="col s3 right-align">
<div class="switch">
<label>
<input disabled type="checkbox">
<span class="lever"></span>
</label>
</div>
</div>
<div class="col s12"><p>&nbsp;</p></div>
<div class="col s12 divider"></div>
<div class="col s12"><p>&nbsp;</p></div>
<div class="col s9">
<p>Page rotation</p>
<p class="light"></p>
</div>
<div class="col s3 right-align">
<div class="switch">
<label>
<input disabled type="checkbox">
<span class="lever"></span>
</label>
</div>
</div>
<div class="col s12"><p>&nbsp;</p></div>
<div class="col s12 divider"></div>
<div class="col s12"><p>&nbsp;</p></div>
<div class="col s9">
<p>Page split</p>
<p class="light"></p>
</div>
<div class="col s3 right-align">
<div class="switch">
<label>
<input disabled type="checkbox">
<span class="lever"></span>
</label>
</div>
</div>
<!--
Separate each setting with the following
<div class="col s12"><p>&nbsp;</p></div>
<div class="col s12 divider"></div>
<div class="col s12"><p>&nbsp;</p></div>
-->
</div>
</div>
<div class="card-action right-align">
{{ wtf.render_field(form.submit, material_icon='send') }}
</div>
</form>
</div>
</div>
</div>
</div>
{% endblock page_content %}
{% block modals %}
{{ super() }}
<div id="progress-modal" class="modal">
<div class="modal-content">
<h4><i class="material-icons left">file_upload</i>Uploading files...</h4>
<div class="progress">
<div class="determinate" style="width: 0%"></div>
</div>
</div>
<div class="modal-footer">
<a href="#!" class="modal-close waves-effect waves-light btn red abort-request">Cancel</a>
</div>
</div>
{% endblock modals %}
{% block scripts %}
{{ super() }}
<script>
let versionField = document.querySelector('#add-job-form-version');
versionField.addEventListener('change', (event) => {
let url = new URL(window.location.href);
url.search = `?version=${event.target.value}`;
window.location.href = url.toString();
});
</script>
{% endblock scripts %}

View File

@ -92,6 +92,11 @@ class Config:
NOPAQUE_PROXY_FIX_X_PROTO = \ NOPAQUE_PROXY_FIX_X_PROTO = \
int(os.environ.get('NOPAQUE_PROXY_FIX_X_PROTO', '0')) int(os.environ.get('NOPAQUE_PROXY_FIX_X_PROTO', '0'))
NOPAQUE_TRANSKRIBUS_ENABLED = \
os.environ.get('NOPAQUE_TRANSKRIBUS_ENABLED', 'true').lower() == 'true'
NOPAQUE_READCOOP_USERNAME = os.environ.get('NOPAQUE_READCOOP_USERNAME')
NOPAQUE_READCOOP_PASSWORD = os.environ.get('NOPAQUE_READCOOP_PASSWORD')
@classmethod @classmethod
def init_app(cls, app: Flask): def init_app(cls, app: Flask):
# Set up logging according to the corresponding (NOPAQUE_LOG_*) # Set up logging according to the corresponding (NOPAQUE_LOG_*)

View File

@ -18,13 +18,15 @@ services:
- "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http" - "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http"
- "traefik.http.routers.http-nopaque.entrypoints=http" - "traefik.http.routers.http-nopaque.entrypoints=http"
- "traefik.http.routers.http-nopaque.middlewares=http-nopaque-headers, redirect-to-https@file" - "traefik.http.routers.http-nopaque.middlewares=http-nopaque-headers, redirect-to-https@file"
- "traefik.http.routers.http-nopaque.rule=Host(`${SERVER_NAME}`)" # Replace <nopaque-domain> with your domain
- "traefik.http.routers.http-nopaque.rule=Host(`<nopaque-domain>`)"
### </http> ### ### </http> ###
### <https> ### ### <https> ###
- "traefik.http.middlewares.https-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=https" - "traefik.http.middlewares.https-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=https"
- "traefik.http.routers.https-nopaque.entrypoints=https" - "traefik.http.routers.https-nopaque.entrypoints=https"
- "traefik.http.routers.https-nopaque.middlewares=hsts-header@file, https-nopaque-headers" - "traefik.http.routers.https-nopaque.middlewares=hsts-header@file, https-nopaque-headers"
- "traefik.http.routers.https-nopaque.rule=Host(`${SERVER_NAME}`)" # Replace <nopaque-domain> with your domain
- "traefik.http.routers.https-nopaque.rule=Host(`<nopaque-domain>`)"
- "traefik.http.routers.https-nopaque.tls.certresolver=<CERTRESOLVER>" - "traefik.http.routers.https-nopaque.tls.certresolver=<CERTRESOLVER>"
- "traefik.http.routers.https-nopaque.tls.options=intermediate@file" - "traefik.http.routers.https-nopaque.tls.options=intermediate@file"
### </https> ### ### </https> ###

View File

@ -1,8 +1,8 @@
"""empty message """empty message
Revision ID: 097aae1f02d7 Revision ID: aa855b80cf1d
Revises: Revises:
Create Date: 2022-02-08 10:02:03.748588 Create Date: 2022-04-01 12:14:42.606685
""" """
from alembic import op from alembic import op
@ -10,7 +10,7 @@ import sqlalchemy as sa
# revision identifiers, used by Alembic. # revision identifiers, used by Alembic.
revision = '097aae1f02d7' revision = 'aa855b80cf1d'
down_revision = None down_revision = None
branch_labels = None branch_labels = None
depends_on = None depends_on = None
@ -56,7 +56,6 @@ def upgrade():
sa.Column('title', sa.String(length=32), nullable=True), sa.Column('title', sa.String(length=32), nullable=True),
sa.Column('num_analysis_sessions', sa.Integer(), nullable=True), sa.Column('num_analysis_sessions', sa.Integer(), nullable=True),
sa.Column('num_tokens', sa.Integer(), nullable=True), sa.Column('num_tokens', sa.Integer(), nullable=True),
sa.Column('archive_file', sa.String(length=255), nullable=True),
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ), sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
sa.PrimaryKeyConstraint('id') sa.PrimaryKeyConstraint('id')
) )
@ -85,6 +84,7 @@ def upgrade():
sa.Column('description', sa.String(length=255), nullable=True), sa.Column('description', sa.String(length=255), nullable=True),
sa.Column('publisher', sa.String(length=128), nullable=True), sa.Column('publisher', sa.String(length=128), nullable=True),
sa.Column('publishing_year', sa.Integer(), nullable=True), sa.Column('publishing_year', sa.Integer(), nullable=True),
sa.Column('shared', sa.Boolean(), nullable=True),
sa.Column('title', sa.String(length=64), nullable=True), sa.Column('title', sa.String(length=64), nullable=True),
sa.Column('version', sa.String(length=16), nullable=True), sa.Column('version', sa.String(length=16), nullable=True),
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ), sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),

View File

@ -2,6 +2,7 @@ cqi
docker docker
eventlet==0.30.2 eventlet==0.30.2
Flask==1.1.4 Flask==1.1.4
Flask-APScheduler
Flask-Assets Flask-Assets
Flask-Hashids Flask-Hashids
Flask-HTTPAuth Flask-HTTPAuth
@ -16,6 +17,7 @@ Flask-WTF
gunicorn gunicorn
hiredis hiredis
jsonschema jsonschema
MarkupSafe==2.0.1
psycopg2 psycopg2
pyScss pyScss
python-dotenv python-dotenv