mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-01-24 00:30:35 +00:00
Rename all services, use scss, cleanup, add sandpaper conversion script
This commit is contained in:
parent
8fd59f8078
commit
ce997e69ea
8
.env.tpl
8
.env.tpl
@ -168,3 +168,11 @@ NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI=
|
||||
# DEFAULT: 0
|
||||
# Number of values to trust for X-Forwarded-Proto
|
||||
# NOPAQUE_PROXY_FIX_X_PROTO=
|
||||
|
||||
# CHOOSE ONE: False, True
|
||||
# DEFAULT: False
|
||||
# NOPAQUE_TRANSKRIBUS_ENABLED=
|
||||
|
||||
# READ-COOP account data: https://readcoop.eu/
|
||||
# NOPAQUE_READCOOP_USERNAME=
|
||||
# NOPAQUE_READCOOP_PASSWORD=
|
3
.gitignore
vendored
3
.gitignore
vendored
@ -18,7 +18,8 @@ data/**
|
||||
pip-log.txt
|
||||
|
||||
# Logs in log folder
|
||||
logs/*.log
|
||||
logs/*
|
||||
!logs/dummy
|
||||
|
||||
# Packages
|
||||
*.egg
|
||||
|
@ -1,7 +1,7 @@
|
||||
FROM python:3.9.8-slim-buster
|
||||
FROM python:3.8.13-slim-buster
|
||||
|
||||
|
||||
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"
|
||||
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"
|
||||
|
||||
|
||||
ARG DOCKER_GID
|
||||
|
File diff suppressed because it is too large
Load Diff
0
app/TranskribusHTRModel.defaults.yml
Normal file
0
app/TranskribusHTRModel.defaults.yml
Normal file
@ -1,5 +1,6 @@
|
||||
from config import Config
|
||||
from flask import Flask
|
||||
from flask_apscheduler import APScheduler
|
||||
from flask_login import LoginManager
|
||||
from flask_mail import Mail
|
||||
from flask_migrate import Migrate
|
||||
@ -20,6 +21,7 @@ mail: Mail = Mail()
|
||||
migrate: Migrate = Migrate()
|
||||
paranoid: Paranoid = Paranoid()
|
||||
paranoid.redirect_view = '/'
|
||||
scheduler: APScheduler = APScheduler() # TODO: Use this!
|
||||
socketio: SocketIO = SocketIO()
|
||||
|
||||
|
||||
|
68
app/cli.py
68
app/cli.py
@ -2,9 +2,8 @@ from flask import current_app
|
||||
from flask_migrate import upgrade
|
||||
from . import db
|
||||
from .models import Corpus, Job, Role, User, TesseractOCRModel
|
||||
import json
|
||||
import click
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
def _make_default_dirs():
|
||||
@ -56,6 +55,19 @@ def register(app):
|
||||
daemon: Daemon = Daemon()
|
||||
daemon.run()
|
||||
|
||||
@app.cli.group()
|
||||
def converter():
|
||||
''' Converter commands. '''
|
||||
pass
|
||||
|
||||
@converter.command()
|
||||
@click.argument('json_db')
|
||||
@click.argument('data_dir')
|
||||
def sandpaper(json_db, data_dir):
|
||||
''' Sandpaper converter '''
|
||||
from app.converters.sandpaper import convert
|
||||
convert(json_db, data_dir)
|
||||
|
||||
@app.cli.group()
|
||||
def test():
|
||||
''' Test commands. '''
|
||||
@ -68,55 +80,3 @@ def register(app):
|
||||
from unittest.suite import TestSuite
|
||||
tests: TestSuite = TestLoader().discover('tests')
|
||||
TextTestRunner(verbosity=2).run(tests)
|
||||
|
||||
@app.cli.group()
|
||||
def convert():
|
||||
''' Datebase convert commands. '''
|
||||
|
||||
@convert.command()
|
||||
def nlp_jobs():
|
||||
for job in Job.query.filter_by(service='nlp').all():
|
||||
job.service = 'spacy-nlp'
|
||||
service_args = json.loads(job.service_args)
|
||||
new_service_args = {}
|
||||
for service_arg in service_args:
|
||||
if service_arg == '--check-encoding':
|
||||
new_service_args['encoding_detection'] = True
|
||||
elif re.match(r'-l ([a-z]{2})', service_arg):
|
||||
language_code = re.search(r'-l ([a-z]{2})', service_arg).group(1) # noqa
|
||||
new_service_args['language'] = language_code
|
||||
job.service_args = json.dumps(new_service_args)
|
||||
db.session.commit()
|
||||
|
||||
@convert.command()
|
||||
def ocr_jobs():
|
||||
# Language code to TesseractOCRModel.title lookup
|
||||
language_code_lookup = {
|
||||
'ara': 'Arabic',
|
||||
'chi_tra': 'Chinese - Traditional',
|
||||
'dan': 'Danish',
|
||||
'eng': 'English',
|
||||
'enm': 'English, Middle (1100-1500)',
|
||||
'fra': 'French',
|
||||
'frm': 'French, Middle (ca. 1400-1600)',
|
||||
'deu': 'German',
|
||||
'frk': 'German Fraktur',
|
||||
'ell': 'Greek, Modern (1453-)',
|
||||
'ita': 'Italian',
|
||||
'por': 'Portuguese',
|
||||
'rus': 'Russian',
|
||||
'spa': 'Spanish; Castilian'
|
||||
}
|
||||
for job in Job.query.filter_by(service='ocr').all():
|
||||
job.service = 'tesseract-ocr'
|
||||
service_args = json.loads(job.service_args)
|
||||
new_service_args = {}
|
||||
for service_arg in service_args:
|
||||
if service_arg == '--binarize':
|
||||
new_service_args['binarization'] = True
|
||||
elif re.match(r'-l ([a-z]{3})', service_arg):
|
||||
language_code = re.search(r'-l ([a-z]{3})', service_arg).group(1) # noqa
|
||||
tesseract_ocr_model = TesseractOCRModel.query.filter_by(title=language_code_lookup[language_code]).first() # noqa
|
||||
new_service_args['model'] = tesseract_ocr_model.id
|
||||
job.service_args = json.dumps(new_service_args)
|
||||
db.session.commit()
|
||||
|
0
app/converters/__init__.py
Normal file
0
app/converters/__init__.py
Normal file
215
app/converters/sandpaper.py
Normal file
215
app/converters/sandpaper.py
Normal file
@ -0,0 +1,215 @@
|
||||
from flask import current_app
|
||||
from app import db
|
||||
from app.models import User, Corpus, CorpusFile
|
||||
from datetime import datetime
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
def convert(json_db_file, data_dir):
    '''
    Import the users of a sandpaper JSON database export.

    json_db_file: path of a JSON file holding an iterable of user objects.
    data_dir: directory that contains one sub directory per user id.

    Users whose 'confirmed' entry is falsy are skipped. Every other user is
    handed to convert_user together with its directory below data_dir.
    '''
    with open(json_db_file, 'r') as f:
        json_db = json.load(f)

    for json_user in json_db:
        if not json_user['confirmed']:
            current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
            continue
        # os.path.join only accepts str, bytes or os.PathLike components. An
        # id decoded from JSON may be a number, so coerce it explicitly.
        user_dir = os.path.join(data_dir, str(json_user['id']))
        convert_user(json_user, user_dir)
    # NOTE(review): the commit is issued once after the loop; the helper
    # functions roll back and raise on failure - confirm this placement.
    db.session.commit()
|
||||
|
||||
|
||||
def convert_user(json_user, user_dir):
    '''
    Create a User from a sandpaper user object and convert its corpora.

    json_user: decoded JSON object of one user.
    user_dir: directory of this user inside the sandpaper data directory.

    The user is added to the session, flushed and refreshed before
    user.makedirs() is called. Corpora without files are skipped, all
    others are passed to convert_corpus.

    Raises Exception('Internal Server Error') (chained to the OSError) after
    rolling back the session if user.makedirs() fails.
    '''
    current_app.logger.info(f'Create User {json_user["username"]}...')
    user = User(
        confirmed=json_user['confirmed'],
        email=json_user['email'],
        last_seen=datetime.fromtimestamp(json_user['last_seen']),
        member_since=datetime.fromtimestamp(json_user['member_since']),
        password_hash=json_user['password_hash'],  # TODO: Needs to be added manually
        username=json_user['username']
    )
    db.session.add(user)
    db.session.flush(objects=[user])
    db.session.refresh(user)
    try:
        user.makedirs()
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        # Keep the original OSError attached as the explicit cause.
        raise Exception('Internal Server Error') from e
    for json_corpus in json_user['corpora'].values():
        if not json_corpus['files'].values():
            current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
            continue
        # os.path.join rejects non string components; ids decoded from JSON
        # may be numbers, so coerce explicitly.
        corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id']))
        convert_corpus(json_corpus, user, corpus_dir)
    current_app.logger.info('Done')
|
||||
|
||||
|
||||
def convert_corpus(json_corpus, user, corpus_dir):
    '''
    Create a Corpus for user from a sandpaper corpus object.

    json_corpus: decoded JSON object of one corpus.
    user: the User that becomes the owner of the new Corpus.
    corpus_dir: directory of this corpus inside the sandpaper data directory.

    The corpus is added to the session, flushed and refreshed before
    corpus.makedirs() is called. Every entry of json_corpus['files'] is then
    passed to convert_corpus_file.

    Raises Exception('Internal Server Error') (chained to the OSError) after
    rolling back the session if corpus.makedirs() fails.
    '''
    current_app.logger.info(f'Create Corpus {json_corpus["title"]}...')
    corpus = Corpus(
        user=user,
        creation_date=datetime.fromtimestamp(json_corpus['creation_date']),
        description=json_corpus['description'],
        last_edited_date=datetime.fromtimestamp(json_corpus['last_edited_date']),
        title=json_corpus['title']
    )
    db.session.add(corpus)
    db.session.flush(objects=[corpus])
    db.session.refresh(corpus)
    try:
        corpus.makedirs()
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
        # Keep the original OSError attached as the explicit cause.
        raise Exception('Internal Server Error') from e
    for json_corpus_file in json_corpus['files'].values():
        # os.path.join rejects non string components; ids decoded from JSON
        # may be numbers, so coerce explicitly.
        corpus_file_dir = os.path.join(corpus_dir, 'files', str(json_corpus_file['id']))
        convert_corpus_file(json_corpus_file, corpus, corpus_file_dir)
    current_app.logger.info('Done')
|
||||
|
||||
|
||||
def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
    '''
    Create a CorpusFile for corpus and convert its VRT file.

    json_corpus_file: decoded JSON object of one corpus file.
    corpus: the Corpus the new CorpusFile belongs to.
    corpus_file_dir: directory that holds the legacy file named
        json_corpus_file['filename'].

    The bibliographic fields are copied unchanged, the mimetype is always
    'application/vrt+xml'. After the CorpusFile was flushed and refreshed,
    the legacy file is converted by convert_vrt into corpus_file.path.

    Raises Exception('Internal Server Error') after rolling back the session
    if convert_vrt raises an OSError.
    '''
    current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
    # Fields that are taken over one to one from the JSON object
    copied_fields = (
        'address',
        'author',
        'booktitle',
        'chapter',
        'editor',
        'filename',
        'institution',
        'journal',
        'pages',
        'publisher',
        'publishing_year',
        'school',
        'title'
    )
    metadata = {name: json_corpus_file[name] for name in copied_fields}
    corpus_file = CorpusFile(
        corpus=corpus,
        mimetype='application/vrt+xml',
        **metadata
    )
    session = db.session
    session.add(corpus_file)
    session.flush(objects=[corpus_file])
    session.refresh(corpus_file)
    try:
        legacy_vrt_path = os.path.join(
            corpus_file_dir,
            json_corpus_file['filename']
        )
        convert_vrt(legacy_vrt_path, corpus_file.path)
    except OSError as e:
        current_app.logger.error(e)
        session.rollback()
        raise Exception('Internal Server Error')
    current_app.logger.info('Done')
|
||||
|
||||
|
||||
def convert_vrt(input_file, output_file):
    '''
    Convert a legacy VRT file to the column order word, pos, lemma,
    simple_pos.

    input_file: path of the VRT file to read.
    output_file: path the converted VRT text is written to.

    Conversion rules:
    - Blank lines and <corpus ...>, </corpus> and <nlp ...> lines are dropped.
    - <text ...> and <s ...> start tags are written without attributes.
    - All other lines starting with '<' are copied unchanged.
    - Token lines are reduced to the four columns word, pos, lemma and
      simple_pos (tab separated).
    - If the input has a fifth 'ner' column but no <ent ...> lines,
      <ent type="..."> ... </ent> elements are generated from that column.
      The values 'null' and 'none' (case insensitive) mean "no entity".

    Raises Exception('Can not handle format') if the column order can not
    be detected.
    '''
    # The following column orders are possible:
    # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
    # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
    # since 27.01.2022: 'word,pos,lemma,simple_pos'
    order_2019 = ['word', 'lemma', 'simple_pos', 'pos', 'ner']
    order_2021 = ['word', 'pos', 'lemma', 'simple_pos', 'ner']
    order_2022 = ['word', 'pos', 'lemma', 'simple_pos']
    simple_pos_labels = frozenset([
        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
        'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
        'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
        'VERB', 'X'
    ])

    def detect_pos_attr_order(vrt_lines):
        # Find out which order is used by looking at the number of columns
        # and the position of the simple_pos column. The first token line
        # that allows a decision wins.
        for vrt_line in vrt_lines:
            if vrt_line.startswith('<'):
                continue
            columns = vrt_line.rstrip('\n').split('\t')
            if len(columns) == 4:
                if columns[3] in simple_pos_labels:
                    return order_2022
            elif len(columns) == 5:
                if columns[2] in simple_pos_labels:
                    return order_2019
                if columns[3] in simple_pos_labels:
                    return order_2021
        return None

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = detect_pos_attr_order(input_vrt_lines)
    has_ent_as_s_attr = any(
        line.startswith('<ent') for line in input_vrt_lines
    )

    # Fail with the intended error before pos_attr_order is used any further
    if pos_attr_order is None:
        raise Exception('Can not handle format')

    print(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    print(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    # Input column indexes in the order they are written to the output
    if pos_attr_order == order_2019:
        output_columns = (0, 3, 1, 2)
    else:
        output_columns = (0, 1, 2, 3)
    # Entities can only be derived if a ner column exists at all
    derive_ents = not has_ent_as_s_attr and 'ner' in pos_attr_order

    current_ent = None
    output_parts = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # A structural line always ends a generated entity
            if current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            if line.startswith(('<corpus', '</corpus', '<nlp')):
                continue
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            else:
                output_parts.append(line)
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if derive_ents:
            ent = pos_attrs[4]
            if ent.lower() in ('null', 'none'):
                ent = None
            if ent != current_ent:
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                if ent is not None:
                    output_parts.append(f'<ent type="{ent}">\n')
                current_ent = ent
        output_parts.append(
            '\t'.join(pos_attrs[i] for i in output_columns) + '\n'
        )
    # Do not leave an entity open if the input ends with a token line
    if current_ent is not None:
        output_parts.append('</ent>\n')

    with open(output_file, 'w') as f:
        f.write(''.join(output_parts))
|
@ -319,7 +319,7 @@ def corpus_file(corpus_id, corpus_file_id):
|
||||
form.title.data = corpus_file.title
|
||||
return render_template(
|
||||
'corpora/corpus_file.html.j2',
|
||||
corpus=corpus,
|
||||
corpus=corpus_file.corpus,
|
||||
corpus_file=corpus_file,
|
||||
form=form,
|
||||
title='Edit corpus file'
|
||||
|
@ -22,34 +22,46 @@ class CheckJobsMixin:
|
||||
def create_job_service(self, job):
|
||||
''' # Docker service settings # '''
|
||||
''' ## Service specific settings ## '''
|
||||
if job.service == 'file-setup':
|
||||
if job.service == 'file-setup-pipeline':
|
||||
mem_mb = 512
|
||||
n_cores = 2
|
||||
executable = 'file-setup'
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:v{job.service_version}' # noqa
|
||||
elif job.service == 'tesseract-ocr':
|
||||
mem_mb = 2048
|
||||
executable = 'file-setup-pipeline'
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup-pipeline:v{job.service_version}' # noqa
|
||||
elif job.service == 'tesseract-ocr-pipeline':
|
||||
mem_mb = 1024
|
||||
n_cores = 4
|
||||
executable = 'ocr'
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:v{job.service_version}' # noqa
|
||||
elif job.service == 'spacy-nlp':
|
||||
executable = 'tesseract-ocr-pipeline'
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}tesseract-ocr-pipeline:v{job.service_version}' # noqa
|
||||
elif job.service == 'transkribus-htr-pipeline':
|
||||
mem_mb = 1024
|
||||
n_cores = 4
|
||||
executable = 'transkribus-htr-pipeline'
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}transkribus-htr-pipeline:v{job.service_version}' # noqa
|
||||
elif job.service == 'spacy-nlp-pipeline':
|
||||
mem_mb = 1024
|
||||
n_cores = 1
|
||||
executable = 'nlp'
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:v{job.service_version}' # noqa
|
||||
executable = 'spacy-nlp-pipeline'
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}spacy-nlp-pipeline:v{job.service_version}' # noqa
|
||||
''' ## Command ## '''
|
||||
command = f'{executable} -i /input -o /output'
|
||||
command += ' --log-dir /logs'
|
||||
command += f' --mem-mb {mem_mb}'
|
||||
command += f' --n-cores {n_cores}'
|
||||
service_args = json.loads(job.service_args)
|
||||
if job.service == 'spacy-nlp':
|
||||
command += f' -m {service_args["model"]}'
|
||||
if 'encoding_detection' in service_args and service_args['encoding_detection']: # noqa
|
||||
if job.service == 'spacy-nlp-pipeline':
|
||||
command += f' -m {job.service_args["model"]}'
|
||||
if 'encoding_detection' in job.service_args and job.service_args['encoding_detection']: # noqa
|
||||
command += ' --check-encoding'
|
||||
elif job.service == 'tesseract-ocr':
|
||||
command += f' -m {service_args["model"]}'
|
||||
if 'binarization' in service_args and service_args['binarization']:
|
||||
elif job.service == 'tesseract-ocr-pipeline':
|
||||
command += f' -m {job.service_args["model"]}'
|
||||
if 'binarization' in job.service_args and job.service_args['binarization']:
|
||||
command += ' --binarize'
|
||||
elif job.service == 'transkribus-htr-pipeline':
|
||||
command += f' -m {job.service_args["model"]}'
|
||||
readcoop_username = current_app.config.get('NOPAQUE_READCOOP_USERNAME')
|
||||
command += f' --readcoop-username "{readcoop_username}"'
|
||||
readcoop_password = current_app.config.get('NOPAQUE_READCOOP_PASSWORD')
|
||||
command += f' --readcoop-password "{readcoop_password}"'
|
||||
if 'binarization' in job.service_args and job.service_args['binarization']:
|
||||
command += ' --binarize'
|
||||
''' ## Constraints ## '''
|
||||
constraints = ['node.role==worker']
|
||||
@ -63,16 +75,15 @@ class CheckJobsMixin:
|
||||
mounts = []
|
||||
''' ### Input mount(s) ### '''
|
||||
input_mount_target_base = '/input'
|
||||
if job.service == 'file-setup':
|
||||
if job.service == 'file-setup-pipeline':
|
||||
input_mount_target_base += f'/{secure_filename(job.title)}'
|
||||
for job_input in job.inputs:
|
||||
input_mount_source = job_input.path
|
||||
input_mount_target = f'/{input_mount_target_base}/{job_input.filename}' # noqa
|
||||
input_mount_target = f'{input_mount_target_base}/{job_input.filename}' # noqa
|
||||
input_mount = f'{input_mount_source}:{input_mount_target}:ro'
|
||||
mounts.append(input_mount)
|
||||
if job.service == 'tesseract-ocr':
|
||||
service_args = json.loads(job.service_args)
|
||||
model = TesseractOCRModel.query.get(service_args['model'])
|
||||
if job.service == 'tesseract-ocr-pipeline':
|
||||
model = TesseractOCRModel.query.get(job.service_args['model'])
|
||||
if model is None:
|
||||
job.status = JobStatus.FAILED
|
||||
return
|
||||
@ -114,7 +125,8 @@ class CheckJobsMixin:
|
||||
mounts=mounts,
|
||||
name=name,
|
||||
resources=resources,
|
||||
restart_policy=restart_policy
|
||||
restart_policy=restart_policy,
|
||||
user='1000:1000'
|
||||
)
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
|
@ -36,14 +36,23 @@ class IntEnumColumn(db.TypeDecorator):
|
||||
return self.enum_type(value)
|
||||
|
||||
|
||||
class Permission(IntEnum):
|
||||
'''
|
||||
Defines User permissions as integers by the power of 2. User permission
|
||||
can be evaluated using the bitwise operator &.
|
||||
'''
|
||||
ADMINISTRATE = 4
|
||||
CONTRIBUTE = 2
|
||||
USE_API = 1
|
||||
class ContainerColumn(db.TypeDecorator):
    '''
    String column type that stores a container as JSON formatted text.

    container_type is the Python type (e.g. list or dict) the column is
    allowed to hold. All further arguments are passed on to the underlying
    db.String type.
    '''
    impl = db.String

    def __init__(self, container_type, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.container_type = container_type

    def process_bind_param(self, value, dialect):
        '''
        Serialize value for the database.

        Accepts an instance of container_type or a JSON formatted string
        that decodes to one. None is passed through so that it is stored as
        NULL. Raises TypeError for every other value.
        '''
        if value is None:
            return None
        if isinstance(value, self.container_type):
            return json.dumps(value)
        if isinstance(value, str) and isinstance(json.loads(value), self.container_type):  # noqa
            return value
        # The exception has to be raised; returning it would hand the
        # exception object to the database driver as the bind value.
        raise TypeError(
            f'Expected {self.container_type.__name__} or a JSON formatted '
            f'string of it, got {type(value).__name__}'
        )

    def process_result_value(self, value, dialect):
        ''' Decode the stored JSON text; NULL is returned as None. '''
        if value is None:
            return None
        return json.loads(value)
|
||||
|
||||
|
||||
class FileMixin:
|
||||
@ -61,6 +70,16 @@ class FileMixin:
|
||||
}
|
||||
|
||||
|
||||
class Permission(IntEnum):
    '''
    User permissions, each encoded as a distinct power of 2. Because every
    permission occupies its own bit, a combination of permissions fits into
    one integer and a single permission can be checked with the bitwise
    operator &.
    '''
    ADMINISTRATE = 1  # bit 0
    CONTRIBUTE = 2  # bit 1
    USE_API = 4  # bit 2
|
||||
|
||||
|
||||
class Role(HashidMixin, db.Model):
|
||||
__tablename__ = 'roles'
|
||||
# Primary key
|
||||
@ -102,7 +121,7 @@ class Role(HashidMixin, db.Model):
|
||||
'permissions': self.permissions
|
||||
}
|
||||
if relationships:
|
||||
dict_role['users']: {
|
||||
dict_role['users'] = {
|
||||
x.to_dict(backrefs=False, relationships=True)
|
||||
for x in self.users
|
||||
}
|
||||
@ -339,10 +358,11 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
|
||||
# Foreign keys
|
||||
user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
|
||||
# Fields
|
||||
compatible_service_versions = db.Column(db.String(255))
|
||||
compatible_service_versions = db.Column(ContainerColumn(list, 255))
|
||||
description = db.Column(db.String(255))
|
||||
publisher = db.Column(db.String(128))
|
||||
publishing_year = db.Column(db.Integer)
|
||||
shared = db.Column(db.Boolean, default=False)
|
||||
title = db.Column(db.String(64))
|
||||
version = db.Column(db.String(16))
|
||||
# Backrefs: user: User
|
||||
@ -356,11 +376,10 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
|
||||
)
|
||||
|
||||
def to_dict(self, backrefs=False, relationships=False):
|
||||
compatible_service_versions = json.loads(self.compatible_service_versions) # noqa
|
||||
dict_tesseract_ocr_model = {
|
||||
'id': self.hashid,
|
||||
'user_id': self.user.hashid,
|
||||
'compatible_service_versions': compatible_service_versions,
|
||||
'compatible_service_versions': self.compatible_service_versions,
|
||||
'description': self.description,
|
||||
'publisher': self.publisher,
|
||||
'publishing_year': self.publishing_year,
|
||||
@ -384,31 +403,39 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
|
||||
with open(defaults_file, 'r') as f:
|
||||
defaults = yaml.safe_load(f)
|
||||
for m in defaults:
|
||||
if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None: # noqa
|
||||
model = TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() # noqa
|
||||
if model is not None:
|
||||
model.compatible_service_versions = m['compatible_service_versions']
|
||||
model.description = m['description']
|
||||
model.publisher = m['publisher']
|
||||
model.publishing_year = m['publishing_year']
|
||||
model.title = m['title']
|
||||
model.version = m['version']
|
||||
continue
|
||||
tesseract_ocr_model = TesseractOCRModel(
|
||||
compatible_service_versions=json.dumps(m['compatible_service_versions']), # noqa
|
||||
model = TesseractOCRModel(
|
||||
compatible_service_versions=m['compatible_service_versions'],
|
||||
description=m['description'],
|
||||
publisher=m['publisher'],
|
||||
publishing_year=m['publishing_year'],
|
||||
shared=True,
|
||||
title=m['title'],
|
||||
user=user,
|
||||
version=m['version']
|
||||
)
|
||||
db.session.add(tesseract_ocr_model)
|
||||
db.session.flush(objects=[tesseract_ocr_model])
|
||||
db.session.refresh(tesseract_ocr_model)
|
||||
tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata' # noqa
|
||||
db.session.add(model)
|
||||
db.session.flush(objects=[model])
|
||||
db.session.refresh(model)
|
||||
model.filename = f'{model.id}.traineddata'
|
||||
r = requests.get(m['url'], stream=True)
|
||||
pbar = tqdm(
|
||||
desc=f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})', # noqa
|
||||
desc=f'{model.title} ({model.filename})',
|
||||
unit="B",
|
||||
unit_scale=True,
|
||||
unit_divisor=1024,
|
||||
total=int(r.headers['Content-Length'])
|
||||
)
|
||||
pbar.clear()
|
||||
with open(tesseract_ocr_model.path, 'wb') as f:
|
||||
with open(model.path, 'wb') as f:
|
||||
for chunk in r.iter_content(chunk_size=1024):
|
||||
if chunk: # filter out keep-alive new chunks
|
||||
pbar.update(len(chunk))
|
||||
@ -560,11 +587,7 @@ class Job(HashidMixin, db.Model):
|
||||
description = db.Column(db.String(255))
|
||||
end_date = db.Column(db.DateTime())
|
||||
service = db.Column(db.String(64))
|
||||
'''
|
||||
' Dictionary as JSON formatted string.
|
||||
' Example: {"binarization": True}
|
||||
'''
|
||||
service_args = db.Column(db.String(255))
|
||||
service_args = db.Column(ContainerColumn(dict, 255))
|
||||
service_version = db.Column(db.String(16))
|
||||
status = db.Column(
|
||||
IntEnumColumn(JobStatus),
|
||||
@ -643,10 +666,6 @@ class Job(HashidMixin, db.Model):
|
||||
self.status = JobStatus.SUBMITTED
|
||||
|
||||
def to_dict(self, backrefs=False, relationships=False):
|
||||
service_args = json.loads(self.service_args)
|
||||
if self.service == 'tesseract-ocr' and 'model' in service_args:
|
||||
tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model']) # noqa
|
||||
service_args['model'] = tesseract_ocr_pipeline_model.title
|
||||
dict_job = {
|
||||
'id': self.hashid,
|
||||
'user_id': self.user.hashid,
|
||||
@ -654,7 +673,7 @@ class Job(HashidMixin, db.Model):
|
||||
'description': self.description,
|
||||
'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z', # noqa
|
||||
'service': self.service,
|
||||
'service_args': service_args,
|
||||
'service_args': self.service_args,
|
||||
'service_version': self.service_version,
|
||||
'status': self.status.name,
|
||||
'title': self.title,
|
||||
@ -798,7 +817,6 @@ class Corpus(HashidMixin, db.Model):
|
||||
title = db.Column(db.String(32))
|
||||
num_analysis_sessions = db.Column(db.Integer, default=0)
|
||||
num_tokens = db.Column(db.Integer, default=0)
|
||||
archive_file = db.Column(db.String(255))
|
||||
# Backrefs: user: User
|
||||
# Relationships
|
||||
files = db.relationship(
|
||||
|
@ -1,5 +1,7 @@
|
||||
from app.models import TesseractOCRModel
|
||||
from app.models import Job, TesseractOCRModel
|
||||
from flask_login import current_user
|
||||
from flask_wtf import FlaskForm
|
||||
from flask_wtf.file import FileField, FileAllowed, FileRequired
|
||||
from wtforms import (
|
||||
BooleanField,
|
||||
MultipleFileField,
|
||||
@ -8,110 +10,143 @@ from wtforms import (
|
||||
SubmitField,
|
||||
ValidationError
|
||||
)
|
||||
from wtforms.validators import DataRequired, Length
|
||||
from wtforms.validators import DataRequired, InputRequired, Length
|
||||
from . import SERVICES
|
||||
|
||||
|
||||
class AddJobForm(FlaskForm):
|
||||
description = StringField('Description', validators=[DataRequired(), Length(1, 255)]) # noqa
|
||||
description = StringField('Description', validators=[InputRequired()]) # noqa
|
||||
submit = SubmitField()
|
||||
title = StringField('Title', validators=[DataRequired(), Length(1, 32)])
|
||||
title = StringField('Title', validators=[InputRequired()])
|
||||
version = SelectField('Version', validators=[DataRequired()])
|
||||
|
||||
def validate_description(self, field):
|
||||
max_length = Job.description.property.columns[0].type.length
|
||||
if len(field.data) > max_length:
|
||||
raise ValidationError(
|
||||
f'Description must be less than {max_length} characters'
|
||||
)
|
||||
|
||||
class AddSpacyNLPJobForm(AddJobForm):
|
||||
encoding_detection = BooleanField('Encoding detection')
|
||||
files = MultipleFileField('Files', validators=[DataRequired()])
|
||||
model = SelectField(
|
||||
'Model',
|
||||
choices=[('', 'Choose your option')],
|
||||
default='',
|
||||
validators=[DataRequired()]
|
||||
)
|
||||
def validate_title(self, field):
|
||||
max_length = Job.title.property.columns[0].type.length
|
||||
if len(field.data) > max_length:
|
||||
raise ValidationError(
|
||||
f'Title must be less than {max_length} characters'
|
||||
)
|
||||
|
||||
def validate_encoding_detection(self, field):
|
||||
service_info = SERVICES['spacy-nlp']['versions'][self.version.data]
|
||||
if field.data and 'encoding_detection' not in service_info['methods']:
|
||||
raise ValidationError('Encoding detection is not available')
|
||||
|
||||
def validate_files(form, field):
|
||||
valid_extensions = ['.txt']
|
||||
for file in field.data:
|
||||
if not file.filename.lower().endswith(tuple(valid_extensions)):
|
||||
raise ValidationError(
|
||||
'File does not have an approved extension: '
|
||||
'/'.join(valid_extensions)
|
||||
)
|
||||
class AddFileSetupPipelineJobForm(AddJobForm):
|
||||
images = MultipleFileField('File(s)', validators=[DataRequired()])
|
||||
|
||||
def validate_images(form, field):
|
||||
valid_mimetypes = ['image/jpeg', 'image/png', 'image/tiff']
|
||||
for image in field.data:
|
||||
if image.mimetype not in valid_mimetypes:
|
||||
raise ValidationError('JPEG, PNG and TIFF files only!')
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version']) # noqa
|
||||
service_manifest = SERVICES['file-setup-pipeline']
|
||||
version = kwargs.pop('version', service_manifest['latest_version'])
|
||||
super().__init__(*args, **kwargs)
|
||||
service_info = SERVICES['spacy-nlp']['versions'][version]
|
||||
if 'encoding_detection' not in service_info['methods']:
|
||||
self.encoding_detection.render_kw = {'disabled': True}
|
||||
self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa
|
||||
self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']] # noqa
|
||||
self.version.default = version
|
||||
self.version.choices = [(x, x) for x in service_manifest['versions']]
|
||||
self.version.data = version
|
||||
self.version.default = service_manifest['latest_version']
|
||||
|
||||
|
||||
class AddTesseractOCRJobForm(AddJobForm):
|
||||
class AddTesseractOCRPipelineJobForm(AddJobForm):
|
||||
binarization = BooleanField('Binarization')
|
||||
files = MultipleFileField('Files', validators=[DataRequired()])
|
||||
model = SelectField(
|
||||
'Model',
|
||||
choices=[('', 'Choose your option')],
|
||||
default='',
|
||||
validators=[DataRequired()]
|
||||
)
|
||||
pdf = FileField('File', validators=[FileRequired()])
|
||||
model = SelectField('Model', validators=[DataRequired()])
|
||||
|
||||
def validate_binarization(self, field):
|
||||
service_info = SERVICES['tesseract-ocr']['versions'][self.version.data]
|
||||
service_info = SERVICES['tesseract-ocr-pipeline']['versions'][self.version.data]
|
||||
if field.data and 'binarization' not in service_info['methods']:
|
||||
raise ValidationError('Binarization is not available')
|
||||
|
||||
def validate_files(self, field):
|
||||
valid_extensions = ['.pdf']
|
||||
for file in field.data:
|
||||
if not file.filename.lower().endswith(tuple(valid_extensions)):
|
||||
raise ValidationError(
|
||||
'File does not have an approved extension: '
|
||||
'/'.join(valid_extensions)
|
||||
)
|
||||
def validate_pdf(self, field):
|
||||
if field.data.mimetype != 'application/pdf':
|
||||
raise ValidationError('PDF files only!')
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version']) # noqa
|
||||
service_manifest = SERVICES['tesseract-ocr-pipeline']
|
||||
version = kwargs.pop('version', service_manifest['latest_version'])
|
||||
super().__init__(*args, **kwargs)
|
||||
service_info = SERVICES['tesseract-ocr']['versions'][version]
|
||||
service_info = service_manifest['versions'][version]
|
||||
if 'binarization' not in service_info['methods']:
|
||||
self.binarization.render_kw = {'disabled': True}
|
||||
self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()] # noqa
|
||||
self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']] # noqa
|
||||
compatible_models = [
|
||||
x for x in TesseractOCRModel.query.filter_by(shared=True).all()
|
||||
if version in x.compatible_service_versions
|
||||
]
|
||||
compatible_models += [
|
||||
x for x in TesseractOCRModel.query.filter_by(shared=False, user=current_user).all()
|
||||
if version in x.compatible_service_versions
|
||||
]
|
||||
self.model.choices = [('', 'Choose your option')]
|
||||
self.model.choices += [(x.hashid, x.title) for x in compatible_models]
|
||||
self.model.default = ''
|
||||
self.version.choices = [(x, x) for x in service_manifest['versions']]
|
||||
self.version.data = version
|
||||
self.version.default = SERVICES['tesseract-ocr']['latest_version']
|
||||
self.version.default = service_manifest['latest_version']
|
||||
|
||||
|
||||
class AddFileSetupJobForm(AddJobForm):
|
||||
files = MultipleFileField('Files', validators=[DataRequired()])
|
||||
class AddTranskribusHTRPipelineJobForm(AddJobForm):
|
||||
binarization = BooleanField('Binarization')
|
||||
pdf = FileField('File', validators=[FileRequired()])
|
||||
model = SelectField('Model', validators=[DataRequired()])
|
||||
|
||||
def validate_files(form, field):
|
||||
valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif']
|
||||
for file in field.data:
|
||||
if not file.filename.lower().endswith(tuple(valid_extensions)):
|
||||
raise ValidationError(
|
||||
'File does not have an approved extension: '
|
||||
'/'.join(valid_extensions)
|
||||
)
|
||||
def validate_binarization(self, field):
|
||||
service_info = SERVICES['transkribus-htr-pipeline']['versions'][self.version.data]
|
||||
if field.data and 'binarization' not in service_info['methods']:
|
||||
raise ValidationError('Binarization is not available')
|
||||
|
||||
def validate_pdf(self, field):
|
||||
if field.data.mimetype != 'application/pdf':
|
||||
raise ValidationError('PDF files only!')
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
version = kwargs.pop('version', SERVICES['file-setup']['latest_version']) # noqa
|
||||
service_manifest = SERVICES['transkribus-htr-pipeline']
|
||||
version = kwargs.pop('version', service_manifest['latest_version'])
|
||||
super().__init__(*args, **kwargs)
|
||||
self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']] # noqa
|
||||
service_info = service_manifest['versions'][version]
|
||||
if 'binarization' not in service_info['methods']:
|
||||
self.binarization.render_kw = {'disabled': True}
|
||||
self.model.choices = [('', 'Choose your option')]
|
||||
self.model.choices += [
|
||||
('37569', 'Tim Model'),
|
||||
('29539', 'UCL–University of Toronto #7')
|
||||
]
|
||||
self.model.default = ''
|
||||
self.version.choices = [(x, x) for x in service_manifest['versions']]
|
||||
self.version.data = version
|
||||
self.version.default = SERVICES['file-setup']['latest_version']
|
||||
self.version.default = service_manifest['latest_version']
|
||||
|
||||
|
||||
AddJobForms = {
|
||||
'file-setup': AddFileSetupJobForm,
|
||||
'tesseract-ocr': AddTesseractOCRJobForm,
|
||||
'spacy-nlp': AddSpacyNLPJobForm
|
||||
}
|
||||
class AddSpacyNLPPipelineJobForm(AddJobForm):
    """Form for submitting a job to the spaCy NLP pipeline service.

    Accepts an optional ``version`` keyword argument (defaults to the
    service manifest's ``latest_version``) that selects which entry of the
    manifest's ``versions`` mapping is used to populate the model choices
    and to decide whether encoding detection is offered.
    """

    encoding_detection = BooleanField('Encoding detection')
    txt = FileField('File', validators=[FileRequired()])
    model = SelectField('Model', validators=[DataRequired()])

    def validate_encoding_detection(self, field):
        """Reject a checked box if the selected version lacks the method."""
        manifest = SERVICES['spacy-nlp-pipeline']
        version_info = manifest['versions'][self.version.data]
        if not field.data:
            return
        if 'encoding_detection' in version_info['methods']:
            return
        raise ValidationError('Encoding detection is not available!')

    def validate_txt(form, field):
        """Only accept uploads whose reported mimetype is plain text."""
        if field.data.mimetype == 'text/plain':
            return
        raise ValidationError('Plain text files only!')

    def __init__(self, *args, **kwargs):
        """Build the form and fill in version dependent choices."""
        manifest = SERVICES['spacy-nlp-pipeline']
        version = kwargs.pop('version', manifest['latest_version'])
        super().__init__(*args, **kwargs)
        version_info = manifest['versions'][version]
        # Grey out the checkbox when this version cannot detect encodings.
        if 'encoding_detection' not in version_info['methods']:
            self.encoding_detection.render_kw = {'disabled': True}
        # Empty placeholder first, then one (key, label) pair per model.
        self.model.choices = [
            ('', 'Choose your option'),
            *version_info['models'].items()
        ]
        self.model.default = ''
        self.version.choices = [(v, v) for v in manifest['versions']]
        # The version field always reflects the version the form was built
        # for, regardless of what was submitted.
        self.version.data = version
        self.version.default = version
|
||||
|
@ -13,47 +13,33 @@ from flask_login import current_user, login_required
|
||||
from werkzeug.utils import secure_filename
|
||||
from . import bp
|
||||
from . import SERVICES
|
||||
from .forms import AddJobForms
|
||||
from .forms import (
|
||||
AddFileSetupPipelineJobForm,
|
||||
AddTesseractOCRPipelineJobForm,
|
||||
AddTranskribusHTRPipelineJobForm,
|
||||
AddSpacyNLPPipelineJobForm
|
||||
)
|
||||
import json
|
||||
|
||||
|
||||
@bp.route('/corpus-analysis')
|
||||
@bp.route('/file-setup-pipeline', methods=['GET', 'POST'])
|
||||
@login_required
|
||||
def corpus_analysis():
|
||||
return render_template(
|
||||
'services/corpus_analysis.html.j2',
|
||||
title='Corpus analysis'
|
||||
)
|
||||
|
||||
|
||||
@bp.route('/<service>', methods=['GET', 'POST'])
|
||||
@login_required
|
||||
def service(service):
|
||||
# Check if the requested service exist
|
||||
if service not in SERVICES or service not in AddJobForms:
|
||||
def file_setup_pipeline():
|
||||
service = 'file-setup-pipeline'
|
||||
service_manifest = SERVICES[service]
|
||||
version = request.args.get('version', service_manifest['latest_version'])
|
||||
if version not in service_manifest['versions']:
|
||||
abort(404)
|
||||
version = request.args.get('version', SERVICES[service]['latest_version'])
|
||||
if version not in SERVICES[service]['versions']:
|
||||
abort(404)
|
||||
form = AddJobForms[service](prefix='add-job-form', version=version)
|
||||
title = SERVICES[service]['name']
|
||||
form = AddFileSetupPipelineJobForm(prefix='add-job-form', version=version)
|
||||
if form.is_submitted():
|
||||
if not form.validate():
|
||||
return make_response(form.errors, 400)
|
||||
service_args = {}
|
||||
if service == 'spacy-nlp':
|
||||
service_args['model'] = form.model.data
|
||||
if form.encoding_detection.data:
|
||||
service_args['encoding_detection'] = True
|
||||
if service == 'tesseract-ocr':
|
||||
service_args['model'] = hashids.decode(form.model.data)
|
||||
if form.binarization.data:
|
||||
service_args['binarization'] = True
|
||||
job = Job(
|
||||
user=current_user,
|
||||
description=form.description.data,
|
||||
service=service,
|
||||
service_args=json.dumps(service_args),
|
||||
service_args=service_args,
|
||||
service_version=form.version.data,
|
||||
title=form.title.data
|
||||
)
|
||||
@ -67,18 +53,17 @@ def service(service):
|
||||
db.session.rollback()
|
||||
flash('Internal Server Error', 'error')
|
||||
return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa
|
||||
for file in form.files.data:
|
||||
filename = secure_filename(file.filename)
|
||||
for image_file in form.images.data:
|
||||
job_input = JobInput(
|
||||
filename=filename,
|
||||
filename=secure_filename(image_file.filename),
|
||||
job=job,
|
||||
mimetype=file.mimetype
|
||||
mimetype=image_file.mimetype
|
||||
)
|
||||
db.session.add(job_input)
|
||||
db.session.flush(objects=[job_input])
|
||||
db.session.refresh(job_input)
|
||||
try:
|
||||
file.save(job_input.path)
|
||||
image_file.save(job_input.path)
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
@ -91,5 +76,196 @@ def service(service):
|
||||
return render_template(
|
||||
f'services/{service.replace("-", "_")}.html.j2',
|
||||
form=form,
|
||||
title=title
|
||||
title=service_manifest['name']
|
||||
)
|
||||
|
||||
|
||||
@bp.route('/tesseract-ocr-pipeline', methods=['GET', 'POST'])
@login_required
def tesseract_ocr_pipeline():
    """Render the Tesseract OCR Pipeline page and handle job submissions.

    Without a submitted form the service template is rendered. A submitted
    form is validated first: validation errors are returned with status
    400. For a valid form a ``Job`` with one PDF ``JobInput`` is created,
    its status is set to ``JobStatus.SUBMITTED`` and a JSON body holding
    the job page URL is returned with status 201. If creating the job
    directories or saving the upload raises ``OSError``, the database
    session is rolled back and a JSON body pointing back to this page is
    returned with status 500.

    Aborts with 404 if the ``version`` query argument is not listed in the
    service manifest.
    """
    service = 'tesseract-ocr-pipeline'
    service_manifest = SERVICES[service]
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddTesseractOCRPipelineJobForm(
        prefix='add-job-form',
        version=version
    )
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        # Error responses must point at this view's own endpoint; there is
        # no generic '.service' endpoint to build a URL for.
        error_redirect_url = url_for('.tesseract_ocr_pipeline')
        service_args = {}
        service_args['model'] = hashids.decode(form.model.data)
        if form.binarization.data:
            service_args['binarization'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        # Flush and refresh before the job is used to create directories.
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': error_redirect_url}, 500)
        job_input = JobInput(
            filename=secure_filename(form.pdf.data.filename),
            job=job,
            mimetype=form.pdf.data.mimetype
        )
        db.session.add(job_input)
        # Flush and refresh before job_input.path is used as save target.
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.pdf.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': error_redirect_url}, 500)
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response(
            {'redirect_url': url_for('jobs.job', job_id=job.id)},
            201
        )
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
|
||||
|
||||
|
||||
@bp.route('/transkribus-htr-pipeline', methods=['GET', 'POST'])
@login_required
def transkribus_htr_pipeline():
    """Render the Transkribus HTR Pipeline page and handle job submissions.

    The view is only available when the ``NOPAQUE_TRANSKRIBUS_ENABLED``
    config value is truthy; otherwise it aborts with 404. It also aborts
    with 404 if the ``version`` query argument is not listed in the
    service manifest.

    Without a submitted form the service template is rendered. A submitted
    form is validated first: validation errors are returned with status
    400. For a valid form a ``Job`` with one PDF ``JobInput`` is created,
    its status is set to ``JobStatus.SUBMITTED`` and a JSON body holding
    the job page URL is returned with status 201. If creating the job
    directories or saving the upload raises ``OSError``, the database
    session is rolled back and a JSON body pointing back to this page is
    returned with status 500.
    """
    if not current_app.config.get('NOPAQUE_TRANSKRIBUS_ENABLED'):
        abort(404)
    service = 'transkribus-htr-pipeline'
    service_manifest = SERVICES[service]
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddTranskribusHTRPipelineJobForm(
        prefix='add-job-form',
        version=version
    )
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        # Error responses must point at this view's own endpoint; there is
        # no generic '.service' endpoint to build a URL for.
        error_redirect_url = url_for('.transkribus_htr_pipeline')
        service_args = {}
        service_args['model'] = form.model.data
        if form.binarization.data:
            service_args['binarization'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        # Flush and refresh before the job is used to create directories.
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': error_redirect_url}, 500)
        job_input = JobInput(
            filename=secure_filename(form.pdf.data.filename),
            job=job,
            mimetype=form.pdf.data.mimetype
        )
        db.session.add(job_input)
        # Flush and refresh before job_input.path is used as save target.
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.pdf.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': error_redirect_url}, 500)
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response(
            {'redirect_url': url_for('jobs.job', job_id=job.id)},
            201
        )
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
|
||||
|
||||
|
||||
@bp.route('/spacy-nlp-pipeline', methods=['GET', 'POST'])
@login_required
def spacy_nlp_pipeline():
    """Render the spaCy NLP Pipeline page and handle job submissions.

    Without a submitted form the service template is rendered. A submitted
    form is validated first: validation errors are returned with status
    400. For a valid form a ``Job`` with one text ``JobInput`` is created,
    its status is set to ``JobStatus.SUBMITTED`` and a JSON body holding
    the job page URL is returned with status 201. If creating the job
    directories or saving the upload raises ``OSError``, the database
    session is rolled back and a JSON body pointing back to this page is
    returned with status 500.

    Aborts with 404 if the ``version`` query argument is not listed in the
    service manifest.
    """
    service = 'spacy-nlp-pipeline'
    service_manifest = SERVICES[service]
    version = request.args.get('version', service_manifest['latest_version'])
    if version not in service_manifest['versions']:
        abort(404)
    form = AddSpacyNLPPipelineJobForm(prefix='add-job-form', version=version)
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
        # Error responses must point at this view's own endpoint; there is
        # no generic '.service' endpoint to build a URL for.
        error_redirect_url = url_for('.spacy_nlp_pipeline')
        service_args = {}
        service_args['model'] = form.model.data
        if form.encoding_detection.data:
            service_args['encoding_detection'] = True
        job = Job(
            user=current_user,
            description=form.description.data,
            service=service,
            service_args=service_args,
            service_version=form.version.data,
            title=form.title.data
        )
        db.session.add(job)
        # Flush and refresh before the job is used to create directories.
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            job.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': error_redirect_url}, 500)
        job_input = JobInput(
            filename=secure_filename(form.txt.data.filename),
            job=job,
            mimetype=form.txt.data.mimetype
        )
        db.session.add(job_input)
        # Flush and refresh before job_input.path is used as save target.
        db.session.flush(objects=[job_input])
        db.session.refresh(job_input)
        try:
            form.txt.data.save(job_input.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': error_redirect_url}, 500)
        job.status = JobStatus.SUBMITTED
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
        return make_response(
            {'redirect_url': url_for('jobs.job', job_id=job.id)},
            201
        )
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
        title=service_manifest['name']
    )
|
||||
|
||||
|
||||
@bp.route('/corpus-analysis')
@login_required
def corpus_analysis():
    """Render the corpus analysis service page."""
    template_name = 'services/corpus_analysis.html.j2'
    page_title = 'Corpus analysis'
    return render_template(template_name, title=page_title)
|
@ -1,38 +1,70 @@
|
||||
# TODO: This could also be done via GitLab/GitHub APIs
|
||||
#file-setup-pipeline:
|
||||
file-setup:
|
||||
file-setup-pipeline:
|
||||
name: 'File setup pipeline'
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
latest_version: '0.1.0'
|
||||
versions:
|
||||
0.1.0:
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0'
|
||||
#spacy-nlp-pipeline:
|
||||
spacy-nlp:
|
||||
name: 'spaCy NLP'
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline/-/releases/v0.1.0'
|
||||
tesseract-ocr-pipeline:
|
||||
name: 'Tesseract OCR Pipeline'
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
latest_version: '0.1.4'
|
||||
versions:
|
||||
0.1.0:
|
||||
methods:
|
||||
- 'binarization'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.0'
|
||||
0.1.1:
|
||||
methods:
|
||||
- 'binarization'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.1'
|
||||
0.1.2:
|
||||
methods:
|
||||
- 'binarization'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.2'
|
||||
0.1.3:
|
||||
methods:
|
||||
- 'binarization'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.3'
|
||||
0.1.4:
|
||||
methods:
|
||||
- 'binarization'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.4'
|
||||
transkribus-htr-pipeline:
|
||||
name: 'Transkribus HTR Pipeline'
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
latest_version: '0.1.0'
|
||||
versions:
|
||||
0.1.0:
|
||||
methods:
|
||||
- 'binarization'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/transkribus-htr-pipeline/-/releases/v0.1.0'
|
||||
spacy-nlp-pipeline:
|
||||
name: 'spaCy NLP Pipeline'
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
latest_version: '0.1.0'
|
||||
versions:
|
||||
0.1.0:
|
||||
methods:
|
||||
- 'encoding_detection'
|
||||
models:
|
||||
ca: 'Catalan'
|
||||
de: 'German'
|
||||
el: 'Greek'
|
||||
en: 'English'
|
||||
es: 'Spanish'
|
||||
fr: 'French'
|
||||
it: 'Italian'
|
||||
pl: 'Polish'
|
||||
ru: 'Russian'
|
||||
zh: 'Chinese'
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0'
|
||||
#tesseract-ocr-pipeline:
|
||||
tesseract-ocr:
|
||||
name: 'Tesseract OCR'
|
||||
latest_version: '0.1.0'
|
||||
versions:
|
||||
0.1.0:
|
||||
methods:
|
||||
- 'binarization'
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0'
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.0'
|
@ -28,20 +28,25 @@ $color: (
|
||||
"darken": #6b3f89,
|
||||
"lighten": #ebe8f6
|
||||
),
|
||||
"file-setup": (
|
||||
"file-setup-pipeline": (
|
||||
"base": #d5dc95,
|
||||
"darken": #a1b300,
|
||||
"lighten": #f2f3e1
|
||||
),
|
||||
"spacy-nlp": (
|
||||
"spacy-nlp-pipeline": (
|
||||
"base": #98acd2,
|
||||
"darken": #0064a3,
|
||||
"lighten": #e5e8f5
|
||||
),
|
||||
"tesseract-ocr": (
|
||||
"tesseract-ocr-pipeline": (
|
||||
"base": #a9d8c8,
|
||||
"darken": #00a58b,
|
||||
"lighten": #e7f4f1
|
||||
),
|
||||
"transkribus-htr-pipeline": (
|
||||
"base": #607d8b,
|
||||
"darken": #37474f,
|
||||
"lighten": #cfd8dc
|
||||
)
|
||||
),
|
||||
"status": (
|
||||
|
@ -43,9 +43,10 @@ h1 .nopaque-icons, h2 .nopaque-icons, h3 .nopaque-icons, h4 .nopaque-icons, .tab
|
||||
.job-status-text {text-transform: lowercase;}
|
||||
.job-status-text[data-job-status]:empty:before {content: attr(data-job-status);}
|
||||
|
||||
.nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";}
|
||||
.nopaque-icons.service-icon[data-service="tesseract-ocr"]:empty:before {content: "F";}
|
||||
.nopaque-icons.service-icon[data-service="spacy-nlp"]:empty:before {content: "G";}
|
||||
.nopaque-icons.service-icon[data-service="file-setup-pipeline"]:empty:before {content: "E";}
|
||||
.nopaque-icons.service-icon[data-service="tesseract-ocr-pipeline"]:empty:before {content: "F";}
|
||||
.nopaque-icons.service-icon[data-service="transkribus-htr-pipeline"]:empty:before {content: "F";}
|
||||
.nopaque-icons.service-icon[data-service="spacy-nlp-pipeline"]:empty:before {content: "G";}
|
||||
.nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";}
|
||||
|
||||
.hoverable {cursor: pointer;}
|
||||
|
@ -3,11 +3,13 @@
|
||||
<h2>Roadmap</h2>
|
||||
<p>The roadmap guides you through nopaque's workflow! If you have the necessary input file formats, you can directly jump into the corresponding process. If not, you can use the roadmap to jump right to the preceding process.</p>
|
||||
<ul class="tabs tabs-fixed-width">
|
||||
<li class="tab"><a{%if request.path == url_for('services.service', service='file-setup') %} class="active"{% endif %} href="{{ url_for('services.service', service='file-setup') }}" target="_self">File setup</a></li>
|
||||
<li class="tab"><a{%if request.path == url_for('services.file_setup_pipeline') %} class="active"{% endif %} href="{{ url_for('services.file_setup_pipeline') }}" target="_self">File setup</a></li>
|
||||
<li class="tab disabled"><i class="material-icons">navigate_next</i></li>
|
||||
<li class="tab"><a{%if request.path == url_for('services.service', service='tesseract-ocr') %} class="active"{% endif %} href="{{ url_for('services.service', service='tesseract-ocr') }}" target="_self">OCR</a></li>
|
||||
<li class="tab"><a{%if request.path == url_for('services.tesseract_ocr_pipeline') %} class="active"{% endif %} href="{{ url_for('services.tesseract_ocr_pipeline') }}" target="_self">OCR</a></li>
|
||||
<li class="tab disabled"><i class="material-icons">more_vert</i></li>
|
||||
<li class="tab"><a{%if request.path == url_for('services.transkribus_htr_pipeline') %} class="active"{% endif %} href="{{ url_for('services.transkribus_htr_pipeline') }}" target="_self">HTR</a></li>
|
||||
<li class="tab disabled"><i class="material-icons">navigate_next</i></li>
|
||||
<li class="tab"><a{%if request.path == url_for('services.service', service='spacy-nlp') %} class="active"{% endif %} href="{{ url_for('services.service', service='spacy-nlp') }}" target="_self">NLP</a></li>
|
||||
<li class="tab"><a{%if request.path == url_for('services.spacy_nlp_pipeline') %} class="active"{% endif %} href="{{ url_for('services.spacy_nlp_pipeline') }}" target="_self">NLP</a></li>
|
||||
<li class="tab disabled"><i class="material-icons">navigate_next</i></li>
|
||||
<li class="tab"><a{%if request.path == url_for('corpora.add_corpus') %} class="active"{% endif %} href="{{ url_for('corpora.add_corpus') }}" target="_self">Add corpus</a></li>
|
||||
<li class="tab disabled"><i class="material-icons">navigate_next</i></li>
|
||||
|
@ -14,10 +14,13 @@
|
||||
<li><a href="{{ url_for('main.dashboard', _anchor='jobs') }}" style="padding-left: 47px;"><i class="nopaque-icons">J</i>My Jobs</a></li>
|
||||
<li><div class="divider"></div></li>
|
||||
<li><a class="subheader">Processes & Services</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="file-setup" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='file-setup') }}"><i class="nopaque-icons service-icon" data-service="file-setup"></i>File setup</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="tesseract-ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='tesseract-ocr') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr"></i>OCR</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="spacy-nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='spacy-nlp') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp"></i>NLP</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='corpus-analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="file-setup-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.file_setup_pipeline') }}"><i class="nopaque-icons service-icon" data-service="file-setup-pipeline"></i>File setup</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="tesseract-ocr-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.tesseract_ocr_pipeline') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr-pipeline"></i>OCR</a></li>
|
||||
{% if config.NOPAQUE_TRANSKRIBUS_ENABLED %}
|
||||
<li class="service-color service-color-border border-darken" data-service="transkribus-htr-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.transkribus_htr_pipeline') }}"><i class="nopaque-icons service-icon" data-service="transkribus-htr-pipeline"></i>HTR</a></li>
|
||||
{% endif %}
|
||||
<li class="service-color service-color-border border-darken" data-service="spacy-nlp-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.spacy_nlp_pipeline') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp-pipeline"></i>NLP</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.corpus_analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li>
|
||||
<li><div class="divider"></div></li>
|
||||
<li><a class="subheader">Account</a></li>
|
||||
<li><a href="{{ url_for('settings.index') }}"><i class="material-icons">settings</i>Settings</a></li>
|
||||
|
@ -115,37 +115,37 @@
|
||||
<div class="col s12 m4">
|
||||
<div class="card-panel center-align hoverable">
|
||||
<br>
|
||||
<a href="{{ url_for('services.service', service='file-setup') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="file-setup"></i>
|
||||
<a href="{{ url_for('services.file_setup_pipeline') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="file-setup-pipeline"></i>
|
||||
</a>
|
||||
<br><br>
|
||||
<p class="service-color-text darken" data-service="file-setup"><b>File setup</b></p>
|
||||
<p class="service-color-text darken" data-service="file-setup-pipeline"><b>File setup</b></p>
|
||||
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.</p>
|
||||
<a href="{{ url_for('services.service', service='file-setup') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup">Create Job</a>
|
||||
<a href="{{ url_for('services.file_setup_pipeline') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup-pipeline">Create Job</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col s12 m4">
|
||||
<div class="card-panel center-align hoverable">
|
||||
<br>
|
||||
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr" style="font-size: 2.5rem;"></i>
|
||||
<a href="{{ url_for('services.tesseract_ocr_pipeline') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr-pipeline" style="font-size: 2.5rem;"></i>
|
||||
</a>
|
||||
<br><br>
|
||||
<p class="service-color-text darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
|
||||
<p class="service-color-text darken" data-service="tesseract-ocr-pipeline"><b>Optical Character Recognition</b></p>
|
||||
<p class="light">nopaque converts your image data – like photos or scans – into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.</p>
|
||||
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr">Create Job</a>
|
||||
<a href="{{ url_for('services.tesseract_ocr_pipeline') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr-pipeline">Create Job</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col s12 m4">
|
||||
<div class="card-panel center-align hoverable">
|
||||
<br>
|
||||
<a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp" style="font-size: 2.5rem;"></i>
|
||||
<a href="{{ url_for('services.spacy_nlp_pipeline') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp-pipeline" style="font-size: 2.5rem;"></i>
|
||||
</a>
|
||||
<br><br>
|
||||
<p class="service-color-text darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p>
|
||||
<p class="service-color-text darken" data-service="spacy-nlp-pipeline"><b>Natural Language Processing</b></p>
|
||||
<p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p>
|
||||
<a href="{{ url_for('services.service', service='spacy-nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="spacy-nlp">Create Job</a>
|
||||
<a href="{{ url_for('services.spacy_nlp_pipeline') }}" class="waves-effect waves-light btn service-color darken" data-service="spacy-nlp-pipeline">Create Job</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -35,9 +35,9 @@
|
||||
<p>Our source code is spread over multiple Git repositories.</p>
|
||||
<ul>
|
||||
<li>nopaque frontend: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque</a></li>
|
||||
<li>File setup: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup</a></li>
|
||||
<li>OCR: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr</a></li>
|
||||
<li>NLP: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp</a></li>
|
||||
<li>File Setup Pipeline: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline</a></li>
|
||||
<li>Tesseract OCR Pipeline: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline</a></li>
|
||||
<li>spaCy NLP Pipeline: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline</a></li>
|
||||
<li>Corpus analysis backend: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver</a></li>
|
||||
<li>Corpus analysis backend connector: <a href="https://github.com/Pevtrick/cqi-py" target="_blank">https://github.com/Pevtrick/cqi-py</a></li>
|
||||
</ul>
|
||||
|
@ -76,31 +76,31 @@
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<div class="row">
|
||||
<div class="col s12 m6 l3 center-align">
|
||||
<a href="{{ url_for('services.service', service='file-setup') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="file-setup"></i>
|
||||
<a href="{{ url_for('services.file_setup_pipeline') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="file-setup-pipeline"></i>
|
||||
</a>
|
||||
<br><br>
|
||||
<p class="service-color-text text-darken" data-service="file-setup"><b>File setup</b></p>
|
||||
<p class="service-color-text text-darken" data-service="file-setup-pipeline"><b>File setup</b></p>
|
||||
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p>
|
||||
</div>
|
||||
<div class="col s12 m6 l3 center-align">
|
||||
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
|
||||
<a href="{{ url_for('services.tesseract_ocr_pipeline') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr-pipeline"></i>
|
||||
</a>
|
||||
<br><br>
|
||||
<p class="service-color-text text-darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
|
||||
<p class="service-color-text text-darken" data-service="tesseract-ocr-pipeline"><b>Optical Character Recognition</b></p>
|
||||
<p class="light">nopaque converts your image data – like photos or scans – into text data through OCR making it machine readable. This step enables you to proceed with further computational analysis of your documents.</p>
|
||||
</div>
|
||||
<div class="col s12 m6 l3 center-align">
|
||||
<a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i>
|
||||
<a href="{{ url_for('services.spacy_nlp_pipeline') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp-pipeline"></i>
|
||||
</a>
|
||||
<br><br>
|
||||
<p class="service-color-text text-darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p>
|
||||
<p class="service-color-text text-darken" data-service="spacy-nlp-pipeline"><b>Natural Language Processing</b></p>
|
||||
<p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p>
|
||||
</div>
|
||||
<div class="col s12 m6 l3 center-align">
|
||||
<a href="{{ url_for('services.service', service='corpus_analysis') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<a href="{{ url_for('services.corpus_analysis') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="corpus-analysis"></i>
|
||||
</a>
|
||||
<br><br>
|
||||
|
@ -2,13 +2,15 @@
|
||||
<li class="tab disabled"><i class="material-icons">navigate_next</i></li>
|
||||
<li class="tab"><a href="{{ url_for('main.index', _anchor='services') }}" target="_self">Processes & Services</a></li>
|
||||
<li class="tab disabled"><i class="material-icons">navigate_next</i></li>
|
||||
{% if request.path == url_for('.service', service='corpus-analysis') %}
|
||||
<li class="tab"><a class="active" href="{{ url_for('.service', service='corpus-analysis') }}" target="_self">{{ title }}</a></li>
|
||||
{% elif request.path == url_for('.service', service='file-setup') %}
|
||||
<li class="tab"><a class="active" href="{{ url_for('.service', service='file-setup') }}" target="_self">{{ title }}</a></li>
|
||||
{% elif request.path == url_for('.service', service='nlp') %}
|
||||
<li class="tab"><a class="active" href="{{ url_for('.service', service='nlp') }}" target="_self">{{ title }}</a></li>
|
||||
{% elif request.path == url_for('.service', service='ocr') %}
|
||||
<li class="tab"><a class="active" href="{{ url_for('.service', service='ocr') }}" target="_self">{{ title }}</a></li>
|
||||
{% if request.path == url_for('.corpus_analysis') %}
|
||||
<li class="tab"><a class="active" href="{{ url_for('.corpus_analysis') }}" target="_self">{{ title }}</a></li>
|
||||
{% elif request.path == url_for('.file_setup_pipeline') %}
|
||||
<li class="tab"><a class="active" href="{{ url_for('.file_setup_pipeline') }}" target="_self">{{ title }}</a></li>
|
||||
{% elif request.path == url_for('.spacy_nlp_pipeline') %}
|
||||
<li class="tab"><a class="active" href="{{ url_for('.spacy_nlp_pipeline') }}" target="_self">{{ title }}</a></li>
|
||||
{% elif request.path == url_for('.tesseract_ocr_pipeline') %}
|
||||
<li class="tab"><a class="active" href="{{ url_for('.tesseract_ocr_pipeline') }}" target="_self">{{ title }}</a></li>
|
||||
{% elif request.path == url_for('.transkribus_htr_pipeline') %}
|
||||
<li class="tab"><a class="active" href="{{ url_for('.transkribus_htr_pipeline') }}" target="_self">{{ title }}</a></li>
|
||||
{% endif %}
|
||||
{% endset %}
|
||||
|
@ -2,7 +2,7 @@
|
||||
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
|
||||
{% import "materialize/wtf.html.j2" as wtf %}
|
||||
|
||||
{% block main_attribs %} class="service-scheme" data-service="file-setup"{% endblock main_attribs %}
|
||||
{% block main_attribs %} class="service-scheme" data-service="file-setup-pipeline"{% endblock main_attribs %}
|
||||
|
||||
{% block page_content %}
|
||||
<div class="container">
|
||||
@ -16,13 +16,13 @@
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="file-setup"></i>
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="file-setup-pipeline"></i>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col s12 m9 pull-m3">
|
||||
<div class="card service-color-border border-darken" data-service="file-setup" style="border-top: 10px solid;">
|
||||
<div class="card service-color-border border-darken" data-service="file-setup-pipeline" style="border-top: 10px solid;">
|
||||
<div class="card-content">
|
||||
<div class="row">
|
||||
<div class="col s12">
|
||||
@ -50,7 +50,7 @@
|
||||
{{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
|
||||
</div>
|
||||
<div class="col s12 l9">
|
||||
{{ wtf.render_field(form.files, accept='image/jpeg, image/png, image/tiff', placeholder='Choose your .jpeg, .png or .tiff files') }}
|
||||
{{ wtf.render_field(form.images, accept='image/jpeg, image/png, image/tiff', placeholder='Choose JPEG, PNG or TIFF files') }}
|
||||
</div>
|
||||
<div class="col s12 l3">
|
||||
{{ wtf.render_field(form.version, material_icon='apps') }}
|
@ -2,7 +2,7 @@
|
||||
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
|
||||
{% import "materialize/wtf.html.j2" as wtf %}
|
||||
|
||||
{% block main_attribs %} class="service-scheme" data-service="spacy-nlp"{% endblock main_attribs %}
|
||||
{% block main_attribs %} class="service-scheme" data-service="spacy-nlp-pipeline"{% endblock main_attribs %}
|
||||
|
||||
{% block page_content %}
|
||||
<div class="container">
|
||||
@ -16,13 +16,13 @@
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i>
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp-pipeline"></i>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col s12 m9 pull-m3">
|
||||
<div class="card service-color-border border-darken" data-service="spacy-nlp" style="border-top: 10px solid;">
|
||||
<div class="card service-color-border border-darken" data-service="spacy-nlp-pipeline" style="border-top: 10px solid;">
|
||||
<div class="card-content">
|
||||
<div class="row">
|
||||
<div class="col s12 m6">
|
||||
@ -68,7 +68,7 @@
|
||||
{{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
|
||||
</div>
|
||||
<div class="col s12 l5">
|
||||
{{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }}
|
||||
{{ wtf.render_field(form.txt, accept='text/plain', placeholder='Choose a plain text file') }}
|
||||
</div>
|
||||
<div class="col s12 l4">
|
||||
{{ wtf.render_field(form.model, material_icon='language') }}
|
@ -2,7 +2,7 @@
|
||||
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
|
||||
{% import "materialize/wtf.html.j2" as wtf %}
|
||||
|
||||
{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr"{% endblock main_attribs %}
|
||||
{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr-pipeline"{% endblock main_attribs %}
|
||||
|
||||
{% block page_content %}
|
||||
<div class="container">
|
||||
@ -16,13 +16,13 @@
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr-pipeline"></i>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col s12 m9 pull-m3">
|
||||
<div class="card service-color-border border-darken" data-service="tesseract-ocr" style="border-top: 10px solid;">
|
||||
<div class="card service-color-border border-darken" data-service="tesseract-ocr-pipeline" style="border-top: 10px solid;">
|
||||
<div class="card-content">
|
||||
<div class="row">
|
||||
<div class="col s12">
|
||||
@ -50,7 +50,7 @@
|
||||
{{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
|
||||
</div>
|
||||
<div class="col s12 l5">
|
||||
{{ wtf.render_field(form.files, accept='application/pdf', placeholder='Choose your .pdf files') }}
|
||||
{{ wtf.render_field(form.pdf, accept='application/pdf', placeholder='Choose a PDF file') }}
|
||||
</div>
|
||||
<div class="col s12 l4">
|
||||
{{ wtf.render_field(form.model, material_icon='language') }}
|
169
app/templates/services/transkribus_htr_pipeline.html.j2
Normal file
169
app/templates/services/transkribus_htr_pipeline.html.j2
Normal file
@ -0,0 +1,169 @@
|
||||
{% extends "base.html.j2" %}
|
||||
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
|
||||
{% import "materialize/wtf.html.j2" as wtf %}
|
||||
|
||||
{% block main_attribs %} class="service-scheme" data-service="transkribus-htr-pipeline"{% endblock main_attribs %}
|
||||
|
||||
{% block page_content %}
|
||||
<div class="container">
|
||||
<div class="row">
|
||||
<div class="col s12">
|
||||
<h1 id="title">{{ title }}</h1>
|
||||
</div>
|
||||
|
||||
<div class="col s12 m3 push-m9">
|
||||
<div class="center-align">
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="transkribus-htr-pipeline"></i>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col s12 m9 pull-m3">
|
||||
<div class="card service-color-border border-darken" data-service="transkribus-htr-pipeline" style="border-top: 10px solid;">
|
||||
<div class="card-content">
|
||||
<div class="row">
|
||||
<div class="col s12">
|
||||
<div class="card-panel z-depth-0">
|
||||
<span class="card-title"><i class="left material-icons">layers</i>HTR</span>
|
||||
<p>In this process, nopaque converts your image data – like photos or scans – into text data. This step enables you to proceed with the computational analysis of your documents.</p>
|
||||
<p class="right-align">
|
||||
<a href="https://readcoop.eu/de/transkribus/" target="_blank">
|
||||
<img src="https://readcoop.eu/wp-content/uploads/2020/02/Logo_Transkribus_web.svg" title="Logo_Transkribus_web" alt="Logo_Transkribus_web" style="width: 30%;">
|
||||
</a>
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col s12">
|
||||
<h2>Submit a job</h2>
|
||||
<div class="card">
|
||||
<form class="nopaque-upload-form" data-progress-modal="progress-modal">
|
||||
<div class="card-content">
|
||||
{{ form.hidden_tag() }}
|
||||
<div class="row">
|
||||
<div class="col s12 l4">
|
||||
{{ wtf.render_field(form.title, data_length='32', material_icon='title') }}
|
||||
</div>
|
||||
<div class="col s12 l8">
|
||||
{{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
|
||||
</div>
|
||||
<div class="col s12 l5">
|
||||
{{ wtf.render_field(form.pdf, accept='application/pdf', placeholder='Choose a PDF file') }}
|
||||
</div>
|
||||
<div class="col s12 l4">
|
||||
{{ wtf.render_field(form.model, material_icon='language') }}
|
||||
</div>
|
||||
<div class="col s12 l3">
|
||||
{{ wtf.render_field(form.version, material_icon='apps') }}
|
||||
</div>
|
||||
<div class="col s12">
|
||||
<span class="card-title">Preprocessing</span>
|
||||
</div>
|
||||
<div class="col s9">
|
||||
<p>{{ form.binarization.label.text }}</p>
|
||||
<p class="light">Based on a brightness threshold pixels are converted into either black or white. It is useful to reduce noise in images. (<b>longer duration</b>)</p>
|
||||
</div>
|
||||
<div class="col s3 right-align">
|
||||
<div class="switch">
|
||||
<label>
|
||||
{{ form.binarization() }}
|
||||
<span class="lever"></span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col s12"><p> </p></div>
|
||||
<div class="col s12 divider"></div>
|
||||
<div class="col s12"><p> </p></div>
|
||||
<div class="col s9">
|
||||
<p>Page range</p>
|
||||
<p class="light"></p>
|
||||
</div>
|
||||
<div class="col s3 right-align">
|
||||
<div class="switch">
|
||||
<label>
|
||||
<input disabled type="checkbox">
|
||||
<span class="lever"></span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col s12"><p> </p></div>
|
||||
<div class="col s12 divider"></div>
|
||||
<div class="col s12"><p> </p></div>
|
||||
<div class="col s9">
|
||||
<p>Page rotation</p>
|
||||
<p class="light"></p>
|
||||
</div>
|
||||
<div class="col s3 right-align">
|
||||
<div class="switch">
|
||||
<label>
|
||||
<input disabled type="checkbox">
|
||||
<span class="lever"></span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col s12"><p> </p></div>
|
||||
<div class="col s12 divider"></div>
|
||||
<div class="col s12"><p> </p></div>
|
||||
<div class="col s9">
|
||||
<p>Page split</p>
|
||||
<p class="light"></p>
|
||||
</div>
|
||||
<div class="col s3 right-align">
|
||||
<div class="switch">
|
||||
<label>
|
||||
<input disabled type="checkbox">
|
||||
<span class="lever"></span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
<!--
|
||||
Separate each setting with the following
|
||||
<div class="col s12"><p> </p></div>
|
||||
<div class="col s12 divider"></div>
|
||||
<div class="col s12"><p> </p></div>
|
||||
-->
|
||||
</div>
|
||||
</div>
|
||||
<div class="card-action right-align">
|
||||
{{ wtf.render_field(form.submit, material_icon='send') }}
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endblock page_content %}
|
||||
|
||||
{% block modals %}
|
||||
{{ super() }}
|
||||
<div id="progress-modal" class="modal">
|
||||
<div class="modal-content">
|
||||
<h4><i class="material-icons left">file_upload</i>Uploading files...</h4>
|
||||
<div class="progress">
|
||||
<div class="determinate" style="width: 0%"></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="modal-footer">
|
||||
<a href="#!" class="modal-close waves-effect waves-light btn red abort-request">Cancel</a>
|
||||
</div>
|
||||
</div>
|
||||
{% endblock modals %}
|
||||
|
||||
{% block scripts %}
|
||||
{{ super() }}
|
||||
<script>
|
||||
let versionField = document.querySelector('#add-job-form-version');
|
||||
versionField.addEventListener('change', (event) => {
|
||||
let url = new URL(window.location.href);
|
||||
url.search = `?version=${event.target.value}`;
|
||||
window.location.href = url.toString();
|
||||
});
|
||||
</script>
|
||||
{% endblock scripts %}
|
@ -92,6 +92,11 @@ class Config:
|
||||
NOPAQUE_PROXY_FIX_X_PROTO = \
|
||||
int(os.environ.get('NOPAQUE_PROXY_FIX_X_PROTO', '0'))
|
||||
|
||||
NOPAQUE_TRANSKRIBUS_ENABLED = \
|
||||
os.environ.get('NOPAQUE_TRANSKRIBUS_ENABLED', 'false').lower() == 'true'
|
||||
NOPAQUE_READCOOP_USERNAME = os.environ.get('NOPAQUE_READCOOP_USERNAME')
|
||||
NOPAQUE_READCOOP_PASSWORD = os.environ.get('NOPAQUE_READCOOP_PASSWORD')
|
||||
|
||||
@classmethod
|
||||
def init_app(cls, app: Flask):
|
||||
# Set up logging according to the corresponding (NOPAQUE_LOG_*)
|
||||
|
@ -18,13 +18,15 @@ services:
|
||||
- "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http"
|
||||
- "traefik.http.routers.http-nopaque.entrypoints=http"
|
||||
- "traefik.http.routers.http-nopaque.middlewares=http-nopaque-headers, redirect-to-https@file"
|
||||
- "traefik.http.routers.http-nopaque.rule=Host(`${SERVER_NAME}`)"
|
||||
# Replace <nopaque-domain> with your domain
|
||||
- "traefik.http.routers.http-nopaque.rule=Host(`<nopaque-domain>`)"
|
||||
### </http> ###
|
||||
### <https> ###
|
||||
- "traefik.http.middlewares.https-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=https"
|
||||
- "traefik.http.routers.https-nopaque.entrypoints=https"
|
||||
- "traefik.http.routers.https-nopaque.middlewares=hsts-header@file, https-nopaque-headers"
|
||||
- "traefik.http.routers.https-nopaque.rule=Host(`${SERVER_NAME}`)"
|
||||
# Replace <nopaque-domain> with your domain
|
||||
- "traefik.http.routers.https-nopaque.rule=Host(`<nopaque-domain>`)"
|
||||
- "traefik.http.routers.https-nopaque.tls.certresolver=<CERTRESOLVER>"
|
||||
- "traefik.http.routers.https-nopaque.tls.options=intermediate@file"
|
||||
### </https> ###
|
||||
|
@ -1,8 +1,8 @@
|
||||
"""empty message
|
||||
|
||||
Revision ID: 097aae1f02d7
|
||||
Revision ID: aa855b80cf1d
|
||||
Revises:
|
||||
Create Date: 2022-02-08 10:02:03.748588
|
||||
Create Date: 2022-04-01 12:14:42.606685
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
@ -10,7 +10,7 @@ import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = '097aae1f02d7'
|
||||
revision = 'aa855b80cf1d'
|
||||
down_revision = None
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
@ -56,7 +56,6 @@ def upgrade():
|
||||
sa.Column('title', sa.String(length=32), nullable=True),
|
||||
sa.Column('num_analysis_sessions', sa.Integer(), nullable=True),
|
||||
sa.Column('num_tokens', sa.Integer(), nullable=True),
|
||||
sa.Column('archive_file', sa.String(length=255), nullable=True),
|
||||
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
|
||||
sa.PrimaryKeyConstraint('id')
|
||||
)
|
||||
@ -85,6 +84,7 @@ def upgrade():
|
||||
sa.Column('description', sa.String(length=255), nullable=True),
|
||||
sa.Column('publisher', sa.String(length=128), nullable=True),
|
||||
sa.Column('publishing_year', sa.Integer(), nullable=True),
|
||||
sa.Column('shared', sa.Boolean(), nullable=True),
|
||||
sa.Column('title', sa.String(length=64), nullable=True),
|
||||
sa.Column('version', sa.String(length=16), nullable=True),
|
||||
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
|
@ -2,6 +2,7 @@ cqi
|
||||
docker
|
||||
eventlet==0.30.2
|
||||
Flask==1.1.4
|
||||
Flask-APScheduler
|
||||
Flask-Assets
|
||||
Flask-Hashids
|
||||
Flask-HTTPAuth
|
||||
@ -16,6 +17,7 @@ Flask-WTF
|
||||
gunicorn
|
||||
hiredis
|
||||
jsonschema
|
||||
MarkupSafe==2.0.1
|
||||
psycopg2
|
||||
pyScss
|
||||
python-dotenv
|
||||
|
Loading…
x
Reference in New Issue
Block a user