From ce997e69ea0f3487727c418083539c8da39ea94f Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Mon, 4 Apr 2022 13:31:09 +0200 Subject: [PATCH] Rename all services, use scss, cleanup, add sandpaper conversion script --- .env.tpl | 8 + .gitignore | 3 +- Dockerfile | 4 +- app/TesseractOCRModel.defaults.yml | 408 ++++++++++++++++++ app/TranskribusHTRModel.defaults.yml | 0 app/__init__.py | 2 + app/cli.py | 68 +-- app/converters/__init__.py | 0 app/converters/sandpaper.py | 215 +++++++++ app/corpora/routes.py | 2 +- app/daemon/job_utils.py | 58 ++- app/models.py | 82 ++-- app/services/forms.py | 179 ++++---- app/services/routes.py | 244 +++++++++-- app/services/services.yml | 72 +++- app/static/css/colors.scss | 11 +- app/static/css/style.css | 7 +- app/templates/_roadmap.html.j2 | 8 +- app/templates/_sidenav.html.j2 | 11 +- app/templates/main/dashboard.html.j2 | 24 +- app/templates/main/faq.html.j2 | 6 +- app/templates/main/index.html.j2 | 20 +- app/templates/services/_breadcrumbs.html.j2 | 18 +- ...up.html.j2 => file_setup_pipeline.html.j2} | 8 +- ...nlp.html.j2 => spacy_nlp_pipeline.html.j2} | 8 +- ...html.j2 => tesseract_ocr_pipeline.html.j2} | 8 +- .../services/transkribus_htr_pipeline.html.j2 | 169 ++++++++ config.py | 5 + docker-compose.traefik.yml | 6 +- .../{097aae1f02d7_.py => aa855b80cf1d_.py} | 8 +- requirements.txt | 2 + 31 files changed, 1361 insertions(+), 303 deletions(-) create mode 100644 app/TranskribusHTRModel.defaults.yml create mode 100644 app/converters/__init__.py create mode 100644 app/converters/sandpaper.py rename app/templates/services/{file_setup.html.j2 => file_setup_pipeline.html.j2} (89%) rename app/templates/services/{spacy_nlp.html.j2 => spacy_nlp_pipeline.html.j2} (95%) rename app/templates/services/{tesseract_ocr.html.j2 => tesseract_ocr_pipeline.html.j2} (95%) create mode 100644 app/templates/services/transkribus_htr_pipeline.html.j2 rename migrations/versions/{097aae1f02d7_.py => aa855b80cf1d_.py} (97%) diff --git a/.env.tpl 
b/.env.tpl index 9e067f09..07db6275 100644 --- a/.env.tpl +++ b/.env.tpl @@ -168,3 +168,11 @@ NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI= # DEFAULT: 0 # Number of values to trust for X-Forwarded-Proto # NOPAQUE_PROXY_FIX_X_PROTO= + +# CHOOSE ONE: False, True +# DEFAULT: False +# NOPAQUE_TRANSKRIBUS_ENABLED= + +# READ-COOP account data: https://readcoop.eu/ +# NOPAQUE_READCOOP_USERNAME= +# NOPAQUE_READCOOP_PASSWORD= \ No newline at end of file diff --git a/.gitignore b/.gitignore index 61a99e04..76c4e06b 100644 --- a/.gitignore +++ b/.gitignore @@ -18,7 +18,8 @@ data/** pip-log.txt # Logs in log folder -logs/*.log +logs/* +!logs/dummy # Packages *.egg diff --git a/Dockerfile b/Dockerfile index c327703f..6ec450b5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ -FROM python:3.9.8-slim-buster +FROM python:3.8.13-slim-buster -LABEL authors="Patrick Jentsch , Stephan Porada " +LABEL authors="Patrick Jentsch " ARG DOCKER_GID diff --git a/app/TesseractOCRModel.defaults.yml b/app/TesseractOCRModel.defaults.yml index 37929e89..0d067d54 100644 --- a/app/TesseractOCRModel.defaults.yml +++ b/app/TesseractOCRModel.defaults.yml @@ -6,6 +6,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Amharic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata' @@ -14,6 +18,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Arabic' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata' @@ -22,6 +30,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Assamese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata' @@ -30,6 +42,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - 
title: 'Azerbaijani' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata' @@ -38,6 +54,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Azerbaijani - Cyrillic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata' @@ -46,6 +66,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Belarusian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata' @@ -54,6 +78,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Bengali' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata' @@ -62,6 +90,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Tibetan' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata' @@ -70,6 +102,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Bosnian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata' @@ -78,6 +114,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Bulgarian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata' @@ -86,6 +126,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Catalan; Valencian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata' @@ -94,6 +138,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - 
'0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Cebuano' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata' @@ -102,6 +150,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Czech' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata' @@ -110,6 +162,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Chinese - Simplified' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata' @@ -118,6 +174,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Chinese - Traditional' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata' @@ -126,6 +186,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Cherokee' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata' @@ -134,6 +198,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Welsh' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata' @@ -142,6 +210,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Danish' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata' @@ -150,6 +222,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'German' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata' @@ -158,6 +234,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' 
+ - '0.1.3' + - '0.1.4' # - title: 'Dzongkha' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata' @@ -166,6 +246,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Greek, Modern (1453-)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata' @@ -174,6 +258,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'English' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata' @@ -182,6 +270,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'English, Middle (1100-1500)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata' @@ -190,6 +282,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Esperanto' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata' @@ -198,6 +294,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Estonian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata' @@ -206,6 +306,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Basque' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata' @@ -214,6 +318,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Persian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata' @@ -222,6 +330,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - 
'0.1.3' +# - '0.1.4' # - title: 'Finnish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata' @@ -230,6 +342,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'French' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata' @@ -238,6 +354,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'German Fraktur' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata' @@ -246,6 +366,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'French, Middle (ca. 1400-1600)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata' @@ -254,6 +378,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Irish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata' @@ -262,6 +390,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Galician' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata' @@ -270,6 +402,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Greek, Ancient (-1453)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata' @@ -278,6 +414,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Gujarati' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata' @@ -286,6 +426,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# 
- '0.1.4' # - title: 'Haitian; Haitian Creole' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata' @@ -294,6 +438,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Hebrew' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata' @@ -302,6 +450,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Hindi' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata' @@ -310,6 +462,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Croatian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata' @@ -318,6 +474,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Hungarian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata' @@ -326,6 +486,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Inuktitut' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata' @@ -334,6 +498,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Indonesian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata' @@ -342,6 +510,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Icelandic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata' @@ -350,6 +522,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - 
'0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Italian' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata' @@ -358,6 +534,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'Italian - Old' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata' @@ -366,6 +546,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Javanese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata' @@ -374,6 +558,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Japanese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata' @@ -382,6 +570,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Kannada' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata' @@ -390,6 +582,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Georgian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata' @@ -398,6 +594,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Georgian - Old' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata' @@ -406,6 +606,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Kazakh' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata' @@ -414,6 +618,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - 
'0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Central Khmer' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata' @@ -422,6 +630,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Kirghiz; Kyrgyz' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata' @@ -430,6 +642,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Korean' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata' @@ -438,6 +654,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Kurdish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata' @@ -446,6 +666,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Lao' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata' @@ -454,6 +678,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Latin' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata' @@ -462,6 +690,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Latvian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata' @@ -470,6 +702,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Lithuanian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata' @@ -478,6 +714,10 @@ # version: '4.1.0' # 
compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Malayalam' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata' @@ -486,6 +726,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Marathi' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata' @@ -494,6 +738,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Macedonian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata' @@ -502,6 +750,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Maltese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata' @@ -510,6 +762,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Malay' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata' @@ -518,6 +774,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Burmese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata' @@ -526,6 +786,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Nepali' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata' @@ -534,6 +798,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Dutch; Flemish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata' @@ -542,6 +810,10 @@ # 
version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Norwegian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata' @@ -550,6 +822,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Oriya' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata' @@ -558,6 +834,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Panjabi; Punjabi' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata' @@ -566,6 +846,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Polish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata' @@ -574,6 +858,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Portuguese' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata' @@ -582,6 +870,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Pushto; Pashto' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata' @@ -590,6 +882,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Romanian; Moldavian; Moldovan' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata' @@ -598,6 +894,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Russian' description: '' url: 
'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata' @@ -606,6 +906,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Sanskrit' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata' @@ -614,6 +918,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Sinhala; Sinhalese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata' @@ -622,6 +930,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Slovak' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata' @@ -630,6 +942,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Slovenian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata' @@ -638,6 +954,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Spanish; Castilian' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata' @@ -646,6 +966,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'Spanish; Castilian - Old' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata' @@ -654,6 +978,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Albanian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata' @@ -662,6 +990,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Serbian' # 
description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata' @@ -670,6 +1002,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Serbian - Latin' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata' @@ -678,6 +1014,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Swahili' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata' @@ -686,6 +1026,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Swedish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata' @@ -694,6 +1038,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Syriac' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata' @@ -702,6 +1050,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Tamil' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata' @@ -710,6 +1062,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Telugu' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata' @@ -718,6 +1074,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Tajik' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata' @@ -726,6 +1086,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' 
# - title: 'Tagalog' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata' @@ -734,6 +1098,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Thai' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata' @@ -742,6 +1110,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Tigrinya' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata' @@ -750,6 +1122,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Turkish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata' @@ -758,6 +1134,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Uighur; Uyghur' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata' @@ -766,6 +1146,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Ukrainian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata' @@ -774,6 +1158,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Urdu' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata' @@ -782,6 +1170,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Uzbek' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata' @@ -790,6 +1182,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - 
'0.1.3' +# - '0.1.4' # - title: 'Uzbek - Cyrillic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata' @@ -798,6 +1194,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Vietnamese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata' @@ -806,6 +1206,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Yiddish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata' @@ -814,3 +1218,7 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' diff --git a/app/TranskribusHTRModel.defaults.yml b/app/TranskribusHTRModel.defaults.yml new file mode 100644 index 00000000..e69de29b diff --git a/app/__init__.py b/app/__init__.py index 46ca4533..9db97d3f 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,5 +1,6 @@ from config import Config from flask import Flask +from flask_apscheduler import APScheduler from flask_login import LoginManager from flask_mail import Mail from flask_migrate import Migrate @@ -20,6 +21,7 @@ mail: Mail = Mail() migrate: Migrate = Migrate() paranoid: Paranoid = Paranoid() paranoid.redirect_view = '/' +scheduler: APScheduler = APScheduler() # TODO: Use this! socketio: SocketIO = SocketIO() diff --git a/app/cli.py b/app/cli.py index e588eef9..4bff77d3 100644 --- a/app/cli.py +++ b/app/cli.py @@ -2,9 +2,8 @@ from flask import current_app from flask_migrate import upgrade from . import db from .models import Corpus, Job, Role, User, TesseractOCRModel -import json +import click import os -import re def _make_default_dirs(): @@ -56,6 +55,19 @@ def register(app): daemon: Daemon = Daemon() daemon.run() + @app.cli.group() + def converter(): + ''' Converter commands. 
''' + pass + + @converter.command() + @click.argument('json_db') + @click.argument('data_dir') + def sandpaper(json_db, data_dir): + ''' Sandpaper converter ''' + from app.converters.sandpaper import convert + convert(json_db, data_dir) + @app.cli.group() def test(): ''' Test commands. ''' @@ -68,55 +80,3 @@ def register(app): from unittest.suite import TestSuite tests: TestSuite = TestLoader().discover('tests') TextTestRunner(verbosity=2).run(tests) - - @app.cli.group() - def convert(): - ''' Datebase convert commands. ''' - - @convert.command() - def nlp_jobs(): - for job in Job.query.filter_by(service='nlp').all(): - job.service = 'spacy-nlp' - service_args = json.loads(job.service_args) - new_service_args = {} - for service_arg in service_args: - if service_arg == '--check-encoding': - new_service_args['encoding_detection'] = True - elif re.match(r'-l ([a-z]{2})', service_arg): - language_code = re.search(r'-l ([a-z]{2})', service_arg).group(1) # noqa - new_service_args['language'] = language_code - job.service_args = json.dumps(new_service_args) - db.session.commit() - - @convert.command() - def ocr_jobs(): - # Language code to TesseractOCRModel.title lookup - language_code_lookup = { - 'ara': 'Arabic', - 'chi_tra': 'Chinese - Traditional', - 'dan': 'Danish', - 'eng': 'English', - 'enm': 'English, Middle (1100-1500)', - 'fra': 'French', - 'frm': 'French, Middle (ca. 
1400-1600)', - 'deu': 'German', - 'frk': 'German Fraktur', - 'ell': 'Greek, Modern (1453-)', - 'ita': 'Italian', - 'por': 'Portuguese', - 'rus': 'Russian', - 'spa': 'Spanish; Castilian' - } - for job in Job.query.filter_by(service='ocr').all(): - job.service = 'tesseract-ocr' - service_args = json.loads(job.service_args) - new_service_args = {} - for service_arg in service_args: - if service_arg == '--binarize': - new_service_args['binarization'] = True - elif re.match(r'-l ([a-z]{3})', service_arg): - language_code = re.search(r'-l ([a-z]{3})', service_arg).group(1) # noqa - tesseract_ocr_model = TesseractOCRModel.query.filter_by(title=language_code_lookup[language_code]).first() # noqa - new_service_args['model'] = tesseract_ocr_model.id - job.service_args = json.dumps(new_service_args) - db.session.commit() diff --git a/app/converters/__init__.py b/app/converters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/converters/sandpaper.py b/app/converters/sandpaper.py new file mode 100644 index 00000000..3172183e --- /dev/null +++ b/app/converters/sandpaper.py @@ -0,0 +1,215 @@ +from flask import current_app +from app import db +from app.models import User, Corpus, CorpusFile +from datetime import datetime +import json +import os + + +def convert(json_db_file, data_dir): + with open(json_db_file, 'r') as f: + json_db = json.loads(f.read()) + + for json_user in json_db: + if not json_user['confirmed']: + current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}') + continue + user_dir = os.path.join(data_dir, json_user['id']) + convert_user(json_user, user_dir) + db.session.commit() + + +def convert_user(json_user, user_dir): + current_app.logger.info(f'Create User {json_user["username"]}...') + user = User( + confirmed=json_user['confirmed'], + email=json_user['email'], + last_seen=datetime.fromtimestamp(json_user['last_seen']), + member_since=datetime.fromtimestamp(json_user['member_since']), + 
password_hash=json_user['password_hash'], # TODO: Needs to be added manually + username=json_user['username'] + ) + db.session.add(user) + db.session.flush(objects=[user]) + db.session.refresh(user) + try: + user.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + raise Exception('Internal Server Error') + for json_corpus in json_user['corpora'].values(): + if not json_corpus['files'].values(): + current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}') + continue + corpus_dir = os.path.join(user_dir, 'corpora', json_corpus['id']) + convert_corpus(json_corpus, user, corpus_dir) + current_app.logger.info('Done') + + +def convert_corpus(json_corpus, user, corpus_dir): + current_app.logger.info(f'Create Corpus {json_corpus["title"]}...') + corpus = Corpus( + user=user, + creation_date=datetime.fromtimestamp(json_corpus['creation_date']), + description=json_corpus['description'], + last_edited_date=datetime.fromtimestamp(json_corpus['last_edited_date']), + title=json_corpus['title'] + ) + db.session.add(corpus) + db.session.flush(objects=[corpus]) + db.session.refresh(corpus) + try: + corpus.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + raise Exception('Internal Server Error') + for json_corpus_file in json_corpus['files'].values(): + corpus_file_dir = os.path.join(corpus_dir, 'files', json_corpus_file['id']) + convert_corpus_file(json_corpus_file, corpus, corpus_file_dir) + current_app.logger.info('Done') + + +def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir): + current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...') + corpus_file = CorpusFile( + corpus=corpus, + address=json_corpus_file['address'], + author=json_corpus_file['author'], + booktitle=json_corpus_file['booktitle'], + chapter=json_corpus_file['chapter'], + editor=json_corpus_file['editor'], + filename=json_corpus_file['filename'], + institution=json_corpus_file['institution'], 
+ journal=json_corpus_file['journal'], + mimetype='application/vrt+xml', + pages=json_corpus_file['pages'], + publisher=json_corpus_file['publisher'], + publishing_year=json_corpus_file['publishing_year'], + school=json_corpus_file['school'], + title=json_corpus_file['title'] + ) + db.session.add(corpus_file) + db.session.flush(objects=[corpus_file]) + db.session.refresh(corpus_file) + try: + convert_vrt( + os.path.join(corpus_file_dir, json_corpus_file['filename']), + corpus_file.path + ) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + raise Exception('Internal Server Error') + current_app.logger.info('Done') + + +def convert_vrt(input_file, output_file): + def check_pos_attribute_order(vrt_lines): + # The following orders are possible: + # since 26.02.2019: 'word,lemma,simple_pos,pos,ner' + # since 26.03.2021: 'word,pos,lemma,simple_pos,ner' + # since 27.01.2022: 'word,pos,lemma,simple_pos' + # This Function tries to find out which order we have by looking at the + # number of attributes and the position of the simple_pos attribute + SIMPLE_POS_LABELS = [ + 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', + 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', + 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', + 'VERB', 'X' + ] + for line in vrt_lines: + if line.startswith('<'): + continue + pos_attrs = line.rstrip('\n').split('\t') + num_pos_attrs = len(pos_attrs) + if num_pos_attrs == 4: + if pos_attrs[3] in SIMPLE_POS_LABELS: + return ['word', 'pos', 'lemma', 'simple_pos'] + continue + elif num_pos_attrs == 5: + if pos_attrs[2] in SIMPLE_POS_LABELS: + return ['word', 'lemma', 'simple_pos', 'pos', 'ner'] + elif pos_attrs[3] in SIMPLE_POS_LABELS: + return ['word', 'pos', 'lemma', 'simple_pos', 'ner'] + continue + return None + + + def check_has_ent_as_s_attr(vrt_lines): + for line in vrt_lines: + if line.startswith('\n' + current_ent = pos_attrs[4] + elif current_ent != pos_attrs[4]: + output_vrt += '\n' + current_ent = None + output_vrt += f'\n' + current_ent = 
pos_attrs[4] + output_vrt += pos_attrs_to_string_function(pos_attrs) + + with open(output_file, 'w') as f: + f.write(output_vrt) diff --git a/app/corpora/routes.py b/app/corpora/routes.py index fd0085ee..3b334d83 100644 --- a/app/corpora/routes.py +++ b/app/corpora/routes.py @@ -319,7 +319,7 @@ def corpus_file(corpus_id, corpus_file_id): form.title.data = corpus_file.title return render_template( 'corpora/corpus_file.html.j2', - corpus=corpus, + corpus=corpus_file.corpus, corpus_file=corpus_file, form=form, title='Edit corpus file' diff --git a/app/daemon/job_utils.py b/app/daemon/job_utils.py index e4797e26..e56bafbc 100644 --- a/app/daemon/job_utils.py +++ b/app/daemon/job_utils.py @@ -22,34 +22,46 @@ class CheckJobsMixin: def create_job_service(self, job): ''' # Docker service settings # ''' ''' ## Service specific settings ## ''' - if job.service == 'file-setup': + if job.service == 'file-setup-pipeline': mem_mb = 512 n_cores = 2 - executable = 'file-setup' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:v{job.service_version}' # noqa - elif job.service == 'tesseract-ocr': - mem_mb = 2048 + executable = 'file-setup-pipeline' + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup-pipeline:v{job.service_version}' # noqa + elif job.service == 'tesseract-ocr-pipeline': + mem_mb = 1024 n_cores = 4 - executable = 'ocr' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:v{job.service_version}' # noqa - elif job.service == 'spacy-nlp': + executable = 'tesseract-ocr-pipeline' + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}tesseract-ocr-pipeline:v{job.service_version}' # noqa + elif job.service == 'transkribus-htr-pipeline': + mem_mb = 1024 + n_cores = 4 + executable = 'transkribus-htr-pipeline' + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}transkribus-htr-pipeline:v{job.service_version}' # noqa + elif job.service == 'spacy-nlp-pipeline': mem_mb = 1024 n_cores = 1 - executable = 
'nlp' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:v{job.service_version}' # noqa + executable = 'spacy-nlp-pipeline' + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}spacy-nlp-pipeline:v{job.service_version}' # noqa ''' ## Command ## ''' command = f'{executable} -i /input -o /output' command += ' --log-dir /logs' command += f' --mem-mb {mem_mb}' command += f' --n-cores {n_cores}' - service_args = json.loads(job.service_args) - if job.service == 'spacy-nlp': - command += f' -m {service_args["model"]}' - if 'encoding_detection' in service_args and service_args['encoding_detection']: # noqa + if job.service == 'spacy-nlp-pipeline': + command += f' -m {job.service_args["model"]}' + if 'encoding_detection' in job.service_args and job.service_args['encoding_detection']: # noqa command += ' --check-encoding' - elif job.service == 'tesseract-ocr': - command += f' -m {service_args["model"]}' - if 'binarization' in service_args and service_args['binarization']: + elif job.service == 'tesseract-ocr-pipeline': + command += f' -m {job.service_args["model"]}' + if 'binarization' in job.service_args and job.service_args['binarization']: + command += ' --binarize' + elif job.service == 'transkribus-htr-pipeline': + command += f' -m {job.service_args["model"]}' + readcoop_username = current_app.config.get('NOPAQUE_READCOOP_USERNAME') + command += f' --readcoop-username "{readcoop_username}"' + readcoop_password = current_app.config.get('NOPAQUE_READCOOP_PASSWORD') + command += f' --readcoop-password "{readcoop_password}"' + if 'binarization' in job.service_args and job.service_args['binarization']: command += ' --binarize' ''' ## Constraints ## ''' constraints = ['node.role==worker'] @@ -63,16 +75,15 @@ class CheckJobsMixin: mounts = [] ''' ### Input mount(s) ### ''' input_mount_target_base = '/input' - if job.service == 'file-setup': + if job.service == 'file-setup-pipeline': input_mount_target_base += f'/{secure_filename(job.title)}' for 
job_input in job.inputs: input_mount_source = job_input.path - input_mount_target = f'/{input_mount_target_base}/{job_input.filename}' # noqa + input_mount_target = f'{input_mount_target_base}/{job_input.filename}' # noqa input_mount = f'{input_mount_source}:{input_mount_target}:ro' mounts.append(input_mount) - if job.service == 'tesseract-ocr': - service_args = json.loads(job.service_args) - model = TesseractOCRModel.query.get(service_args['model']) + if job.service == 'tesseract-ocr-pipeline': + model = TesseractOCRModel.query.get(job.service_args['model']) if model is None: job.status = JobStatus.FAILED return @@ -114,7 +125,8 @@ class CheckJobsMixin: mounts=mounts, name=name, resources=resources, - restart_policy=restart_policy + restart_policy=restart_policy, + user='1000:1000' ) except docker.errors.APIError as e: current_app.logger.error( diff --git a/app/models.py b/app/models.py index d391807a..528b11dc 100644 --- a/app/models.py +++ b/app/models.py @@ -36,14 +36,23 @@ class IntEnumColumn(db.TypeDecorator): return self.enum_type(value) -class Permission(IntEnum): - ''' - Defines User permissions as integers by the power of 2. User permission - can be evaluated using the bitwise operator &. - ''' - ADMINISTRATE = 4 - CONTRIBUTE = 2 - USE_API = 1 +class ContainerColumn(db.TypeDecorator): + impl = db.String + + def __init__(self, container_type, *args, **kwargs): + super().__init__(*args, **kwargs) + self.container_type = container_type + + def process_bind_param(self, value, dialect): + if isinstance(value, self.container_type): + return json.dumps(value) + elif isinstance(value, str) and isinstance(json.loads(value), self.container_type): # noqa + return value + else: + return TypeError() + + def process_result_value(self, value, dialect): + return json.loads(value) class FileMixin: @@ -61,6 +70,16 @@ class FileMixin: } +class Permission(IntEnum): + ''' + Defines User permissions as integers by the power of 2. 
User permission + can be evaluated using the bitwise operator &. + ''' + ADMINISTRATE = 1 + CONTRIBUTE = 2 + USE_API = 4 + + class Role(HashidMixin, db.Model): __tablename__ = 'roles' # Primary key @@ -102,7 +121,7 @@ class Role(HashidMixin, db.Model): 'permissions': self.permissions } if relationships: - dict_role['users']: { + dict_role['users'] = { x.to_dict(backrefs=False, relationships=True) for x in self.users } @@ -339,10 +358,11 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model): # Foreign keys user_id = db.Column(db.Integer, db.ForeignKey('users.id')) # Fields - compatible_service_versions = db.Column(db.String(255)) + compatible_service_versions = db.Column(ContainerColumn(list, 255)) description = db.Column(db.String(255)) publisher = db.Column(db.String(128)) publishing_year = db.Column(db.Integer) + shared = db.Column(db.Boolean, default=False) title = db.Column(db.String(64)) version = db.Column(db.String(16)) # Backrefs: user: User @@ -356,11 +376,10 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model): ) def to_dict(self, backrefs=False, relationships=False): - compatible_service_versions = json.loads(self.compatible_service_versions) # noqa dict_tesseract_ocr_model = { 'id': self.hashid, 'user_id': self.user.hashid, - 'compatible_service_versions': compatible_service_versions, + 'compatible_service_versions': self.compatible_service_versions, 'description': self.description, 'publisher': self.publisher, 'publishing_year': self.publishing_year, @@ -384,31 +403,39 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model): with open(defaults_file, 'r') as f: defaults = yaml.safe_load(f) for m in defaults: - if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None: # noqa + model = TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() # noqa + if model is not None: + model.compatible_service_versions = m['compatible_service_versions'] + model.description = 
m['description'] + model.publisher = m['publisher'] + model.publishing_year = m['publishing_year'] + model.title = m['title'] + model.version = m['version'] continue - tesseract_ocr_model = TesseractOCRModel( - compatible_service_versions=json.dumps(m['compatible_service_versions']), # noqa + model = TesseractOCRModel( + compatible_service_versions=m['compatible_service_versions'], description=m['description'], publisher=m['publisher'], publishing_year=m['publishing_year'], + shared=True, title=m['title'], user=user, version=m['version'] ) - db.session.add(tesseract_ocr_model) - db.session.flush(objects=[tesseract_ocr_model]) - db.session.refresh(tesseract_ocr_model) - tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata' # noqa + db.session.add(model) + db.session.flush(objects=[model]) + db.session.refresh(model) + model.filename = f'{model.id}.traineddata' r = requests.get(m['url'], stream=True) pbar = tqdm( - desc=f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})', # noqa + desc=f'{model.title} ({model.filename})', unit="B", unit_scale=True, unit_divisor=1024, total=int(r.headers['Content-Length']) ) pbar.clear() - with open(tesseract_ocr_model.path, 'wb') as f: + with open(model.path, 'wb') as f: for chunk in r.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks pbar.update(len(chunk)) @@ -560,11 +587,7 @@ class Job(HashidMixin, db.Model): description = db.Column(db.String(255)) end_date = db.Column(db.DateTime()) service = db.Column(db.String(64)) - ''' - ' Dictionary as JSON formatted string. 
- ' Example: {"binarization": True} - ''' - service_args = db.Column(db.String(255)) + service_args = db.Column(ContainerColumn(dict, 255)) service_version = db.Column(db.String(16)) status = db.Column( IntEnumColumn(JobStatus), @@ -643,10 +666,6 @@ class Job(HashidMixin, db.Model): self.status = JobStatus.SUBMITTED def to_dict(self, backrefs=False, relationships=False): - service_args = json.loads(self.service_args) - if self.service == 'tesseract-ocr' and 'model' in service_args: - tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model']) # noqa - service_args['model'] = tesseract_ocr_pipeline_model.title dict_job = { 'id': self.hashid, 'user_id': self.user.hashid, @@ -654,7 +673,7 @@ class Job(HashidMixin, db.Model): 'description': self.description, 'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z', # noqa 'service': self.service, - 'service_args': service_args, + 'service_args': self.service_args, 'service_version': self.service_version, 'status': self.status.name, 'title': self.title, @@ -798,7 +817,6 @@ class Corpus(HashidMixin, db.Model): title = db.Column(db.String(32)) num_analysis_sessions = db.Column(db.Integer, default=0) num_tokens = db.Column(db.Integer, default=0) - archive_file = db.Column(db.String(255)) # Backrefs: user: User # Relationships files = db.relationship( diff --git a/app/services/forms.py b/app/services/forms.py index 4a16ad4a..21db025e 100644 --- a/app/services/forms.py +++ b/app/services/forms.py @@ -1,5 +1,7 @@ -from app.models import TesseractOCRModel +from app.models import Job, TesseractOCRModel +from flask_login import current_user from flask_wtf import FlaskForm +from flask_wtf.file import FileField, FileAllowed, FileRequired from wtforms import ( BooleanField, MultipleFileField, @@ -8,110 +10,143 @@ from wtforms import ( SubmitField, ValidationError ) -from wtforms.validators import DataRequired, Length +from wtforms.validators import DataRequired, InputRequired, Length from 
. import SERVICES class AddJobForm(FlaskForm): - description = StringField('Description', validators=[DataRequired(), Length(1, 255)]) # noqa + description = StringField('Description', validators=[InputRequired()]) # noqa submit = SubmitField() - title = StringField('Title', validators=[DataRequired(), Length(1, 32)]) + title = StringField('Title', validators=[InputRequired()]) version = SelectField('Version', validators=[DataRequired()]) + def validate_description(self, field): + max_length = Job.description.property.columns[0].type.length + if len(field.data) > max_length: + raise ValidationError( + f'Description must be less than {max_length} characters' + ) -class AddSpacyNLPJobForm(AddJobForm): - encoding_detection = BooleanField('Encoding detection') - files = MultipleFileField('Files', validators=[DataRequired()]) - model = SelectField( - 'Model', - choices=[('', 'Choose your option')], - default='', - validators=[DataRequired()] - ) + def validate_title(self, field): + max_length = Job.title.property.columns[0].type.length + if len(field.data) > max_length: + raise ValidationError( + f'Title must be less than {max_length} characters' + ) - def validate_encoding_detection(self, field): - service_info = SERVICES['spacy-nlp']['versions'][self.version.data] - if field.data and 'encoding_detection' not in service_info['methods']: - raise ValidationError('Encoding detection is not available') - def validate_files(form, field): - valid_extensions = ['.txt'] - for file in field.data: - if not file.filename.lower().endswith(tuple(valid_extensions)): - raise ValidationError( - 'File does not have an approved extension: ' - '/'.join(valid_extensions) - ) +class AddFileSetupPipelineJobForm(AddJobForm): + images = MultipleFileField('File(s)', validators=[DataRequired()]) + + def validate_images(form, field): + valid_mimetypes = ['image/jpeg', 'image/png', 'image/tiff'] + for image in field.data: + if image.mimetype not in valid_mimetypes: + raise ValidationError('JPEG, 
PNG and TIFF files only!') def __init__(self, *args, **kwargs): - version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version']) # noqa + service_manifest = SERVICES['file-setup-pipeline'] + version = kwargs.pop('version', service_manifest['latest_version']) super().__init__(*args, **kwargs) - service_info = SERVICES['spacy-nlp']['versions'][version] - if 'encoding_detection' not in service_info['methods']: - self.encoding_detection.render_kw = {'disabled': True} - self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa - self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']] # noqa - self.version.default = version + self.version.choices = [(x, x) for x in service_manifest['versions']] + self.version.data = version + self.version.default = service_manifest['latest_version'] -class AddTesseractOCRJobForm(AddJobForm): +class AddTesseractOCRPipelineJobForm(AddJobForm): binarization = BooleanField('Binarization') - files = MultipleFileField('Files', validators=[DataRequired()]) - model = SelectField( - 'Model', - choices=[('', 'Choose your option')], - default='', - validators=[DataRequired()] - ) + pdf = FileField('File', validators=[FileRequired()]) + model = SelectField('Model', validators=[DataRequired()]) def validate_binarization(self, field): - service_info = SERVICES['tesseract-ocr']['versions'][self.version.data] + service_info = SERVICES['tesseract-ocr-pipeline']['versions'][self.version.data] if field.data and 'binarization' not in service_info['methods']: raise ValidationError('Binarization is not available') - def validate_files(self, field): - valid_extensions = ['.pdf'] - for file in field.data: - if not file.filename.lower().endswith(tuple(valid_extensions)): - raise ValidationError( - 'File does not have an approved extension: ' - '/'.join(valid_extensions) - ) + def validate_pdf(self, field): + if field.data.mimetype != 'application/pdf': + raise ValidationError('PDF files only!') def 
__init__(self, *args, **kwargs): - version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version']) # noqa + service_manifest = SERVICES['tesseract-ocr-pipeline'] + version = kwargs.pop('version', service_manifest['latest_version']) super().__init__(*args, **kwargs) - service_info = SERVICES['tesseract-ocr']['versions'][version] + service_info = service_manifest['versions'][version] if 'binarization' not in service_info['methods']: self.binarization.render_kw = {'disabled': True} - self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()] # noqa - self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']] # noqa + compatible_models = [ + x for x in TesseractOCRModel.query.filter_by(shared=True).all() + if version in x.compatible_service_versions + ] + compatible_models += [ + x for x in TesseractOCRModel.query.filter_by(shared=False, user=current_user).all() + if version in x.compatible_service_versions + ] + self.model.choices = [('', 'Choose your option')] + self.model.choices += [(x.hashid, x.title) for x in compatible_models] + self.model.default = '' + self.version.choices = [(x, x) for x in service_manifest['versions']] self.version.data = version - self.version.default = SERVICES['tesseract-ocr']['latest_version'] + self.version.default = service_manifest['latest_version'] -class AddFileSetupJobForm(AddJobForm): - files = MultipleFileField('Files', validators=[DataRequired()]) +class AddTranskribusHTRPipelineJobForm(AddJobForm): + binarization = BooleanField('Binarization') + pdf = FileField('File', validators=[FileRequired()]) + model = SelectField('Model', validators=[DataRequired()]) - def validate_files(form, field): - valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif'] - for file in field.data: - if not file.filename.lower().endswith(tuple(valid_extensions)): - raise ValidationError( - 'File does not have an approved extension: ' - '/'.join(valid_extensions) - ) + def 
validate_binarization(self, field): + service_info = SERVICES['transkribus-htr-pipeline']['versions'][self.version.data] + if field.data and 'binarization' not in service_info['methods']: + raise ValidationError('Binarization is not available') + + def validate_pdf(self, field): + if field.data.mimetype != 'application/pdf': + raise ValidationError('PDF files only!') def __init__(self, *args, **kwargs): - version = kwargs.pop('version', SERVICES['file-setup']['latest_version']) # noqa + service_manifest = SERVICES['transkribus-htr-pipeline'] + version = kwargs.pop('version', service_manifest['latest_version']) super().__init__(*args, **kwargs) - self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']] # noqa + service_info = service_manifest['versions'][version] + if 'binarization' not in service_info['methods']: + self.binarization.render_kw = {'disabled': True} + self.model.choices = [('', 'Choose your option')] + self.model.choices += [ + ('37569', 'Tim Model'), + ('29539', 'UCL–University of Toronto #7') + ] + self.model.default = '' + self.version.choices = [(x, x) for x in service_manifest['versions']] self.version.data = version - self.version.default = SERVICES['file-setup']['latest_version'] + self.version.default = service_manifest['latest_version'] -AddJobForms = { - 'file-setup': AddFileSetupJobForm, - 'tesseract-ocr': AddTesseractOCRJobForm, - 'spacy-nlp': AddSpacyNLPJobForm -} +class AddSpacyNLPPipelineJobForm(AddJobForm): + encoding_detection = BooleanField('Encoding detection') + txt = FileField('File', validators=[FileRequired()]) + model = SelectField('Model', validators=[DataRequired()]) + + def validate_encoding_detection(self, field): + service_manifest = SERVICES['spacy-nlp-pipeline'] + service_info = service_manifest['versions'][self.version.data] + if field.data and 'encoding_detection' not in service_info['methods']: + raise ValidationError('Encoding detection is not available!') + + def validate_txt(form, field): + if 
field.data.mimetype != 'text/plain': + raise ValidationError('Plain text files only!') + + def __init__(self, *args, **kwargs): + service_manifest = SERVICES['spacy-nlp-pipeline'] + version = kwargs.pop('version', service_manifest['latest_version']) + super().__init__(*args, **kwargs) + service_info = service_manifest['versions'][version] + if 'encoding_detection' not in service_info['methods']: + self.encoding_detection.render_kw = {'disabled': True} + self.model.choices = [('', 'Choose your option')] + self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa + self.model.default = '' + self.version.choices = [(x, x) for x in service_manifest['versions']] + self.version.data = version + self.version.default = version diff --git a/app/services/routes.py b/app/services/routes.py index 7a8e520e..feecf39a 100644 --- a/app/services/routes.py +++ b/app/services/routes.py @@ -13,47 +13,33 @@ from flask_login import current_user, login_required from werkzeug.utils import secure_filename from . import bp from . 
import SERVICES -from .forms import AddJobForms +from .forms import ( + AddFileSetupPipelineJobForm, + AddTesseractOCRPipelineJobForm, + AddTranskribusHTRPipelineJobForm, + AddSpacyNLPPipelineJobForm +) import json -@bp.route('/corpus-analysis') +@bp.route('/file-setup-pipeline', methods=['GET', 'POST']) @login_required -def corpus_analysis(): - return render_template( - 'services/corpus_analysis.html.j2', - title='Corpus analysis' - ) - - -@bp.route('/', methods=['GET', 'POST']) -@login_required -def service(service): - # Check if the requested service exist - if service not in SERVICES or service not in AddJobForms: +def file_setup_pipeline(): + service = 'file-setup-pipeline' + service_manifest = SERVICES[service] + version = request.args.get('version', service_manifest['latest_version']) + if version not in service_manifest['versions']: abort(404) - version = request.args.get('version', SERVICES[service]['latest_version']) - if version not in SERVICES[service]['versions']: - abort(404) - form = AddJobForms[service](prefix='add-job-form', version=version) - title = SERVICES[service]['name'] + form = AddFileSetupPipelineJobForm(prefix='add-job-form', version=version) if form.is_submitted(): if not form.validate(): return make_response(form.errors, 400) service_args = {} - if service == 'spacy-nlp': - service_args['model'] = form.model.data - if form.encoding_detection.data: - service_args['encoding_detection'] = True - if service == 'tesseract-ocr': - service_args['model'] = hashids.decode(form.model.data) - if form.binarization.data: - service_args['binarization'] = True job = Job( user=current_user, description=form.description.data, service=service, - service_args=json.dumps(service_args), + service_args=service_args, service_version=form.version.data, title=form.title.data ) @@ -67,18 +53,17 @@ def service(service): db.session.rollback() flash('Internal Server Error', 'error') return make_response({'redirect_url': url_for('.service', service=service)}, 500) # 
noqa - for file in form.files.data: - filename = secure_filename(file.filename) + for image_file in form.images.data: job_input = JobInput( - filename=filename, + filename=secure_filename(image_file.filename), job=job, - mimetype=file.mimetype + mimetype=image_file.mimetype ) db.session.add(job_input) db.session.flush(objects=[job_input]) db.session.refresh(job_input) try: - file.save(job_input.path) + image_file.save(job_input.path) except OSError as e: current_app.logger.error(e) db.session.rollback() @@ -91,5 +76,196 @@ def service(service): return render_template( f'services/{service.replace("-", "_")}.html.j2', form=form, - title=title + title=service_manifest['name'] ) + + +@bp.route('/tesseract-ocr-pipeline', methods=['GET', 'POST']) +@login_required +def tesseract_ocr_pipeline(): + service = 'tesseract-ocr-pipeline' + service_manifest = SERVICES[service] + version = request.args.get('version', service_manifest['latest_version']) + if version not in service_manifest['versions']: + abort(404) + form = AddTesseractOCRPipelineJobForm(prefix='add-job-form', version=version) + if form.is_submitted(): + if not form.validate(): + return make_response(form.errors, 400) + service_args = {} + service_args['model'] = hashids.decode(form.model.data) + if form.binarization.data: + service_args['binarization'] = True + job = Job( + user=current_user, + description=form.description.data, + service=service, + service_args=service_args, + service_version=form.version.data, + title=form.title.data + ) + db.session.add(job) + db.session.flush(objects=[job]) + db.session.refresh(job) + try: + job.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job_input = JobInput( + filename=secure_filename(form.pdf.data.filename), + job=job, + mimetype=form.pdf.data.mimetype + ) + db.session.add(job_input) + 
db.session.flush(objects=[job_input]) + db.session.refresh(job_input) + try: + form.pdf.data.save(job_input.path) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job.status = JobStatus.SUBMITTED + db.session.commit() + flash(f'Job "{job.title}" added', 'job') + return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa + return render_template( + f'services/{service.replace("-", "_")}.html.j2', + form=form, + title=service_manifest['name'] + ) + + +@bp.route('/transkribus-htr-pipeline', methods=['GET', 'POST']) +@login_required +def transkribus_htr_pipeline(): + if not current_app.config.get('NOPAQUE_TRANSKRIBUS_ENABLED'): + abort(404) + service = 'transkribus-htr-pipeline' + service_manifest = SERVICES[service] + version = request.args.get('version', service_manifest['latest_version']) + if version not in service_manifest['versions']: + abort(404) + form = AddTranskribusHTRPipelineJobForm(prefix='add-job-form', version=version) + if form.is_submitted(): + if not form.validate(): + return make_response(form.errors, 400) + service_args = {} + service_args['model'] = form.model.data + if form.binarization.data: + service_args['binarization'] = True + job = Job( + user=current_user, + description=form.description.data, + service=service, + service_args=service_args, + service_version=form.version.data, + title=form.title.data + ) + db.session.add(job) + db.session.flush(objects=[job]) + db.session.refresh(job) + try: + job.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job_input = JobInput( + filename=secure_filename(form.pdf.data.filename), + job=job, + mimetype=form.pdf.data.mimetype + ) + 
db.session.add(job_input) + db.session.flush(objects=[job_input]) + db.session.refresh(job_input) + try: + form.pdf.data.save(job_input.path) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job.status = JobStatus.SUBMITTED + db.session.commit() + flash(f'Job "{job.title}" added', 'job') + return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa + return render_template( + f'services/{service.replace("-", "_")}.html.j2', + form=form, + title=service_manifest['name'] + ) + + +@bp.route('/spacy-nlp-pipeline', methods=['GET', 'POST']) +@login_required +def spacy_nlp_pipeline(): + service = 'spacy-nlp-pipeline' + service_manifest = SERVICES[service] + version = request.args.get('version', SERVICES[service]['latest_version']) + if version not in service_manifest['versions']: + abort(404) + form = AddSpacyNLPPipelineJobForm(prefix='add-job-form', version=version) + if form.is_submitted(): + if not form.validate(): + return make_response(form.errors, 400) + service_args = {} + service_args['model'] = form.model.data + if form.encoding_detection.data: + service_args['encoding_detection'] = True + job = Job( + user=current_user, + description=form.description.data, + service=service, + service_args=service_args, + service_version=form.version.data, + title=form.title.data + ) + db.session.add(job) + db.session.flush(objects=[job]) + db.session.refresh(job) + try: + job.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job_input = JobInput( + filename=secure_filename(form.txt.data.filename), + job=job, + mimetype=form.txt.data.mimetype + ) + db.session.add(job_input) + db.session.flush(objects=[job_input]) + 
db.session.refresh(job_input) + try: + form.txt.data.save(job_input.path) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job.status = JobStatus.SUBMITTED + db.session.commit() + flash(f'Job "{job.title}" added', 'job') + return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa + return render_template( + f'services/{service.replace("-", "_")}.html.j2', + form=form, + title=service_manifest['name'] + ) + + +@bp.route('/corpus-analysis') +@login_required +def corpus_analysis(): + return render_template( + 'services/corpus_analysis.html.j2', + title='Corpus analysis' + ) \ No newline at end of file diff --git a/app/services/services.yml b/app/services/services.yml index 0c82c3d9..c26f7fb7 100644 --- a/app/services/services.yml +++ b/app/services/services.yml @@ -1,38 +1,70 @@ # TODO: This could also be done via GitLab/GitHub APIs -#file-setup-pipeline: -file-setup: +file-setup-pipeline: name: 'File setup pipeline' + publisher: 'Bielefeld University - CRC 1288 - INF' latest_version: '0.1.0' versions: 0.1.0: - publisher: 'Bielefeld University - CRC 1288 - INF' publishing_year: 2022 - url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0' -#spacy-nlp-pipeline: -spacy-nlp: - name: 'spaCy NLP' + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline/-/releases/v0.1.0' +tesseract-ocr-pipeline: + name: 'Tesseract OCR Pipeline' + publisher: 'Bielefeld University - CRC 1288 - INF' + latest_version: '0.1.4' + versions: + 0.1.0: + methods: + - 'binarization' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.0' + 0.1.1: + methods: + - 'binarization' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.1' + 0.1.2: + 
methods: + - 'binarization' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.2' + 0.1.3: + methods: + - 'binarization' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.3' + 0.1.4: + methods: + - 'binarization' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.4' +transkribus-htr-pipeline: + name: 'Transkribus HTR Pipeline' + publisher: 'Bielefeld University - CRC 1288 - INF' + latest_version: '0.1.0' + versions: + 0.1.0: + methods: + - 'binarization' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/transkribus-htr-pipeline/-/releases/v0.1.0' +spacy-nlp-pipeline: + name: 'spaCy NLP Pipeline' + publisher: 'Bielefeld University - CRC 1288 - INF' latest_version: '0.1.0' versions: 0.1.0: methods: - 'encoding_detection' models: + ca: 'Catalan' de: 'German' + el: 'Greek' en: 'English' + es: 'Spanish' + fr: 'French' it: 'Italian' pl: 'Polish' + ru: 'Russian' zh: 'Chinese' - publisher: 'Bielefeld University - CRC 1288 - INF' publishing_year: 2022 - url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0' -#tesseract-ocr-pipeline: -tesseract-ocr: - name: 'Tesseract OCR' - latest_version: '0.1.0' - versions: - 0.1.0: - methods: - - 'binarization' - publisher: 'Bielefeld University - CRC 1288 - INF' - publishing_year: 2022 - url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0' + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.0' \ No newline at end of file diff --git a/app/static/css/colors.scss b/app/static/css/colors.scss index 230efe4b..0dfe27d5 100644 --- a/app/static/css/colors.scss +++ b/app/static/css/colors.scss @@ -28,20 +28,25 @@ $color: ( "darken": #6b3f89, "lighten": #ebe8f6 ), - "file-setup": ( + "file-setup-pipeline": ( "base": #d5dc95, "darken": #a1b300, 
"lighten": #f2f3e1 ), - "spacy-nlp": ( + "spacy-nlp-pipeline": ( "base": #98acd2, "darken": #0064a3, "lighten": #e5e8f5 ), - "tesseract-ocr": ( + "tesseract-ocr-pipeline": ( "base": #a9d8c8, "darken": #00a58b, "lighten": #e7f4f1 + ), + "transkribus-htr-pipeline": ( + "base": #607d8b, + "darken": #37474f, + "lighten": #cfd8dc ) ), "status": ( diff --git a/app/static/css/style.css b/app/static/css/style.css index 92830725..0a6c575a 100644 --- a/app/static/css/style.css +++ b/app/static/css/style.css @@ -43,9 +43,10 @@ h1 .nopaque-icons, h2 .nopaque-icons, h3 .nopaque-icons, h4 .nopaque-icons, .tab .job-status-text {text-transform: lowercase;} .job-status-text[data-job-status]:empty:before {content: attr(data-job-status);} -.nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";} -.nopaque-icons.service-icon[data-service="tesseract-ocr"]:empty:before {content: "F";} -.nopaque-icons.service-icon[data-service="spacy-nlp"]:empty:before {content: "G";} +.nopaque-icons.service-icon[data-service="file-setup-pipeline"]:empty:before {content: "E";} +.nopaque-icons.service-icon[data-service="tesseract-ocr-pipeline"]:empty:before {content: "F";} +.nopaque-icons.service-icon[data-service="transkribus-htr-pipeline"]:empty:before {content: "F";} +.nopaque-icons.service-icon[data-service="spacy-nlp-pipeline"]:empty:before {content: "G";} .nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";} .hoverable {cursor: pointer;} diff --git a/app/templates/_roadmap.html.j2 b/app/templates/_roadmap.html.j2 index e8265ba0..3e87d607 100644 --- a/app/templates/_roadmap.html.j2 +++ b/app/templates/_roadmap.html.j2 @@ -3,11 +3,13 @@

Roadmap

The roadmap guides you through nopaque's workflow! If you have the necessary input file formats, you can directly jump into the corresponding process. If not, you can use the roadmap to jump right to the preceding process.

    -
  • File setup
  • +
  • File setup
  • navigate_next
  • -
  • OCR
  • +
  • OCR
  • +
  • more_vert
  • +
  • HTR
  • navigate_next
  • -
  • NLP
  • +
  • NLP
  • navigate_next
  • Add corpus
  • navigate_next
  • diff --git a/app/templates/_sidenav.html.j2 b/app/templates/_sidenav.html.j2 index 8729f4f8..c246b6bd 100644 --- a/app/templates/_sidenav.html.j2 +++ b/app/templates/_sidenav.html.j2 @@ -14,10 +14,13 @@
  • JMy Jobs
  • Processes & Services
  • -
  • File setup
  • -
  • OCR
  • -
  • NLP
  • -
  • Corpus analysis
  • +
  • File setup
  • +
  • OCR
  • + {% if config.NOPAQUE_TRANSKRIBUS_ENABLED %} +
  • HTR
  • + {% endif %} +
  • NLP
  • +
  • Corpus analysis
  • Account
  • settingsSettings
  • diff --git a/app/templates/main/dashboard.html.j2 b/app/templates/main/dashboard.html.j2 index 7bac5c03..00695d69 100644 --- a/app/templates/main/dashboard.html.j2 +++ b/app/templates/main/dashboard.html.j2 @@ -115,37 +115,37 @@

    - - + +

    -

    File setup

    +

    File setup

Digital copies of text-based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.

    - Create Job + Create Job

    - - + +

    -

    Optical Character Recognition

    +

    Optical Character Recognition

    nopaque converts your image data – like photos or scans – into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.

    - Create Job + Create Job

    - - + +

    -

    Natural Language Processing

    +

    Natural Language Processing

    By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.

    - Create Job + Create Job
    diff --git a/app/templates/main/faq.html.j2 b/app/templates/main/faq.html.j2 index 86dde843..29ac9271 100644 --- a/app/templates/main/faq.html.j2 +++ b/app/templates/main/faq.html.j2 @@ -35,9 +35,9 @@

    Our source code is spread over multiple Git repositories.

    diff --git a/app/templates/main/index.html.j2 b/app/templates/main/index.html.j2 index 80ebd02f..6cc285d0 100644 --- a/app/templates/main/index.html.j2 +++ b/app/templates/main/index.html.j2 @@ -76,31 +76,31 @@

     

    - - + +

    -

    File setup

    +

    File setup

Digital copies of text-based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.

    - - + +

    -

    Optical Character Recognition

    +

    Optical Character Recognition

nopaque converts your image data – like photos or scans – into text data through OCR, making it machine-readable. This step enables you to proceed with further computational analysis of your documents.

    - - + +

    -

    Natural Language Processing

    +

    Natural Language Processing

    By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.

    - +

    diff --git a/app/templates/services/_breadcrumbs.html.j2 b/app/templates/services/_breadcrumbs.html.j2 index d819fe60..a08beafa 100644 --- a/app/templates/services/_breadcrumbs.html.j2 +++ b/app/templates/services/_breadcrumbs.html.j2 @@ -2,13 +2,15 @@
  • navigate_next
  • Processes & Services
  • navigate_next
  • -{% if request.path == url_for('.service', service='corpus-analysis') %} -
  • {{ title }}
  • -{% elif request.path == url_for('.service', service='file-setup') %} -
  • {{ title }}
  • -{% elif request.path == url_for('.service', service='nlp') %} -
  • {{ title }}
  • -{% elif request.path == url_for('.service', service='ocr') %} -
  • {{ title }}
  • +{% if request.path == url_for('.corpus_analysis') %} +
  • {{ title }}
  • +{% elif request.path == url_for('.file_setup_pipeline') %} +
  • {{ title }}
  • +{% elif request.path == url_for('.spacy_nlp_pipeline') %} +
  • {{ title }}
  • +{% elif request.path == url_for('.tesseract_ocr_pipeline') %} +
  • {{ title }}
  • +{% elif request.path == url_for('.transkribus_htr_pipeline') %} +
  • {{ title }}
  • {% endif %} {% endset %} diff --git a/app/templates/services/file_setup.html.j2 b/app/templates/services/file_setup_pipeline.html.j2 similarity index 89% rename from app/templates/services/file_setup.html.j2 rename to app/templates/services/file_setup_pipeline.html.j2 index 9e70288c..ce3e21e8 100644 --- a/app/templates/services/file_setup.html.j2 +++ b/app/templates/services/file_setup_pipeline.html.j2 @@ -2,7 +2,7 @@ {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% import "materialize/wtf.html.j2" as wtf %} -{% block main_attribs %} class="service-scheme" data-service="file-setup"{% endblock main_attribs %} +{% block main_attribs %} class="service-scheme" data-service="file-setup-pipeline"{% endblock main_attribs %} {% block page_content %}
    @@ -16,13 +16,13 @@

     

     

    - +
    -
    +
    @@ -50,7 +50,7 @@ {{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
    - {{ wtf.render_field(form.files, accept='image/jpeg, image/png, image/tiff', placeholder='Choose your .jpeg, .png or .tiff files') }} + {{ wtf.render_field(form.images, accept='image/jpeg, image/png, image/tiff', placeholder='Choose JPEG, PNG or TIFF files') }}
    {{ wtf.render_field(form.version, material_icon='apps') }} diff --git a/app/templates/services/spacy_nlp.html.j2 b/app/templates/services/spacy_nlp_pipeline.html.j2 similarity index 95% rename from app/templates/services/spacy_nlp.html.j2 rename to app/templates/services/spacy_nlp_pipeline.html.j2 index 30fab84c..5c911cd9 100644 --- a/app/templates/services/spacy_nlp.html.j2 +++ b/app/templates/services/spacy_nlp_pipeline.html.j2 @@ -2,7 +2,7 @@ {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% import "materialize/wtf.html.j2" as wtf %} -{% block main_attribs %} class="service-scheme" data-service="spacy-nlp"{% endblock main_attribs %} +{% block main_attribs %} class="service-scheme" data-service="spacy-nlp-pipeline"{% endblock main_attribs %} {% block page_content %}
    @@ -16,13 +16,13 @@

     

     

    - +
    -
    +
    @@ -68,7 +68,7 @@ {{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
    - {{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }} + {{ wtf.render_field(form.txt, accept='text/plain', placeholder='Choose a plain text file') }}
    {{ wtf.render_field(form.model, material_icon='language') }} diff --git a/app/templates/services/tesseract_ocr.html.j2 b/app/templates/services/tesseract_ocr_pipeline.html.j2 similarity index 95% rename from app/templates/services/tesseract_ocr.html.j2 rename to app/templates/services/tesseract_ocr_pipeline.html.j2 index 66121281..c14a8c71 100644 --- a/app/templates/services/tesseract_ocr.html.j2 +++ b/app/templates/services/tesseract_ocr_pipeline.html.j2 @@ -2,7 +2,7 @@ {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% import "materialize/wtf.html.j2" as wtf %} -{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr"{% endblock main_attribs %} +{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr-pipeline"{% endblock main_attribs %} {% block page_content %}
    @@ -16,13 +16,13 @@

     

     

    - +
    -
    +
    @@ -50,7 +50,7 @@ {{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
    - {{ wtf.render_field(form.files, accept='application/pdf', placeholder='Choose your .pdf files') }} + {{ wtf.render_field(form.pdf, accept='application/pdf', placeholder='Choose a PDF file') }}
    {{ wtf.render_field(form.model, material_icon='language') }} diff --git a/app/templates/services/transkribus_htr_pipeline.html.j2 b/app/templates/services/transkribus_htr_pipeline.html.j2 new file mode 100644 index 00000000..f91d0bf5 --- /dev/null +++ b/app/templates/services/transkribus_htr_pipeline.html.j2 @@ -0,0 +1,169 @@ +{% extends "base.html.j2" %} +{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} +{% import "materialize/wtf.html.j2" as wtf %} + +{% block main_attribs %} class="service-scheme" data-service="transkribus-htr-pipeline"{% endblock main_attribs %} + +{% block page_content %} +
    +
    +
    +

    {{ title }}

    +
    + +
    +
    +

     

    +

     

    + + + +
    +
    + +
    +
    +
    +
    +
    +
    + layersHTR +

    In this process, nopaque converts your image data – like photos or scans – into text data. This step enables you to proceed with the computational analysis of your documents.

    +

    + + Logoo_Transkribus_web + +

    +
    +
    +
    +
    +
    +
    + +
    +

    Submit a job

    +
    +
    +
    + {{ form.hidden_tag() }} +
    +
    + {{ wtf.render_field(form.title, data_length='32', material_icon='title') }} +
    +
    + {{ wtf.render_field(form.description, data_length='255', material_icon='description') }} +
    +
    + {{ wtf.render_field(form.pdf, accept='application/pdf', placeholder='Choose a PDF file') }} +
    +
    + {{ wtf.render_field(form.model, material_icon='language') }} +
    +
    + {{ wtf.render_field(form.version, material_icon='apps') }} +
    +
    + Preprocessing +
    +
    +

    {{ form.binarization.label.text }}

    +

Based on a brightness threshold, pixels are converted into either black or white. It is useful to reduce noise in images. (longer duration)

    +
    +
    +
    + +
    +
    +

     

    +
    +

     

    +
    +

    Page range

    +

    +
    +
    +
    + +
    +
    +

     

    +
    +

     

    +
    +

    Page rotation

    +

    +
    +
    +
    + +
    +
    +

     

    +
    +

     

    +
    +

    Page split

    +

    +
    +
    +
    + +
    +
    + +
    +
    +
    + {{ wtf.render_field(form.submit, material_icon='send') }} +
    +
    +
    +
    +
    +
    +{% endblock page_content %} + +{% block modals %} +{{ super() }} + +{% endblock modals %} + +{% block scripts %} +{{ super() }} + +{% endblock scripts %} diff --git a/config.py b/config.py index c746595a..07ec1d78 100644 --- a/config.py +++ b/config.py @@ -92,6 +92,11 @@ class Config: NOPAQUE_PROXY_FIX_X_PROTO = \ int(os.environ.get('NOPAQUE_PROXY_FIX_X_PROTO', '0')) + NOPAQUE_TRANSKRIBUS_ENABLED = \ + os.environ.get('NOPAQUE_TRANSKRIBUS_ENABLED', 'true').lower() == 'true' + NOPAQUE_READCOOP_USERNAME = os.environ.get('NOPAQUE_READCOOP_USERNAME') + NOPAQUE_READCOOP_PASSWORD = os.environ.get('NOPAQUE_READCOOP_PASSWORD') + @classmethod def init_app(cls, app: Flask): # Set up logging according to the corresponding (NOPAQUE_LOG_*) diff --git a/docker-compose.traefik.yml b/docker-compose.traefik.yml index a96b3b1a..d0f7f4d3 100644 --- a/docker-compose.traefik.yml +++ b/docker-compose.traefik.yml @@ -18,13 +18,15 @@ services: - "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http" - "traefik.http.routers.http-nopaque.entrypoints=http" - "traefik.http.routers.http-nopaque.middlewares=http-nopaque-headers, redirect-to-https@file" - - "traefik.http.routers.http-nopaque.rule=Host(`${SERVER_NAME}`)" + # Replace with your domain + - "traefik.http.routers.http-nopaque.rule=Host(``)" ### ### ### ### - "traefik.http.middlewares.https-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=https" - "traefik.http.routers.https-nopaque.entrypoints=https" - "traefik.http.routers.https-nopaque.middlewares=hsts-header@file, https-nopaque-headers" - - "traefik.http.routers.https-nopaque.rule=Host(`${SERVER_NAME}`)" + # Replace with your domain + - "traefik.http.routers.https-nopaque.rule=Host(``)" - "traefik.http.routers.https-nopaque.tls.certresolver=" - "traefik.http.routers.https-nopaque.tls.options=intermediate@file" ### ### diff --git a/migrations/versions/097aae1f02d7_.py b/migrations/versions/aa855b80cf1d_.py similarity 
index 97% rename from migrations/versions/097aae1f02d7_.py rename to migrations/versions/aa855b80cf1d_.py index ccac6756..687c89a8 100644 --- a/migrations/versions/097aae1f02d7_.py +++ b/migrations/versions/aa855b80cf1d_.py @@ -1,8 +1,8 @@ """empty message -Revision ID: 097aae1f02d7 +Revision ID: aa855b80cf1d Revises: -Create Date: 2022-02-08 10:02:03.748588 +Create Date: 2022-04-01 12:14:42.606685 """ from alembic import op @@ -10,7 +10,7 @@ import sqlalchemy as sa # revision identifiers, used by Alembic. -revision = '097aae1f02d7' +revision = 'aa855b80cf1d' down_revision = None branch_labels = None depends_on = None @@ -56,7 +56,6 @@ def upgrade(): sa.Column('title', sa.String(length=32), nullable=True), sa.Column('num_analysis_sessions', sa.Integer(), nullable=True), sa.Column('num_tokens', sa.Integer(), nullable=True), - sa.Column('archive_file', sa.String(length=255), nullable=True), sa.ForeignKeyConstraint(['user_id'], ['users.id'], ), sa.PrimaryKeyConstraint('id') ) @@ -85,6 +84,7 @@ def upgrade(): sa.Column('description', sa.String(length=255), nullable=True), sa.Column('publisher', sa.String(length=128), nullable=True), sa.Column('publishing_year', sa.Integer(), nullable=True), + sa.Column('shared', sa.Boolean(), nullable=True), sa.Column('title', sa.String(length=64), nullable=True), sa.Column('version', sa.String(length=16), nullable=True), sa.ForeignKeyConstraint(['user_id'], ['users.id'], ), diff --git a/requirements.txt b/requirements.txt index ce5a4bfb..038611ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ cqi docker eventlet==0.30.2 Flask==1.1.4 +Flask-APScheduler Flask-Assets Flask-Hashids Flask-HTTPAuth @@ -16,6 +17,7 @@ Flask-WTF gunicorn hiredis jsonschema +MarkupSafe==2.0.1 psycopg2 pyScss python-dotenv