From fe938c0ca2ab77a76619f322845b11f0137c2a0f Mon Sep 17 00:00:00 2001
From: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
Date: Thu, 3 Feb 2022 12:39:16 +0100
Subject: [PATCH] Big update, corpus analysis reworked, versioned services,
 preliminary work for contributions

---
 app/TesseractOCRModel.defaults.yml            | 816 ++++++++++++++++++
 app/__init__.py                               |   6 +-
 app/api/__init__.py                           |   2 -
 app/api/auth.py                               |   8 +-
 app/api/jobs.py                               |  48 --
 app/auth/routes.py                            |  37 +-
 app/cli.py                                    |  88 +-
 app/contribute/__init__.py                    |   5 +
 app/contribute/routes.py                      |  19 +
 app/corpora/cqi_over_socketio/__init__.py     |   6 +-
 app/corpora/cqi_over_socketio/utils.py        |   5 +-
 app/corpora/routes.py                         |  43 +-
 app/daemon/__init__.py                        |  10 +-
 app/daemon/corpus_utils.py                    | 128 +--
 app/daemon/job_utils.py                       | 105 ++-
 app/jobs/routes.py                            |  24 +-
 app/models.py                                 | 174 +++-
 app/services/__init__.py                      |  78 +-
 app/services/forms.py                         | 103 ++-
 app/services/routes.py                        |  82 +-
 app/services/services.yml                     |  38 +
 app/static/css/nopaque.css                    |   4 +-
 .../js/CorpusAnalysis/CorpusAnalysisApp.js    |   2 +-
 app/static/js/RessourceDisplays/JobDisplay.js |   5 +-
 app/static/js/RessourceLists/JobResultList.js |  17 +-
 app/templates/_colors.html.j2                 |   4 +-
 app/templates/_sidenav.html.j2                |   7 +-
 app/templates/main/dashboard.html.j2          |  18 +-
 app/templates/main/index.html.j2              |   6 +-
 .../{nlp.html.j2 => spacy_nlp.html.j2}        |  12 +-
 .../{ocr.html.j2 => tesseract_ocr.html.j2}    |  12 +-
 app/utils.py                                  |  10 -
 docker-compose.traefik.yml                    |   8 +-
 migrations/versions/ad0d835fe5b1_.py          |  45 +
 nopaque.py                                    |   6 +-
 requirements.txt                              |   2 +
 36 files changed, 1552 insertions(+), 431 deletions(-)
 create mode 100644 app/TesseractOCRModel.defaults.yml
 delete mode 100644 app/api/jobs.py
 create mode 100644 app/contribute/__init__.py
 create mode 100644 app/contribute/routes.py
 create mode 100644 app/services/services.yml
 rename app/templates/services/{nlp.html.j2 => spacy_nlp.html.j2} (93%)
 rename app/templates/services/{ocr.html.j2 => tesseract_ocr.html.j2} (93%)
 delete mode 100644 app/utils.py
 create mode 100644 migrations/versions/ad0d835fe5b1_.py

diff --git a/app/TesseractOCRModel.defaults.yml b/app/TesseractOCRModel.defaults.yml
new file mode 100644
index 00000000..37929e89
--- /dev/null
+++ b/app/TesseractOCRModel.defaults.yml
@@ -0,0 +1,816 @@
+# - title: 'Afrikaans'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/afr.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Amharic'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+- title: 'Arabic'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+# - title: 'Assamese'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Azerbaijani'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Azerbaijani - Cyrillic'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Belarusian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Bengali'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Tibetan'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Bosnian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Bulgarian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Catalan; Valencian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Cebuano'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Czech'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Chinese - Simplified'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+- title: 'Chinese - Traditional'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+# - title: 'Cherokee'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Welsh'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+- title: 'Danish'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+- title: 'German'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+# - title: 'Dzongkha'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+- title: 'Greek, Modern (1453-)'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+- title: 'English'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+- title: 'English, Middle (1100-1500)'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+# - title: 'Esperanto'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Estonian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Basque'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Persian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Finnish'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+- title: 'French'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+- title: 'German Fraktur'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+- title: 'French, Middle (ca. 1400-1600)'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+# - title: 'Irish'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Galician'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+- title: 'Greek, Ancient (-1453)'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+# - title: 'Gujarati'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Haitian; Haitian Creole'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Hebrew'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Hindi'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Croatian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Hungarian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Inuktitut'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Indonesian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Icelandic'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+- title: 'Italian'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+- title: 'Italian - Old'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+# - title: 'Javanese'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Japanese'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Kannada'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Georgian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Georgian - Old'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Kazakh'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Central Khmer'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Kirghiz; Kyrgyz'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Korean'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Kurdish'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Lao'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Latin'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Latvian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Lithuanian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Malayalam'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Marathi'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Macedonian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Maltese'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Malay'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Burmese'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Nepali'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Dutch; Flemish'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Norwegian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Oriya'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Panjabi; Punjabi'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Polish'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+- title: 'Portuguese'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+# - title: 'Pushto; Pashto'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Romanian; Moldavian; Moldovan'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+- title: 'Russian'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+# - title: 'Sanskrit'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Sinhala; Sinhalese'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Slovak'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Slovenian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+- title: 'Spanish; Castilian'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+- title: 'Spanish; Castilian - Old'
+  description: ''
+  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata'
+  publisher: 'tesseract-ocr'
+  publishing_year: 2021
+  version: '4.1.0'
+  compatible_service_versions:
+    - '0.1.0'
+# - title: 'Albanian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Serbian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Serbian - Latin'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Swahili'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Swedish'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Syriac'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Tamil'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Telugu'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Tajik'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Tagalog'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Thai'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Tigrinya'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Turkish'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Uighur; Uyghur'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Ukrainian'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Urdu'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Uzbek'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Uzbek - Cyrillic'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Vietnamese'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
+# - title: 'Yiddish'
+#   description: ''
+#   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata'
+#   publisher: 'tesseract-ocr'
+#   publishing_year: 2021
+#   version: '4.1.0'
+#   compatible_service_versions:
+#     - '0.1.0'
diff --git a/app/__init__.py b/app/__init__.py
index 37b0961f..5c4052d2 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -39,9 +39,6 @@ def create_app(config: Config = Config) -> Flask:
     socketio.init_app(
         app, message_queue=app.config['NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI'])
 
-    # from .utils import HashidConverter
-    # app.url_map.converters['hashid'] = HashidConverter
-
     from .events import socketio as socketio_events
     from .events import sqlalchemy as sqlalchemy_events
 
@@ -54,6 +51,9 @@ def create_app(config: Config = Config) -> Flask:
     from .auth import bp as auth_blueprint
     app.register_blueprint(auth_blueprint, url_prefix='/auth')
 
+    from .contribute import bp as contribute_blueprint
+    app.register_blueprint(contribute_blueprint, url_prefix='/contribute')
+
     from .corpora import bp as corpora_blueprint
     app.register_blueprint(corpora_blueprint, url_prefix='/corpora')
 
diff --git a/app/api/__init__.py b/app/api/__init__.py
index f47235ea..e7674c87 100644
--- a/app/api/__init__.py
+++ b/app/api/__init__.py
@@ -1,7 +1,6 @@
 from flask import Blueprint
 from flask_restx import Api
 
-from .jobs import ns as jobs_ns
 from .tokens import ns as tokens_ns
 
 bp = Blueprint('api', __name__)
@@ -23,5 +22,4 @@ api = Api(
     version='1.0'
 )
 
-api.add_namespace(jobs_ns)
 api.add_namespace(tokens_ns)
diff --git a/app/api/auth.py b/app/api/auth.py
index 24e862ea..fea4123b 100644
--- a/app/api/auth.py
+++ b/app/api/auth.py
@@ -9,8 +9,12 @@ token_auth = HTTPTokenAuth()
 
 @basic_auth.verify_password
 def verify_password(email_or_username, password):
-    user = User.query.filter(or_(User.username == email_or_username,
-                                 User.email == email_or_username.lower())).first()
+    user = User.query.filter(
+        or_(
+            User.username == email_or_username,
+            User.email == email_or_username.lower()
+        )
+    ).first()
     if user and user.verify_password(password):
         return user
 
diff --git a/app/api/jobs.py b/app/api/jobs.py
deleted file mode 100644
index 153d5060..00000000
--- a/app/api/jobs.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from flask_restx import Namespace, Resource
-from .auth import token_auth
-from ..jobs import tasks
-from ..models import Job
-
-
-ns = Namespace('jobs', description='Job operations')
-
-
-@ns.route('')
-class API_Jobs(Resource):
-    '''Shows a list of all jobs and lets you POST to add new job'''
-
-    @ns.doc(security='apiKey')
-    @token_auth.login_required
-    def get(self):
-        '''List all jobs'''
-        # TODO: Implement the correct get_jobs functionality
-        jobs = Job.query.all()
-        return [job.to_dict(include_relationships=False) for job in jobs]
-
-    @ns.doc(security='apiKey')
-    @token_auth.login_required
-    def post(self):
-        '''Create a new job'''
-        # TODO: Implement this
-        pass
-
-
-@ns.route('/<hashid:id>')
-class API_Job(Resource):
-    '''Show a single job and lets you delete it'''
-
-    @ns.doc(security='apiKey')
-    @token_auth.login_required
-    def get(self, id):
-        '''Get a job by id'''
-        job = Job.query.get_or_404(id)
-        return job.to_dict(include_relationships=False)
-
-    @ns.doc(security='apiKey')
-    @token_auth.login_required
-    def delete(self, id):
-        '''Delete a job by id'''
-        job = Job.query.get_or_404(id)
-        # We use this imported task because it will run in the background
-        tasks.delete_job(job.id)
-        return '', 204
diff --git a/app/auth/routes.py b/app/auth/routes.py
index 2cda4bc2..35842251 100644
--- a/app/auth/routes.py
+++ b/app/auth/routes.py
@@ -60,28 +60,39 @@ def register():
         return redirect(url_for('main.dashboard'))
     form = RegistrationForm(prefix='registration-form')
     if form.validate_on_submit():
-        user = User(email=form.email.data.lower(),
-                    password=form.password.data,
-                    username=form.username.data)
+        user = User(
+            email=form.email.data.lower(),
+            password=form.password.data,
+            username=form.username.data
+        )
         db.session.add(user)
-        db.session.commit()
+        db.session.flush(objects=[user])
+        db.session.refresh(user)
         try:
-            os.makedirs(user.path)
-        except OSError:
-            current_app.logger.error(
-                f'Make dir {user.path} led to an OSError!')
-            db.session.delete(user)
-            db.session.commit()
+            user.makedirs()
+        except OSError as e:
+            current_app.logger.error(e)
+            db.session.rollback()
             abort(500)
         else:
+            # flush() only emits the INSERT; commit so the new user persists
+            db.session.commit()
             token = user.generate_confirmation_token()
-            msg = create_message(user.email, 'Confirm Your Account',
-                                 'auth/email/confirm', token=token, user=user)
+            msg = create_message(
+                user.email,
+                'Confirm Your Account',
+                'auth/email/confirm',
+                token=token,
+                user=user
+            )
             send(msg)
             flash('A confirmation email has been sent to you by email.')
             return redirect(url_for('.login'))
-    return render_template('auth/register.html.j2', form=form,
-                           title='Register')
+    return render_template(
+        'auth/register.html.j2',
+        form=form,
+        title='Register'
+    )
 
 
 @bp.route('/confirm/<token>')
diff --git a/app/cli.py b/app/cli.py
index d885ff12..e588eef9 100644
--- a/app/cli.py
+++ b/app/cli.py
@@ -1,16 +1,44 @@
-from . import db
-from .models import Corpus, Role
+from flask import current_app
 from flask_migrate import upgrade
+from . import db
+from .models import Corpus, Job, Role, User, TesseractOCRModel
+import json
+import os
+import re
+
+
+def _make_default_dirs():
+    '''
+    Create the "tmp" and "users" directories below NOPAQUE_DATA_DIR.
+    Raises NotADirectoryError if one of the paths exists as a non-directory.
+    '''
+    base_dir = current_app.config['NOPAQUE_DATA_DIR']
+    for dirname in ('tmp', 'users'):
+        directory = os.path.join(base_dir, dirname)
+        if os.path.isdir(directory):
+            continue
+        if os.path.exists(directory):
+            raise NotADirectoryError(f'{directory} is not a directory')
+        os.mkdir(directory)
 
 
 def register(app):
     @app.cli.command()
     def deploy():
         ''' Run deployment tasks. '''
+        # Make default directories
+        _make_default_dirs()
+
         # migrate database to latest revision
         upgrade()
-        # create or update user roles
-        Role.insert_roles()
+
+        # Insert/Update default database values
+        current_app.logger.info('Insert/Update default roles')
+        Role.insert_defaults()
+        current_app.logger.info('Insert/Update default users')
+        User.insert_defaults()
+        current_app.logger.info('Insert/Update default tesseract ocr models')
+        TesseractOCRModel.insert_defaults()
 
     @app.cli.group()
     def daemon():
@@ -40,3 +68,55 @@ def register(app):
         from unittest.suite import TestSuite
         tests: TestSuite = TestLoader().discover('tests')
         TextTestRunner(verbosity=2).run(tests)
+
+    @app.cli.group()
+    def convert():
+        ''' Database convert commands. '''
+
+    @convert.command()
+    def nlp_jobs():
+        ''' Convert legacy "nlp" jobs to the "spacy-nlp" service. '''
+        for job in Job.query.filter_by(service='nlp').all():
+            job.service = 'spacy-nlp'
+            new_service_args = {}
+            for service_arg in json.loads(job.service_args):
+                language_match = re.match(r'-l ([a-z]{2})', service_arg)
+                if service_arg == '--check-encoding':
+                    new_service_args['encoding_detection'] = True
+                elif language_match:
+                    new_service_args['language'] = language_match.group(1)
+            job.service_args = json.dumps(new_service_args)
+        db.session.commit()
+
+    @convert.command()
+    def ocr_jobs():
+        ''' Convert legacy "ocr" jobs to the "tesseract-ocr" service. '''
+        language_code_lookup = {  # language code -> TesseractOCRModel.title
+            'ara': 'Arabic',
+            'chi_tra': 'Chinese - Traditional',
+            'dan': 'Danish',
+            'eng': 'English',
+            'enm': 'English, Middle (1100-1500)',
+            'fra': 'French',
+            'frm': 'French, Middle (ca. 1400-1600)',
+            'deu': 'German',
+            'frk': 'German Fraktur',
+            'ell': 'Greek, Modern (1453-)',
+            'ita': 'Italian',
+            'por': 'Portuguese',
+            'rus': 'Russian',
+            'spa': 'Spanish; Castilian'
+        }
+        for job in Job.query.filter_by(service='ocr').all():
+            job.service = 'tesseract-ocr'
+            new_service_args = {}
+            for service_arg in json.loads(job.service_args):
+                match = re.match(r'-l ([a-z_]+)', service_arg)  # e.g. chi_tra
+                if service_arg == '--binarize':
+                    new_service_args['binarization'] = True
+                elif match:
+                    title = language_code_lookup[match.group(1)]
+                    model = TesseractOCRModel.query.filter_by(title=title).first()  # noqa
+                    new_service_args['model'] = model.id
+            job.service_args = json.dumps(new_service_args)
+        db.session.commit()
diff --git a/app/contribute/__init__.py b/app/contribute/__init__.py
new file mode 100644
index 00000000..15d172ec
--- /dev/null
+++ b/app/contribute/__init__.py
@@ -0,0 +1,5 @@
+from flask import Blueprint
+
+
+bp = Blueprint('contribute', __name__)
+from . import routes
diff --git a/app/contribute/routes.py b/app/contribute/routes.py
new file mode 100644
index 00000000..e0b43231
--- /dev/null
+++ b/app/contribute/routes.py
@@ -0,0 +1,19 @@
+from flask import flash, redirect, render_template, url_for
+from flask_login import login_required
+from . import bp
+from .. import db
+from ..decorators import permission_required
+from ..models import Permission, Role, User
+from ..settings import tasks as settings_tasks
+
+
+@bp.before_request
+@login_required
+@permission_required(Permission.CONTRIBUTE)
+def before_request():
+    ''' Require login and the CONTRIBUTE permission on every request. '''
+
+
+@bp.route('/')
+def index():
+    return '', 501  # Not implemented yet; a view must not return None
diff --git a/app/corpora/cqi_over_socketio/__init__.py b/app/corpora/cqi_over_socketio/__init__.py
index 14031c4a..3a358758 100644
--- a/app/corpora/cqi_over_socketio/__init__.py
+++ b/app/corpora/cqi_over_socketio/__init__.py
@@ -93,12 +93,12 @@ def connect(auth):
 
 @socketio.on('disconnect', namespace=NAMESPACE)
 def disconnect():
+    if 'd' not in session:
+        return
     session['d']['cqi_client_lock'].acquire()
     try:
         session['d']['cqi_client'].disconnect()
-    except cqi.errors.CQiException:
-        pass
-    except BrokenPipeError:
+    except (BrokenPipeError, cqi.errors.CQiException):
         pass
     session['d']['cqi_client_lock'].release()
     corpus = Corpus.query.get(session['d']['corpus_id'])
diff --git a/app/corpora/cqi_over_socketio/utils.py b/app/corpora/cqi_over_socketio/utils.py
index 7cbe07b9..9763548a 100644
--- a/app/corpora/cqi_over_socketio/utils.py
+++ b/app/corpora/cqi_over_socketio/utils.py
@@ -12,7 +12,10 @@ def cqi_over_socketio(f):
         f_args = {}
         # Check for missing args and if all provided args are of the right type
         for param in signature(f).parameters.values():
-            if param.annotation == cqi.CQiClient:
+            if param.name == 'corpus_name':
+                f_args[param.name] = f'NOPAQUE_{session["d"]["corpus_id"]}'
+                continue
+            if param.name == 'cqi_client':
                 f_args[param.name] = session['d']['cqi_client']
                 continue
             if param.default is param.empty:
diff --git a/app/corpora/routes.py b/app/corpora/routes.py
index 1086c298..f6d95b54 100644
--- a/app/corpora/routes.py
+++ b/app/corpora/routes.py
@@ -1,6 +1,7 @@
 from flask import (abort, current_app, flash, make_response, redirect,
                    render_template, url_for, send_from_directory)
 from flask_login import current_user, login_required
+from werkzeug.utils import secure_filename
 from . import bp
 from . import tasks
 from .forms import (AddCorpusFileForm, AddCorpusForm, EditCorpusFileForm,
@@ -29,18 +30,20 @@ def add_corpus():
         db.session.flush()
         db.session.refresh(corpus)
         try:
-            os.makedirs(corpus.path)
+            corpus.makedirs()
         except OSError as e:
-            current_app.logger.error(f'Could not add corpus: {e}')
+            current_app.logger.error(e)
             db.session.rollback()
             flash('Internal Server Error', 'error')
             abort(500)
-        else:
-            db.session.commit()
-            flash(f'Corpus "{corpus.title}" added!', 'corpus')
-            return redirect(url_for('.corpus', corpus_id=corpus.id))
-    return render_template('corpora/add_corpus.html.j2', form=form,
-                           title='Add corpus')
+        db.session.commit()
+        flash(f'Corpus "{corpus.title}" added', 'corpus')
+        return redirect(url_for('.corpus', corpus_id=corpus.id))
+    return render_template(
+        'corpora/add_corpus.html.j2',
+        form=form,
+        title='Add corpus'
+    )
 
 
 @bp.route('/import', methods=['GET', 'POST'])
@@ -174,7 +177,7 @@ def add_corpus_file(corpus_id):
         if not form.validate():
             return make_response(form.errors, 400)
         # Save the file
-        form.file.data.save(os.path.join(corpus.path, form.file.data.filename))
+        filename = secure_filename(form.file.data.filename)
         corpus_file = CorpusFile(
             address=form.address.data,
             author=form.author.data,
@@ -182,9 +185,10 @@ def add_corpus_file(corpus_id):
             chapter=form.chapter.data,
             corpus=corpus,
             editor=form.editor.data,
-            filename=form.file.data.filename,
+            filename=filename,
             institution=form.institution.data,
             journal=form.journal.data,
+            mimetype='application/vrt+xml',
             pages=form.pages.data,
             publisher=form.publisher.data,
             publishing_year=form.publishing_year.data,
@@ -192,12 +196,25 @@ def add_corpus_file(corpus_id):
             title=form.title.data
         )
         db.session.add(corpus_file)
+        db.session.flush(objects=[corpus_file])
+        db.session.refresh(corpus_file)
+        try:
+            form.file.data.save(corpus_file.path)
+        except OSError as e:
+            current_app.logger.error(e)
+            db.session.rollback()
+            flash('Internal Server Error', 'error')
+            return make_response({'redirect_url': url_for('.add_corpus_file', corpus_id=corpus.id)}, 500)  # noqa
         corpus.status = 'unprepared'
         db.session.commit()
-        flash(f'Corpus file "{corpus_file.filename}" added!', 'corpus')
+        flash(f'Corpus file "{corpus_file.title}" added!', 'corpus')
         return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)  # noqa
-    return render_template('corpora/add_corpus_file.html.j2', corpus=corpus,
-                           form=form, title='Add corpus file')
+    return render_template(
+        'corpora/add_corpus_file.html.j2',
+        corpus=corpus,
+        form=form,
+        title='Add corpus file'
+    )
 
 
 @bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/delete')
diff --git a/app/daemon/__init__.py b/app/daemon/__init__.py
index 00977456..84ed0efe 100644
--- a/app/daemon/__init__.py
+++ b/app/daemon/__init__.py
@@ -17,11 +17,7 @@ class Daemon(CheckCorporaMixin, CheckJobsMixin):
 
     def run(self):
         while True:
-            try:
-                self.check_corpora()
-                self.check_jobs()
-                db.session.commit()
-            except Exception as e:
-                current_app.logger.warning(e)
-                pass
+            self.check_corpora()
+            self.check_jobs()
+            db.session.commit()
             sleep(1.5)
diff --git a/app/daemon/corpus_utils.py b/app/daemon/corpus_utils.py
index 31cad929..3962582e 100644
--- a/app/daemon/corpus_utils.py
+++ b/app/daemon/corpus_utils.py
@@ -26,37 +26,55 @@ class CheckCorporaMixin:
     def create_build_corpus_service(self, corpus):
         ''' # Docker service settings # '''
         ''' ## Command ## '''
-        command = 'docker-entrypoint.sh build-corpus'
+        command = ['bash', '-c']
+        command.append(
+            f'mkdir /corpora/data/nopaque_{corpus.id}'
+            ' && '
+            'cwb-encode'
+            ' -c utf8'
+            f' -d /corpora/data/nopaque_{corpus.id}'
+            ' -f /root/files/corpus.vrt'
+            f' -R /usr/local/share/cwb/registry/nopaque_{corpus.id}'
+            ' -P pos -P lemma -P simple_pos'
+            ' -S ent:0+type -S s:0'
+            ' -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title'  # noqa
+            ' -xsB -9'
+            ' && '
+            f'cwb-make -V NOPAQUE_{corpus.id}'
+        )
         ''' ## Constraints ## '''
         constraints = ['node.role==worker']
         ''' ## Image ## '''
-        image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674'  # noqa
+        image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
         ''' ## Labels ## '''
         labels = {
             'origin': current_app.config['SERVER_NAME'],
-            'type': 'build-corpus',
+            'type': 'corpus.build',
             'corpus_id': str(corpus.id)
         }
         ''' ## Mounts ## '''
-        ''' ### Corpus file mount ### '''
-        corpus_file_source = os.path.join(corpus.path, 'merged', 'corpus.vrt')
-        corpus_file_target = '/root/files/corpus.vrt'
-        corpus_file_mount = f'{corpus_file_source}:{corpus_file_target}:ro'
-        ''' ### Corpus data mount ### '''
-        corpus_data_source = os.path.join(corpus.path, 'data')
-        corpus_data_target = '/corpora/data'
-        corpus_data_mount = f'{corpus_data_source}:{corpus_data_target}:rw'
-        # Make sure that their is no data in the corpus data directory
-        shutil.rmtree(corpus_data_source, ignore_errors=True)
-        os.mkdir(corpus_data_source)
-        ''' ### Corpus registry mount ### '''
-        corpus_registry_source = os.path.join(corpus.path, 'registry')
-        corpus_registry_target = '/usr/local/share/cwb/registry'
-        corpus_registry_mount = f'{corpus_registry_source}:{corpus_registry_target}:rw'  # noqa
-        # Make sure that their is no data in the corpus registry directory
-        shutil.rmtree(corpus_registry_source, ignore_errors=True)
-        os.mkdir(corpus_registry_source)
-        mounts = [corpus_file_mount, corpus_data_mount, corpus_registry_mount]
+        mounts = []
+        ''' ### Data mount ### '''
+        data_mount_source = os.path.join(corpus.path, 'cwb', 'data')
+        data_mount_target = '/corpora/data'
+        data_mount = f'{data_mount_source}:{data_mount_target}:rw'
+        # Make sure that there is no data in the data directory
+        shutil.rmtree(data_mount_source, ignore_errors=True)
+        os.makedirs(data_mount_source)
+        mounts.append(data_mount)
+        ''' ### File mount ### '''
+        file_mount_source = os.path.join(corpus.path, 'cwb', 'corpus.vrt')
+        file_mount_target = '/root/files/corpus.vrt'
+        file_mount = f'{file_mount_source}:{file_mount_target}:ro'
+        mounts.append(file_mount)
+        ''' ### Registry mount ### '''
+        registry_mount_source = os.path.join(corpus.path, 'cwb', 'registry')
+        registry_mount_target = '/usr/local/share/cwb/registry'
+        registry_mount = f'{registry_mount_source}:{registry_mount_target}:rw'
+        # Make sure that there is no data in the registry directory
+        shutil.rmtree(registry_mount_source, ignore_errors=True)
+        os.makedirs(registry_mount_source)
+        mounts.append(registry_mount)
         ''' ## Name ## '''
         name = f'build-corpus_{corpus.id}'
         ''' ## Restart policy ## '''
@@ -74,7 +92,7 @@ class CheckCorporaMixin:
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Create service "{name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
             return
         corpus.status = 'queued'
@@ -86,14 +104,14 @@ class CheckCorporaMixin:
         except docker.errors.NotFound as e:
             current_app.logger.error(
                 f'Get service "{service_name}" failed '
-                + f'due to "docker.errors.NotFound": {e}'
+                f'due to "docker.errors.NotFound": {e}'
             )
             corpus.status = 'failed'
             return
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Get service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
         service_tasks = service.tasks()
         if not service_tasks:
@@ -108,36 +126,47 @@ class CheckCorporaMixin:
             corpus.status = 'failed'
         else:
             return
-        try:
-            service.remove()
-        except docker.errors.APIError as e:
-            current_app.logger.error(
-                f'Remove service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
-            )
+        # try:
+        #     service.remove()
+        # except docker.errors.APIError as e:
+        #     current_app.logger.error(
+        #         f'Remove service "{service_name}" failed '
+        #         f'due to "docker.errors.APIError": {e}'
+        #     )
 
     def create_cqpserver_container(self, corpus):
         ''' # Docker container settings # '''
         ''' ## Command ## '''
-        command = 'cqpserver'
+        command = []
+        command.append(
+            'echo "host *;" > cqpserver.init'
+            ' && '
+            'echo "user anonymous \\"\\";" >> cqpserver.init'
+            ' && '
+            'cqpserver -I cqpserver.init'
+        )
         ''' ## Detach ## '''
         detach = True
+        ''' ## Entrypoint ## '''
+        entrypoint = ['bash', '-c']
         ''' ## Image ## '''
-        image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674'  # noqa
+        image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
         ''' ## Name ## '''
         name = f'cqpserver_{corpus.id}'
         ''' ## Network ## '''
         network = 'nopaque_default'
         ''' ## Volumes ## '''
+        volumes = []
         ''' ### Corpus data volume ### '''
-        corpus_data_source = os.path.join(corpus.path, 'data')
-        corpus_data_target = '/corpora/data'
-        corpus_data_volume = f'{corpus_data_source}:{corpus_data_target}:rw'
+        data_volume_source = os.path.join(corpus.path, 'cwb', 'data')
+        data_volume_target = '/corpora/data'
+        data_volume = f'{data_volume_source}:{data_volume_target}:rw'
+        volumes.append(data_volume)
         ''' ### Corpus registry volume ### '''
-        corpus_registry_source = os.path.join(corpus.path, 'registry')
-        corpus_registry_target = '/usr/local/share/cwb/registry'
-        corpus_registry_volume = f'{corpus_registry_source}:{corpus_registry_target}:rw'  # noqa
-        volumes = [corpus_data_volume, corpus_registry_volume]
+        registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry')
+        registry_volume_target = '/usr/local/share/cwb/registry'
+        registry_volume = f'{registry_volume_source}:{registry_volume_target}:rw'  # noqa
+        volumes.append(registry_volume)
         # Check if a cqpserver container already exists. If this is the case,
         # remove it and create a new one
         try:
@@ -147,7 +176,7 @@ class CheckCorporaMixin:
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Get container "{name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
             return
         else:
@@ -156,7 +185,7 @@ class CheckCorporaMixin:
             except docker.errors.APIError as e:
                 current_app.logger.error(
                     f'Remove container "{name}" failed '
-                    + f'due to "docker.errors.APIError": {e}'
+                    f'due to "docker.errors.APIError": {e}'
                 )
                 return
         try:
@@ -164,6 +193,7 @@ class CheckCorporaMixin:
                 image,
                 command=command,
                 detach=detach,
+                entrypoint=entrypoint,
                 volumes=volumes,
                 name=name,
                 network=network
@@ -171,14 +201,14 @@ class CheckCorporaMixin:
         except docker.errors.ImageNotFound as e:
             current_app.logger.error(
                 f'Run container "{name}" failed '
-                + f'due to "docker.errors.ImageNotFound" error: {e}'
+                f'due to "docker.errors.ImageNotFound" error: {e}'
             )
             corpus.status = 'failed'
             return
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Run container "{name}" failed '
-                + f'due to "docker.errors.APIError" error: {e}'
+                f'due to "docker.errors.APIError" error: {e}'
             )
             return
         corpus.status = 'analysing'
@@ -190,14 +220,14 @@ class CheckCorporaMixin:
         except docker.errors.NotFound as e:
             current_app.logger.error(
                 f'Get container "{container_name}" failed '
-                + f'due to "docker.errors.NotFound": {e}'
+                f'due to "docker.errors.NotFound": {e}'
             )
             corpus.num_analysis_sessions = 0
             corpus.status = 'prepared'
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Get container "{container_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
 
     def remove_cqpserver_container(self, corpus):
@@ -210,7 +240,7 @@ class CheckCorporaMixin:
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Get container "{container_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
             return
         try:
@@ -218,5 +248,5 @@ class CheckCorporaMixin:
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Remove container "{container_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
diff --git a/app/daemon/job_utils.py b/app/daemon/job_utils.py
index 78bae839..c640f35c 100644
--- a/app/daemon/job_utils.py
+++ b/app/daemon/job_utils.py
@@ -2,7 +2,7 @@ from datetime import datetime
 from flask import current_app
 from werkzeug.utils import secure_filename
 from .. import db
-from ..models import Job, JobResult
+from ..models import Job, JobResult, TesseractOCRModel
 import docker
 import json
 import os
@@ -23,27 +23,34 @@ class CheckJobsMixin:
         ''' # Docker service settings # '''
         ''' ## Service specific settings ## '''
         if job.service == 'file-setup':
-            mem_mb = 2048
+            mem_mb = 512
             n_cores = 2
             executable = 'file-setup'
-            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:{job.service_version}'  # noqa
-        elif job.service == 'ocr':
-            mem_mb = 4096
+            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:v{job.service_version}'  # noqa
+        elif job.service == 'tesseract-ocr':
+            mem_mb = 2048
             n_cores = 4
             executable = 'ocr'
-            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:{job.service_version}'  # noqa
-        elif job.service == 'nlp':
-            mem_mb = 2048
-            n_cores = 2
+            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:v{job.service_version}'  # noqa
+        elif job.service == 'spacy-nlp':
+            mem_mb = 1024
+            n_cores = 1
             executable = 'nlp'
-            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:{job.service_version}'  # noqa
+            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:v{job.service_version}'  # noqa
         ''' ## Command ## '''
         command = f'{executable} -i /input -o /output'
-        command += ' --log-dir /input'
+        command += ' --log-dir /logs'
         command += f' --mem-mb {mem_mb}'
         command += f' --n-cores {n_cores}'
-        command += f' --zip [{job.service}]_{secure_filename(job.title)}'
-        command += ' ' + ' '.join(json.loads(job.service_args))
+        service_args = json.loads(job.service_args)
+        if job.service == 'spacy-nlp':
+            command += f' -m {service_args["model"]}'
+            if 'encoding_detection' in service_args and service_args['encoding_detection']:  # noqa
+                command += ' --check-encoding'
+        elif job.service == 'tesseract-ocr':
+            command += f' -m {service_args["model"]}'
+            if 'binarization' in service_args and service_args['binarization']:
+                command += ' --binarize'
         ''' ## Constraints ## '''
         constraints = ['node.role==worker']
         ''' ## Labels ## '''
@@ -53,20 +60,42 @@ class CheckJobsMixin:
             'job_id': str(job.id)
         }
         ''' ## Mounts ## '''
-        ''' ### Input mount ### '''
-        input_mount_source = job.path
-        input_mount_target = '/input'
+        mounts = []
+        ''' ### Input mount(s) ### '''
+        input_mount_target_base = '/input'
         if job.service == 'file-setup':
-            input_mount_target += f'/{secure_filename(job.title)}'
-        input_mount = f'{input_mount_source}:{input_mount_target}:rw'
+            input_mount_target_base += f'/{secure_filename(job.title)}'
+        for job_input in job.inputs:
+            input_mount_source = job_input.path
+            input_mount_target = f'/{input_mount_target_base}/{job_input.filename}'  # noqa
+            input_mount = f'{input_mount_source}:{input_mount_target}:ro'
+            mounts.append(input_mount)
+        if job.service == 'tesseract-ocr':
+            service_args = json.loads(job.service_args)
+            model = TesseractOCRModel.query.get(service_args['model'])
+            if model is None:
+                job.status = 'failed'
+                return
+            models_mount_source = model.path
+            models_mount_target = f'/usr/local/share/tessdata/{model.filename}'
+            models_mount = f'{models_mount_source}:{models_mount_target}:ro'
+            mounts.append(models_mount)
         ''' ### Output mount ### '''
-        output_mount_source = os.path.join(job.path, 'output')
+        output_mount_source = os.path.join(job.path, 'results')
         output_mount_target = '/output'
         output_mount = f'{output_mount_source}:{output_mount_target}:rw'
         # Make sure that their is no data in the output directory
         shutil.rmtree(output_mount_source, ignore_errors=True)
         os.makedirs(output_mount_source)
-        mounts = [input_mount, output_mount]
+        mounts.append(output_mount)
+        ''' ### Pipeline data mount ### '''
+        pyflow_data_mount_source = os.path.join(job.path, 'pipeline_data')
+        pyflow_data_mount_target = '/logs/pyflow.data'
+        pyflow_data_mount = f'{pyflow_data_mount_source}:{pyflow_data_mount_target}:rw'  # noqa
+        # Make sure that there is no data in the pipeline data directory
+        shutil.rmtree(pyflow_data_mount_source, ignore_errors=True)
+        os.makedirs(pyflow_data_mount_source)
+        mounts.append(pyflow_data_mount)
         ''' ## Name ## '''
         name = f'job_{job.id}'
         ''' ## Resources ## '''
@@ -90,7 +119,7 @@ class CheckJobsMixin:
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Create service "{name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
             return
         job.status = 'queued'
@@ -102,14 +131,14 @@ class CheckJobsMixin:
         except docker.errors.NotFound as e:
             current_app.logger.error(
                 f'Get service "{service_name}" failed '
-                + f'due to "docker.errors.NotFound": {e}'
+                f'due to "docker.errors.NotFound": {e}'
             )
             job.status = 'failed'
             return
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Get service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
             return
         service_tasks = service.tasks()
@@ -121,13 +150,25 @@ class CheckJobsMixin:
             return
         elif job.status == 'running' and task_state == 'complete':
             job.status = 'complete'
-            results_dir = os.path.join(job.path, 'output')
-            result_files = [x for x in os.listdir(results_dir) if x.endswith('.zip')]  # noqa
-            for result_file in result_files:
-                job_result = JobResult(filename=result_file, job=job)
+            results_dir = os.path.join(job.path, 'results')
+            with open(os.path.join(results_dir, 'outputs.json')) as f:
+                outputs = json.load(f)
+            for output in outputs:
+                filename = os.path.basename(output['file'])
+                job_result = JobResult(
+                    filename=filename,
+                    job=job,
+                    mimetype=output['mimetype']
+                )
+                if 'description' in output:
+                    job_result.description = output['description']
                 db.session.add(job_result)
-                db.session.flush()
+                db.session.flush(objects=[job_result])
                 db.session.refresh(job_result)
+                os.rename(
+                    os.path.join(results_dir, output['file']),
+                    job_result.path
+                )
         elif job.status == 'running' and task_state == 'failed':
             job.status = 'failed'
         else:
@@ -138,7 +179,7 @@ class CheckJobsMixin:
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Remove service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
 
     def remove_job_service(self, job):
@@ -151,7 +192,7 @@ class CheckJobsMixin:
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Get service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
             return
         try:
@@ -159,7 +200,7 @@ class CheckJobsMixin:
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Update service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
             return
         try:
@@ -167,5 +208,5 @@ class CheckJobsMixin:
         except docker.errors.APIError as e:
             current_app.logger.error(
                 f'Remove "{service_name}" service failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
             )
diff --git a/app/jobs/routes.py b/app/jobs/routes.py
index db8c686c..4acd7c47 100644
--- a/app/jobs/routes.py
+++ b/app/jobs/routes.py
@@ -34,12 +34,14 @@ def delete_job(job_id):
 @login_required
 def download_job_input(job_id, job_input_id):
     job_input = JobInput.query.filter(JobInput.job_id == job_id, JobInput.id == job_input_id).first_or_404()  # noqa
-    if not (job_input.job.user == current_user
-            or current_user.is_administrator()):
+    if not (job_input.job.user == current_user or current_user.is_administrator()):  # noqa
         abort(403)
-    return send_from_directory(as_attachment=True,
-                               directory=os.path.dirname(job_input.path),
-                               filename=job_input.filename)
+    return send_from_directory(
+        as_attachment=True,
+        attachment_filename=job_input.filename,
+        directory=os.path.dirname(job_input.path),
+        filename=os.path.basename(job_input.path)
+    )
 
 
 @bp.route('/<hashid:job_id>/restart')
@@ -59,9 +61,11 @@ def restart(job_id):
 @login_required
 def download_job_result(job_id, job_result_id):
     job_result = JobResult.query.filter(JobResult.job_id == job_id, JobResult.id == job_result_id).first_or_404()  # noqa
-    if not (job_result.job.user == current_user
-            or current_user.is_administrator()):
+    if not (job_result.job.user == current_user or current_user.is_administrator()):  # noqa
         abort(403)
-    return send_from_directory(as_attachment=True,
-                               directory=os.path.dirname(job_result.path),
-                               filename=job_result.filename)
+    return send_from_directory(
+        as_attachment=True,
+        attachment_filename=job_result.filename,
+        directory=os.path.dirname(job_result.path),
+        filename=os.path.basename(job_result.path)
+    )
diff --git a/app/models.py b/app/models.py
index 55013e92..d02b511c 100644
--- a/app/models.py
+++ b/app/models.py
@@ -4,13 +4,17 @@ from flask_hashids import HashidMixin
 from flask_login import UserMixin
 from itsdangerous import BadSignature, TimedJSONWebSignatureSerializer
 from time import sleep
+from tqdm import tqdm
 from werkzeug.security import generate_password_hash, check_password_hash
-import xml.etree.ElementTree as ET
 from . import db, login
 import base64
 import enum
+import json
 import os
+import requests
 import shutil
+import xml.etree.ElementTree as ET
+import yaml
 
 
 class Permission(enum.IntEnum):
@@ -25,7 +29,7 @@ class Permission(enum.IntEnum):
 
 class FileMixin:
     creation_date = db.Column(db.DateTime, default=datetime.utcnow)
-    filename = db.Column(db.String(256))
+    filename = db.Column(db.String(255))
     last_edited_date = db.Column(db.DateTime, default=datetime.utcnow)
     mimetype = db.Column(db.String(255))
 
@@ -86,7 +90,7 @@ class Role(HashidMixin, db.Model):
         return dict_role
 
     @staticmethod
-    def insert_roles():
+    def insert_defaults():
         roles = {
             'User': [],
             'API user': [Permission.USE_API],
@@ -132,6 +136,12 @@ class User(HashidMixin, UserMixin, db.Model):
         db.String(16), default='all')
     # Backrefs: role: Role
     # Relationships
+    tesseract_ocr_models = db.relationship(
+        'TesseractOCRModel',
+        backref='user',
+        cascade='all, delete-orphan',
+        lazy='dynamic'
+    )
     corpora = db.relationship(
         'Corpus',
         backref='user',
@@ -221,6 +231,12 @@ class User(HashidMixin, UserMixin, db.Model):
     def is_administrator(self):
         return self.can(Permission.ADMINISTRATE)
 
+    def makedirs(self):
+        '''Create the directory of the user and its subdirectories.'''
+        os.mkdir(self.path)
+        for dirname in ('tesseract_ocr_models', 'corpora', 'jobs'):
+            os.mkdir(os.path.join(self.path, dirname))
+
     def revoke_token(self):
         self.token_expiration = datetime.utcnow() - timedelta(seconds=1)
 
@@ -269,6 +285,27 @@ class User(HashidMixin, UserMixin, db.Model):
             return None
         return user
 
+    @staticmethod
+    def insert_defaults():
+        '''
+        Create the user "nopaque" and its directories if it does not exist
+
+        The user is only committed if its directories could be created.
+        '''
+        if User.query.filter_by(username='nopaque').first() is not None:
+            return
+        user = User(username='nopaque')
+        db.session.add(user)
+        db.session.flush(objects=[user])
+        db.session.refresh(user)
+        try:
+            user.makedirs()
+        except OSError as e:
+            current_app.logger.error(e)
+            db.session.rollback()
+            return
+        db.session.commit()
+
     @staticmethod
     def reset_password(token, new_password):
         s = TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY'])
@@ -284,6 +315,96 @@ class User(HashidMixin, UserMixin, db.Model):
         return True
 
 
+class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
+    '''
+    A Tesseract OCR model file that belongs to a user.
+    '''
+    __tablename__ = 'tesseract_ocr_models'
+    # Primary key
+    id = db.Column(db.Integer, primary_key=True)
+    # Foreign keys
+    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
+    # Fields
+    # compatible_service_versions is stored as JSON formatted string
+    compatible_service_versions = db.Column(db.String(255))
+    description = db.Column(db.String(255))
+    publisher = db.Column(db.String(128))
+    publishing_year = db.Column(db.Integer)
+    title = db.Column(db.String(64))
+    version = db.Column(db.String(16))
+    # Backrefs: user: User
+
+    @property
+    def path(self):
+        '''Path of the model file inside the directory of its user.'''
+        return os.path.join(
+            self.user.path,
+            'tesseract_ocr_models',
+            str(self.id)
+        )
+
+    @staticmethod
+    def insert_defaults():
+        '''
+        Insert the models listed in TesseractOCRModel.defaults.yml
+
+        Entries that already exist (same title and version) are skipped.
+        Every other entry is added for the user "nopaque" and its file is
+        downloaded from the url of the entry. The session is committed
+        once, after all entries are processed.
+
+        Raises requests.HTTPError if a download is answered with an HTTP
+        error status.
+        '''
+        user = User.query.filter_by(username='nopaque').first()
+        defaults_file = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            'TesseractOCRModel.defaults.yml'
+        )
+        with open(defaults_file, 'r') as f:
+            defaults = yaml.safe_load(f)
+        for m in defaults:
+            if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None:  # noqa
+                continue
+            tesseract_ocr_model = TesseractOCRModel(
+                compatible_service_versions=json.dumps(m['compatible_service_versions']),  # noqa
+                description=m['description'],
+                publisher=m['publisher'],
+                publishing_year=m['publishing_year'],
+                title=m['title'],
+                user=user,
+                version=m['version']
+            )
+            db.session.add(tesseract_ocr_model)
+            db.session.flush(objects=[tesseract_ocr_model])
+            db.session.refresh(tesseract_ocr_model)
+            tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata'  # noqa
+            with requests.get(m['url'], stream=True) as r:
+                # Do not save an HTTP error page as model file
+                r.raise_for_status()
+                # Content-Length is optional, tqdm accepts an unknown total
+                content_length = r.headers.get('Content-Length')
+                total = None if content_length is None else int(content_length)
+                pbar = tqdm(
+                    desc=f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})',  # noqa
+                    unit="B",
+                    unit_scale=True,
+                    unit_divisor=1024,
+                    total=total
+                )
+                pbar.clear()
+                try:
+                    with open(tesseract_ocr_model.path, 'wb') as f:
+                        for chunk in r.iter_content(chunk_size=1024):
+                            if chunk:  # filter out keep-alive new chunks
+                                pbar.update(len(chunk))
+                                f.write(chunk)
+                finally:
+                    # Close the progress bar even if the download fails
+                    pbar.close()
+        db.session.commit()
+
+
 class JobInput(FileMixin, HashidMixin, db.Model):
     __tablename__ = 'job_inputs'
     # Primary key
@@ -309,7 +406,7 @@ class JobInput(FileMixin, HashidMixin, db.Model):
 
     @property
     def path(self):
-        return os.path.join(self.job.path, self.filename)
+        return os.path.join(self.job.path, 'inputs', str(self.id))
 
     def to_dict(self, backrefs=False, relationships=False):
         dict_job_input = {
@@ -347,6 +444,8 @@ class JobResult(FileMixin, HashidMixin, db.Model):
     id = db.Column(db.Integer, primary_key=True)
     # Foreign keys
     job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
+    # Fields
+    description = db.Column(db.String(255))
     # Backrefs: job: Job
 
     def __repr__(self):
@@ -366,12 +465,13 @@ class JobResult(FileMixin, HashidMixin, db.Model):
 
     @property
     def path(self):
-        return os.path.join(self.job.path, 'output', self.filename)
+        return os.path.join(self.job.path, 'results', str(self.id))
 
     def to_dict(self, backrefs=False, relationships=False):
         dict_job_result = {
             'id': self.hashid,
             'job_id': self.job.hashid,
+            'description': self.description,
             'download_url': self.download_url,
             'url': self.url,
             **self.file_mixin_to_dict(
@@ -414,8 +514,8 @@ class Job(HashidMixin, db.Model):
     end_date = db.Column(db.DateTime())
     service = db.Column(db.String(64))
     '''
-    ' Service specific arguments as string list.
-    ' Example: ["-l eng", "--binarize"]
+    ' Dictionary as JSON formatted string.
+    ' Example: {"binarization": true}
     '''
     service_args = db.Column(db.String(255))
     service_version = db.Column(db.String(16))
@@ -472,6 +572,12 @@ class Job(HashidMixin, db.Model):
         shutil.rmtree(self.path, ignore_errors=True)
         db.session.delete(self)
 
+    def makedirs(self):
+        '''Create the directory of the job and its subdirectories.'''
+        os.mkdir(self.path)
+        for dirname in ('inputs', 'pipeline_data', 'results'):
+            os.mkdir(os.path.join(self.path, dirname))
+
     def restart(self):
         '''
         Restart a job - only if the status is complete or failed
@@ -479,7 +585,7 @@ class Job(HashidMixin, db.Model):
 
         if self.status not in ['complete', 'failed']:
             raise Exception('Could not restart job: status is not "complete/failed"')  # noqa
-        shutil.rmtree(os.path.join(self.path, 'output'), ignore_errors=True)
+        shutil.rmtree(os.path.join(self.path, 'results'), ignore_errors=True)
         shutil.rmtree(os.path.join(self.path, 'pyflow.data'), ignore_errors=True)  # noqa
         for result in self.results:
             db.session.delete(result)
@@ -487,6 +593,10 @@ class Job(HashidMixin, db.Model):
         self.status = 'submitted'
 
     def to_dict(self, backrefs=False, relationships=False):
+        service_args = json.loads(self.service_args)
+        if self.service == 'tesseract-ocr' and 'model' in service_args:
+            tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model'])  # noqa
+            service_args['model'] = tesseract_ocr_pipeline_model.title
         dict_job = {
             'id': self.hashid,
             'user_id': self.user.hashid,
@@ -494,7 +604,7 @@ class Job(HashidMixin, db.Model):
             'description': self.description,
             'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z',  # noqa
             'service': self.service,
-            'service_args': self.service_args,
+            'service_args': service_args,
             'service_version': self.service_version,
             'status': self.status,
             'title': self.title,
@@ -550,7 +660,7 @@ class CorpusFile(FileMixin, HashidMixin, db.Model):
 
     @property
     def path(self):
-        return os.path.join(self.corpus.path, self.filename)
+        return os.path.join(self.corpus.path, 'files', str(self.id))
 
     @property
     def url(self):
@@ -659,28 +769,27 @@ class Corpus(HashidMixin, db.Model):
         return self.user.hashid
 
     def build(self):
-        output_dir = os.path.join(self.path, 'merged')
-        shutil.rmtree(output_dir, ignore_errors=True)
-        os.mkdir(output_dir)
-        output_file = os.path.join(output_dir, 'corpus.vrt')
         corpus_element = ET.fromstring('<corpus>\n</corpus>')
         for corpus_file in self.files:
             element_tree = ET.parse(corpus_file.path)
-            text_node = element_tree.find('text')
-            text_node.set('address', corpus_file.address or 'NULL')
-            text_node.set('author', corpus_file.author)
-            text_node.set('booktitle', corpus_file.booktitle or 'NULL')
-            text_node.set('chapter', corpus_file.chapter or 'NULL')
-            text_node.set('editor', corpus_file.editor or 'NULL')
-            text_node.set('institution', corpus_file.institution or 'NULL')
-            text_node.set('journal', corpus_file.journal or 'NULL')
-            text_node.set('pages', corpus_file.pages or 'NULL')
-            text_node.set('publisher', corpus_file.publisher or 'NULL')
-            text_node.set('publishing_year', str(corpus_file.publishing_year))
-            text_node.set('school', corpus_file.school or 'NULL')
-            text_node.set('title', corpus_file.title)
-            corpus_element.insert(1, text_node)
-        ET.ElementTree(corpus_element).write(output_file, encoding='utf-8')
+            text_element = element_tree.getroot()
+            text_element.set('address', corpus_file.address or 'NULL')
+            text_element.set('author', corpus_file.author)
+            text_element.set('booktitle', corpus_file.booktitle or 'NULL')
+            text_element.set('chapter', corpus_file.chapter or 'NULL')
+            text_element.set('editor', corpus_file.editor or 'NULL')
+            text_element.set('institution', corpus_file.institution or 'NULL')
+            text_element.set('journal', corpus_file.journal or 'NULL')
+            text_element.set('pages', corpus_file.pages or 'NULL')
+            text_element.set('publisher', corpus_file.publisher or 'NULL')
+            text_element.set('publishing_year', str(corpus_file.publishing_year))  # noqa
+            text_element.set('school', corpus_file.school or 'NULL')
+            text_element.set('title', corpus_file.title)
+            corpus_element.insert(1, text_element)
+        ET.ElementTree(corpus_element).write(
+            os.path.join(self.path, 'cwb', 'corpus.vrt'),
+            encoding='utf-8'
+        )
         self.last_edited_date = datetime.utcnow()
         self.status = 'submitted'
 
@@ -688,6 +797,13 @@ class Corpus(HashidMixin, db.Model):
         shutil.rmtree(self.path, ignore_errors=True)
         db.session.delete(self)
 
+    def makedirs(self):
+        '''Create the corpus directory and the subdirectories it contains.'''
+        os.mkdir(self.path)
+        os.mkdir(os.path.join(self.path, 'files'))
+        for parts in (('cwb',), ('cwb', 'data'), ('cwb', 'registry')):
+            os.mkdir(os.path.join(self.path, *parts))
+
     def to_dict(self, backrefs=False, relationships=False):
         dict_corpus = {
             'id': self.hashid,
diff --git a/app/services/__init__.py b/app/services/__init__.py
index 5c553e89..e41a895d 100644
--- a/app/services/__init__.py
+++ b/app/services/__init__.py
@@ -1,77 +1,13 @@
 from flask import Blueprint
+import os
+import yaml
 
 
-SERVICES = {
-    'file-setup': {
-        'name': 'File setup',
-        'versions': {
-            'latest': '1.0.0b',
-            '1.0.0b': {
-                'publishing_data': {
-                    'date': None,
-                    'title': 'nopaque File setup service',
-                    'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/tree/1.0.0b',  # noqa
-                    'version': '1.0.0'
-                }
-            }
-        }
-    },
-    'nlp': {
-        'name': 'Natural Language Processing',
-        'versions': {
-            'latest': '1.0.0b',
-            '1.0.0b': {
-                'check_encoding': True,
-                'models': {
-                    'de': 'German',
-                    'en': 'English',
-                    'it': 'Italian',
-                    'nl': 'Dutch',
-                    'pl': 'Polish',
-                    'zh': 'Chinese'
-                },
-                'publishing_data': {
-                    'date': None,
-                    'title': 'nopaque NLP service',
-                    'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/tree/1.0.0b',  # noqa
-                    'version': '1.0.0'
-                }
-            }
-        }
-    },
-    'ocr': {
-        'name': 'Optical Character Recognition',
-        'versions': {
-            'latest': '1.0.0b',
-            '1.0.0b': {
-                'binarization': True,
-                'models': {
-                    'ara': 'Arabic',
-                    'chi_tra': 'Chinese - Traditional',
-                    'dan': 'Danish',
-                    'eng': 'English',
-                    'enm': 'English, Middle 1100-1500',
-                    'fra': 'French',
-                    'frm': 'French, Middle ca. 1400-1600',
-                    'deu': 'German',
-                    'frk': 'German Fraktur',
-                    'ell': 'Greek, Modern (1453-)',
-                    'ita': 'Italian',
-                    'por': 'Portuguese',
-                    'rus': 'Russian',
-                    'spa': 'Spanish; Castilian',
-                },
-                'publishing_data': {
-                    'date': None,
-                    'title': 'nopaque OCR service',
-                    'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/tree/1.0.0b',  # noqa
-                    'version': '1.0.0'
-                }
-            }
-        }
-    }
-}
+services_file = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), 'services.yml')
+with open(services_file, 'r') as f:
+    SERVICES = yaml.safe_load(f)
 
 
 bp = Blueprint('services', __name__)
-from . import routes
+from . import routes  # noqa
diff --git a/app/services/forms.py b/app/services/forms.py
index e77f1db3..0bebfb02 100644
--- a/app/services/forms.py
+++ b/app/services/forms.py
@@ -1,3 +1,4 @@
+from app.models import TesseractOCRModel
 from flask_wtf import FlaskForm
 from wtforms import (BooleanField, MultipleFileField, SelectField, StringField,
                      SubmitField, ValidationError)
@@ -6,85 +7,105 @@ from . import SERVICES
 
 
 class AddJobForm(FlaskForm):
-    description = StringField('Description',
-                              validators=[DataRequired(), Length(1, 255)])
+    description = StringField('Description', validators=[DataRequired(), Length(1, 255)])  # noqa
     submit = SubmitField()
     title = StringField('Title', validators=[DataRequired(), Length(1, 32)])
     version = SelectField('Version', validators=[DataRequired()])
 
 
-class AddNLPJobForm(AddJobForm):
-    check_encoding = BooleanField('Check encoding')
+class AddSpacyNLPJobForm(AddJobForm):
+    encoding_detection = BooleanField('Encoding detection')
     files = MultipleFileField('Files', validators=[DataRequired()])
-    language = SelectField('Language',  choices=[('', 'Choose your option')],
-                           default='', validators=[DataRequired()])
+    model = SelectField(
+        'Model',
+        choices=[('', 'Choose your option')],
+        default='',
+        validators=[DataRequired()]
+    )
 
-    def validate_check_encoding(self, field):
-        if field.data and 'check_encoding' not in SERVICES['nlp']['versions'][self.version.data]:  # noqa
-            raise ValidationError('Check encoding is not available in this version')  # noqa
+    def validate_encoding_detection(self, field):
+        # Features of a service version are listed under its "methods" key
+        if field.data and 'encoding_detection' not in SERVICES['spacy-nlp']['versions'][self.version.data].get('methods', []):  # noqa
+            raise ValidationError('Encoding detection is not available')
 
     def validate_files(form, field):
+        valid_extensions = ['.txt']
         for file in field.data:
-            if not file.filename.lower().endswith('.txt'):
-                raise ValidationError('File does not have an approved '
-                                      'extension: .txt')
+            if not file.filename.lower().endswith(tuple(valid_extensions)):
+                raise ValidationError(
+                    # Explicit "+": adjacent literals merge before .join()
+                    'File does not have an approved extension: ' + '/'.join(valid_extensions)  # noqa
+                )
 
     def __init__(self, *args, **kwargs):
-        version = kwargs.pop('version', SERVICES['nlp']['versions']['latest'])
+        version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version'])  # noqa
         super().__init__(*args, **kwargs)
-        if 'check_encoding' not in SERVICES['nlp']['versions'][version]:
-            self.check_encoding.render_kw = {'disabled': True}
-        self.language.choices += [(x, y) for x, y in SERVICES['nlp']['versions'][version]['models'].items()]  # noqa
-        self.version.choices = [(x, x) for x in SERVICES['nlp']['versions'] if x != 'latest']  # noqa
+        service_info = SERVICES['spacy-nlp']['versions'][version]
+        if 'encoding_detection' not in service_info['methods']:
+            self.encoding_detection.render_kw = {'disabled': True}
+        self.model.choices += [(x, y) for x, y in service_info['models'].items()]  # noqa
+        self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']]  # noqa
         self.version.default = version
 
 
-class AddOCRJobForm(AddJobForm):
-    binarization = BooleanField('Binarazation')
+class AddTesseractOCRJobForm(AddJobForm):
+    binarization = BooleanField('Binarization')
     files = MultipleFileField('Files', validators=[DataRequired()])
-    language = SelectField('Language', choices=[('', 'Choose your option')],
-                           default='', validators=[DataRequired()])
+    model = SelectField(
+        'Model',
+        choices=[('', 'Choose your option')],
+        default='',
+        validators=[DataRequired()]
+    )
 
     def validate_binarization(self, field):
-        if field.data and 'binarization' not in SERVICES['ocr']['versions'][self.version.data]:  # noqa
-            raise ValidationError('Binarization is not available in this version')  # noqa
+        # Features of a service version are listed under its "methods" key
+        if field.data and 'binarization' not in SERVICES['tesseract-ocr']['versions'][self.version.data].get('methods', []):  # noqa
+            raise ValidationError('Binarization is not available')
 
     def validate_files(self, field):
+        valid_extensions = ['.pdf']
         for file in field.data:
-            if not file.filename.lower().endswith('.pdf'):
-                raise ValidationError('File does not have an approved '
-                                      'extension: .pdf')
+            if not file.filename.lower().endswith(tuple(valid_extensions)):
+                raise ValidationError(
+                    # Explicit "+": adjacent literals merge before .join()
+                    'File does not have an approved extension: ' + '/'.join(valid_extensions)  # noqa
+                )
 
     def __init__(self, *args, **kwargs):
-        version = kwargs.pop('version', SERVICES['ocr']['versions']['latest'])
+        version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version'])  # noqa
         super().__init__(*args, **kwargs)
-        if 'binarization' not in SERVICES['ocr']['versions'][version]:
+        service_info = SERVICES['tesseract-ocr']['versions'][version]
+        if 'binarization' not in service_info['methods']:
             self.binarization.render_kw = {'disabled': True}
-        self.language.choices += [(x, y) for x, y in SERVICES['ocr']['versions'][version]['models'].items()]  # noqa
-        self.version.choices = [(x, x) for x in SERVICES['ocr']['versions'] if x != 'latest']  # noqa
-        self.version.default = version
+        self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()]  # noqa
+        self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']]  # noqa
+        self.version.data = version
+        self.version.default = SERVICES['tesseract-ocr']['latest_version']
 
 
 class AddFileSetupJobForm(AddJobForm):
     files = MultipleFileField('Files', validators=[DataRequired()])
 
     def validate_files(form, field):
+        valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif']
         for file in field.data:
-            if not file.filename.lower().endswith(('.jpeg', '.jpg', '.png',
-                                                   '.tiff', '.tif')):
-                raise ValidationError('File does not have an approved '
-                                      'extension: .jpeg | .jpg | .png | .tiff '
-                                      '| .tif')
+            if not file.filename.lower().endswith(tuple(valid_extensions)):
+                raise ValidationError(
+                    # Explicit "+": adjacent literals merge before .join()
+                    'File does not have an approved extension: ' + '/'.join(valid_extensions)  # noqa
+                )
 
     def __init__(self, *args, **kwargs):
-        version = kwargs.pop('version', SERVICES['file-setup']['versions']['latest'])
+        version = kwargs.pop('version', SERVICES['file-setup']['latest_version'])  # noqa
         super().__init__(*args, **kwargs)
-        self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions'] if x != 'latest']  # noqa
-        self.version.default = version
+        self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']]  # noqa
+        self.version.data = version
+        self.version.default = SERVICES['file-setup']['latest_version']
 
 
 AddJobForms = {
     'file-setup': AddFileSetupJobForm,
-    'ocr': AddOCRJobForm,
-    'nlp': AddNLPJobForm
+    'tesseract-ocr': AddTesseractOCRJobForm,
+    'spacy-nlp': AddSpacyNLPJobForm
 }
diff --git a/app/services/routes.py b/app/services/routes.py
index 805ab692..d430e61e 100644
--- a/app/services/routes.py
+++ b/app/services/routes.py
@@ -1,3 +1,4 @@
+from app import hashids
 from flask import (abort, current_app, flash, make_response, render_template,
                    request, url_for)
 from flask_login import current_user, login_required
@@ -8,7 +9,6 @@ from .. import db
 from .forms import AddJobForms
 from ..models import Job, JobInput
 import json
-import os
 
 
 @bp.route('/corpus-analysis')
@@ -24,57 +24,65 @@ def service(service):
     # Check if the requested service exist
     if service not in SERVICES or service not in AddJobForms:
         abort(404)
-    version = request.args.get(
-        'version', SERVICES[service]['versions']['latest'])
+    version = request.args.get('version', SERVICES[service]['latest_version'])
     if version not in SERVICES[service]['versions']:
         abort(404)
     form = AddJobForms[service](prefix='add-job-form', version=version)
-    form.version.data = version
     title = SERVICES[service]['name']
-    versions = SERVICES[service]['versions']
     if form.is_submitted():
         if not form.validate():
             return make_response(form.errors, 400)
-        service_args = []
-        if service == 'nlp':
-            service_args.append(f'-l {form.language.data}')
-            if form.check_encoding.data:
-                service_args.append('--check-encoding')
-        if service == 'ocr':
-            service_args.append(f'-l {form.language.data}')
+        service_args = {}
+        if service == 'spacy-nlp':
+            service_args['model'] = form.model.data
+            if form.encoding_detection.data:
+                service_args['encoding_detection'] = True
+        if service == 'tesseract-ocr':
+            service_args['model'] = hashids.decode(form.model.data)
             if form.binarization.data:
-                service_args.append('--binarize')
-        job = Job(user=current_user,
-                  description=form.description.data,
-                  service=service, service_args=json.dumps(service_args),
-                  service_version=form.version.data,
-                  status='preparing', title=form.title.data)
+                service_args['binarization'] = True
+        job = Job(
+            user=current_user,
+            description=form.description.data,
+            service=service,
+            service_args=json.dumps(service_args),
+            service_version=form.version.data,
+            status='preparing',
+            title=form.title.data
+        )
         db.session.add(job)
-        db.session.flush()
+        db.session.flush(objects=[job])
         db.session.refresh(job)
         try:
-            os.makedirs(job.path)
-        except OSError:
-            current_app.logger.error(f'Make dir {job.path} led to an OSError!')
+            job.makedirs()
+        except OSError as e:
+            current_app.logger.error(e)
             db.session.rollback()
             flash('Internal Server Error', 'error')
-            return make_response(
-                {'redirect_url': url_for('.service', service=service)}, 500)
-        else:
-            for file in form.files.data:
-                filename = secure_filename(file.filename)
-                job_input = JobInput(
-                    filename=filename, job=job, mimetype=file.mimetype)
+            return make_response({'redirect_url': url_for('.service', service=service)}, 500)  # noqa
+        for file in form.files.data:
+            filename = secure_filename(file.filename)
+            job_input = JobInput(
+                filename=filename,
+                job=job,
+                mimetype=file.mimetype
+            )
+            db.session.add(job_input)
+            db.session.flush(objects=[job_input])
+            db.session.refresh(job_input)
+            try:
                 file.save(job_input.path)
-                db.session.add(job_input)
-            job.status = 'submitted'
-            db.session.commit()
-            flash(f'Job "{job.title}" added', 'job')
-            return make_response(
-                {'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)
+            except OSError as e:
+                current_app.logger.error(e)
+                db.session.rollback()
+                flash('Internal Server Error', 'error')
+                return make_response({'redirect_url': url_for('.service', service=service)}, 500)  # noqa
+        job.status = 'submitted'
+        db.session.commit()
+        flash(f'Job "{job.title}" added', 'job')
+        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
     return render_template(
         f'services/{service.replace("-", "_")}.html.j2',
         form=form,
-        title=title,
-        versions=versions
+        title=title
     )
diff --git a/app/services/services.yml b/app/services/services.yml
new file mode 100644
index 00000000..0c82c3d9
--- /dev/null
+++ b/app/services/services.yml
@@ -0,0 +1,38 @@
+# TODO: The service metadata below (versions, release URLs) could also be retrieved via the GitLab/GitHub APIs
+#file-setup-pipeline:
+file-setup:
+  name: 'File setup pipeline'
+  latest_version: '0.1.0'
+  versions:
+    0.1.0:
+      publisher: 'Bielefeld University - CRC 1288 - INF'
+      publishing_year: 2022
+      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0'
+#spacy-nlp-pipeline:
+spacy-nlp:
+  name: 'spaCy NLP'
+  latest_version: '0.1.0'
+  versions:
+    0.1.0:
+      methods:
+        - 'encoding_detection'
+      models:
+        de: 'German'
+        en: 'English'
+        it: 'Italian'
+        pl: 'Polish'
+        zh: 'Chinese'
+      publisher: 'Bielefeld University - CRC 1288 - INF'
+      publishing_year: 2022
+      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0'
+#tesseract-ocr-pipeline:
+tesseract-ocr:
+  name: 'Tesseract OCR'
+  latest_version: '0.1.0'
+  versions:
+    0.1.0:
+      methods:
+        - 'binarization'
+      publisher: 'Bielefeld University - CRC 1288 - INF'
+      publishing_year: 2022
+      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0'
diff --git a/app/static/css/nopaque.css b/app/static/css/nopaque.css
index 90f4df68..ee5377e1 100644
--- a/app/static/css/nopaque.css
+++ b/app/static/css/nopaque.css
@@ -50,8 +50,8 @@ h1 .nopaque-icons, h2 .nopaque-icons, h3 .nopaque-icons, h4 .nopaque-icons,
 }
 .nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";}
 .nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";}
-.nopaque-icons.service-icon[data-service="nlp"]:empty:before {content: "G";}
-.nopaque-icons.service-icon[data-service="ocr"]:empty:before {content: "F";}
+.nopaque-icons.service-icon[data-service="spacy-nlp"]:empty:before {content: "G";}
+.nopaque-icons.service-icon[data-service="tesseract-ocr"]:empty:before {content: "F";}
 
 .status-text[data-status]:empty:before {content: attr(data-status);}
 
diff --git a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js
index ad324e34..c07ff35d 100644
--- a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js
+++ b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js
@@ -53,7 +53,7 @@ class CorpusAnalysisApp {
     this.data.cQiClient = new CQiClient(this.settings.corpusId);
     this.data.cQiClient.connect()
       .then(cQiStatus => {
-        return this.data.cQiClient.corpora.get('CORPUS');
+        return this.data.cQiClient.corpora.get(`NOPAQUE_${this.settings.corpusId}`);
       })
       .then(
         cQiCorpus => {
diff --git a/app/static/js/RessourceDisplays/JobDisplay.js b/app/static/js/RessourceDisplays/JobDisplay.js
index 61222693..92102908 100644
--- a/app/static/js/RessourceDisplays/JobDisplay.js
+++ b/app/static/js/RessourceDisplays/JobDisplay.js
@@ -100,7 +100,10 @@ class JobDisplay extends RessourceDisplay {
   }
 
   setServiceArgs(serviceArgs) {
-    this.setElements(this.displayElement.querySelectorAll('.job-service-args'), serviceArgs);
+    this.setElements(
+      this.displayElement.querySelectorAll('.job-service-args'),
+      JSON.stringify(serviceArgs)
+    );
   }
 
   setServiceVersion(serviceVersion) {
diff --git a/app/static/js/RessourceLists/JobResultList.js b/app/static/js/RessourceLists/JobResultList.js
index 56399bcb..708b25f2 100644
--- a/app/static/js/RessourceLists/JobResultList.js
+++ b/app/static/js/RessourceLists/JobResultList.js
@@ -10,25 +10,10 @@ class JobResultList extends RessourceList {
       </tr>
     `.trim(),
     ressourceMapper: jobResult => {
-      let description;
-
-      if (jobResult.filename.endsWith('.pdf.zip')) {
-        description = 'PDF files with text layer';
-      } else if (jobResult.filename.endsWith('.txt.zip')) {
-        description = 'Raw text files';
-      } else if (jobResult.filename.endsWith('.vrt.zip')) {
-        description = 'VRT compliant files including the NLP data';
-      } else if (jobResult.filename.endsWith('.xml.zip')) {
-        description = 'TEI compliant files';
-      } else if (jobResult.filename.endsWith('.poco.zip')) {
-        description = 'HOCR and image files for post correction (PoCo)';
-      } else {
-        description = 'All result files created during this job';
-      }
       return {
         id: jobResult.id,
         creationDate: jobResult.creation_date,
-        description: description,
+        description: jobResult.description,
         filename: jobResult.filename
       };
     },
diff --git a/app/templates/_colors.html.j2 b/app/templates/_colors.html.j2
index 84715cbe..a6ac0ed8 100644
--- a/app/templates/_colors.html.j2
+++ b/app/templates/_colors.html.j2
@@ -19,12 +19,12 @@
     'darken': '#a1b300',
     'lighten': '#f2f3e1'
   },
-  'nlp': {
+  'spacy-nlp': {
     'base': '#98acd2',
     'darken': '#0064a3',
     'lighten': '#e5e8f5'
   },
-  'ocr': {
+  'tesseract-ocr': {
     'base': '#a9d8c8',
     'darken': '#00a58b',
     'lighten': '#e7f4f1'
diff --git a/app/templates/_sidenav.html.j2 b/app/templates/_sidenav.html.j2
index c3ac9ab8..8729f4f8 100644
--- a/app/templates/_sidenav.html.j2
+++ b/app/templates/_sidenav.html.j2
@@ -15,8 +15,8 @@
   <li><div class="divider"></div></li>
   <li><a class="subheader">Processes & Services</a></li>
   <li class="service-color service-color-border border-darken" data-service="file-setup" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='file-setup') }}"><i class="nopaque-icons service-icon" data-service="file-setup"></i>File setup</a></li>
-  <li class="service-color service-color-border border-darken" data-service="ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='ocr') }}"><i class="nopaque-icons service-icon" data-service="ocr"></i>OCR</a></li>
-  <li class="service-color service-color-border border-darken" data-service="nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='nlp') }}"><i class="nopaque-icons service-icon" data-service="nlp"></i>NLP</a></li>
+  <li class="service-color service-color-border border-darken" data-service="tesseract-ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='tesseract-ocr') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr"></i>OCR</a></li>
+  <li class="service-color service-color-border border-darken" data-service="spacy-nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='spacy-nlp') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp"></i>NLP</a></li>
   <li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='corpus-analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li>
   <li><div class="divider"></div></li>
   <li><a class="subheader">Account</a></li>
@@ -28,6 +28,9 @@
   {% if current_user.can(Permission.ADMINISTRATE) %}
   <li><a href="{{ url_for('admin.index') }}"><i class="material-icons">admin_panel_settings</i>Administration</a></li>
   {% endif %}
+  {% if current_user.can(Permission.CONTRIBUTE) %}
+  <li><a href="{{ url_for('contribute.index') }}"><i class="material-icons">new_label</i>Contribute</a></li>
+  {% endif %}
   {% if current_user.can(Permission.USE_API) %}
   <li><a href="{{ url_for('api.doc') }}"><i class="material-icons">api</i>API</a></li>
   {% endif %}
diff --git a/app/templates/main/dashboard.html.j2 b/app/templates/main/dashboard.html.j2
index 1e763c3e..05f5b804 100644
--- a/app/templates/main/dashboard.html.j2
+++ b/app/templates/main/dashboard.html.j2
@@ -120,32 +120,32 @@
             </a>
             <br><br>
             <p class="service-color-text darken" data-service="file-setup"><b>File setup</b></p>
-            <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p>
+            <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.</p>
             <a href="{{ url_for('services.service', service='file-setup') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup">Create Job</a>
           </div>
       </div>
       <div class="col s12 m4">
           <div class="card-panel center-align hoverable">
             <br>
-            <a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
-              <i class="nopaque-icons service-color darken service-icon" data-service="ocr" style="font-size: 2.5rem;"></i>
+            <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
+              <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr" style="font-size: 2.5rem;"></i>
             </a>
             <br><br>
-            <p class="service-color-text darken" data-service="ocr"><b>Optical Character Recognition</b></p>
+            <p class="service-color-text darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
             <p class="light">nopaque converts your image data – like photos or scans – into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.</p>
-            <a href="{{ url_for('services.service', service='ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="ocr">Create Job</a>
+            <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr">Create Job</a>
           </div>
       </div>
       <div class="col s12 m4">
           <div class="card-panel center-align hoverable">
             <br>
-            <a href="{{ url_for('services.service', service='nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
-              <i class="nopaque-icons service-color darken service-icon" data-service="nlp" style="font-size: 2.5rem;"></i>
+            <a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
+              <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp" style="font-size: 2.5rem;"></i>
             </a>
             <br><br>
-            <p class="service-color-text darken" data-service="nlp"><b>Natural Language Processing</b></p>
+            <p class="service-color-text darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p>
             <p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p>
-            <a href="{{ url_for('services.service', service='nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="nlp">Create Job</a>
+            <a href="{{ url_for('services.service', service='spacy-nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="spacy-nlp">Create Job</a>
           </div>
       </div>
     </div>
diff --git a/app/templates/main/index.html.j2 b/app/templates/main/index.html.j2
index bbc44283..0bd343c3 100644
--- a/app/templates/main/index.html.j2
+++ b/app/templates/main/index.html.j2
@@ -84,11 +84,11 @@
               <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p>
             </div>
             <div class="col s12 m6 l3 center-align">
-              <a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
-                <i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i>
+              <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
+                <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
               </a>
               <br><br>
-              <p class="service-color-text text-darken" data-service="ocr"><b>Optical Character Recognition</b></p>
+              <p class="service-color-text text-darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
               <p class="light">nopaque converts your image data – like photos or scans – into text data through OCR making it machine readable. This step enables you to proceed with further computational analysis of your documents.</p>
             </div>
             <div class="col s12 m6 l3 center-align">
diff --git a/app/templates/services/nlp.html.j2 b/app/templates/services/spacy_nlp.html.j2
similarity index 93%
rename from app/templates/services/nlp.html.j2
rename to app/templates/services/spacy_nlp.html.j2
index d07470e1..30fab84c 100644
--- a/app/templates/services/nlp.html.j2
+++ b/app/templates/services/spacy_nlp.html.j2
@@ -2,7 +2,7 @@
 {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
 {% import "materialize/wtf.html.j2" as wtf %}
 
-{% block main_attribs %} class="service-scheme" data-service="nlp"{% endblock main_attribs %}
+{% block main_attribs %} class="service-scheme" data-service="spacy-nlp"{% endblock main_attribs %}
 
 {% block page_content %}
 <div class="container">
@@ -16,13 +16,13 @@
         <p class="hide-on-small-only">&nbsp;</p>
         <p class="hide-on-small-only">&nbsp;</p>
         <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
-          <i class="nopaque-icons service-color darken service-icon" data-service="nlp"></i>
+          <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i>
         </a>
       </div>
     </div>
 
     <div class="col s12 m9 pull-m3">
-      <div class="card service-color-border border-darken" data-service="nlp" style="border-top: 10px solid;">
+      <div class="card service-color-border border-darken" data-service="spacy-nlp" style="border-top: 10px solid;">
         <div class="card-content">
           <div class="row">
             <div class="col s12 m6">
@@ -71,7 +71,7 @@
                 {{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }}
               </div>
               <div class="col s12 l4">
-                {{ wtf.render_field(form.language, material_icon='language') }}
+                {{ wtf.render_field(form.model, material_icon='language') }}
               </div>
               <div class="col s12 l3">
                 {{ wtf.render_field(form.version, material_icon='apps') }}
@@ -80,13 +80,13 @@
                 <span class="card-title">Preprocessing</span>
               </div>
               <div class="col s9">
-                <p>{{ form.check_encoding.label.text }}</p>
+                <p>{{ form.encoding_detection.label.text }}</p>
                 <p class="light">If the input files are not created with the nopaque OCR service or you do not know if your text files are UTF-8 encoded, check this switch. We will try to automatically determine the right encoding for your texts to process them.</p>
               </div>
               <div class="col s3 right-align">
                 <div class="switch">
                   <label>
-                    {{ form.check_encoding() }}
+                    {{ form.encoding_detection() }}
                     <span class="lever"></span>
                   </label>
                 </div>
diff --git a/app/templates/services/ocr.html.j2 b/app/templates/services/tesseract_ocr.html.j2
similarity index 93%
rename from app/templates/services/ocr.html.j2
rename to app/templates/services/tesseract_ocr.html.j2
index 9af593b4..66121281 100644
--- a/app/templates/services/ocr.html.j2
+++ b/app/templates/services/tesseract_ocr.html.j2
@@ -2,7 +2,7 @@
 {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
 {% import "materialize/wtf.html.j2" as wtf %}
 
-{% block main_attribs %} class="service-scheme" data-service="ocr"{% endblock main_attribs %}
+{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr"{% endblock main_attribs %}
 
 {% block page_content %}
 <div class="container">
@@ -16,13 +16,13 @@
         <p class="hide-on-small-only">&nbsp;</p>
         <p class="hide-on-small-only">&nbsp;</p>
         <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
-          <i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i>
+          <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
         </a>
       </div>
     </div>
 
     <div class="col s12 m9 pull-m3">
-      <div class="card service-color-border border-darken" data-service="ocr" style="border-top: 10px solid;">
+      <div class="card service-color-border border-darken" data-service="tesseract-ocr" style="border-top: 10px solid;">
         <div class="card-content">
           <div class="row">
             <div class="col s12">
@@ -50,10 +50,10 @@
                 {{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
               </div>
               <div class="col s12 l5">
-                {{ wtf.render_field(form.files, accept='application/pdf', color=ocr_color_darken, placeholder='Choose your .pdf files') }}
+                {{ wtf.render_field(form.files, accept='application/pdf', placeholder='Choose your .pdf files') }}
               </div>
               <div class="col s12 l4">
-                {{ wtf.render_field(form.language, material_icon='language') }}
+                {{ wtf.render_field(form.model, material_icon='language') }}
               </div>
               <div class="col s12 l3">
                 {{ wtf.render_field(form.version, material_icon='apps') }}
@@ -127,7 +127,7 @@
             </div>
           </div>
           <div class="card-action right-align">
-            {{ wtf.render_field(form.submit, color=ocr_color_darken, material_icon='send') }}
+            {{ wtf.render_field(form.submit, material_icon='send') }}
           </div>
         </form>
       </div>
diff --git a/app/utils.py b/app/utils.py
deleted file mode 100644
index 75d38b7c..00000000
--- a/app/utils.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from app import hashids
-from werkzeug.routing import BaseConverter
-
-
-class HashidConverter(BaseConverter):
-    def to_python(self, value: str) -> int:
-        return hashids.decode(value)[0]
-
-    def to_url(self, value: int) -> str:
-        return hashids.encode(value)
diff --git a/docker-compose.traefik.yml b/docker-compose.traefik.yml
index c261b2d2..c7d01575 100644
--- a/docker-compose.traefik.yml
+++ b/docker-compose.traefik.yml
@@ -5,14 +5,14 @@
 version: "3.5"
 
 networks:
-  reverse-proxy:
-    external:
-      name: reverse-proxy
+  traefik:
+    external: true
+    name: "traefik"
 
 services:
   nopaque:
     labels:
-      - "traefik.docker.network=reverse-proxy"
+      - "traefik.docker.network=traefik"
       - "traefik.enable=true"
       ### <http> ###
       - "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http"
diff --git a/migrations/versions/ad0d835fe5b1_.py b/migrations/versions/ad0d835fe5b1_.py
new file mode 100644
index 00000000..0248e316
--- /dev/null
+++ b/migrations/versions/ad0d835fe5b1_.py
@@ -0,0 +1,45 @@
+"""Add tesseract_ocr_models table and job_results.description column
+
+Revision ID: ad0d835fe5b1
+Revises: 68ed092ffe5e
+Create Date: 2022-01-18 16:23:45.673993
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'ad0d835fe5b1'
+down_revision = '68ed092ffe5e'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():  # apply revision: create tesseract_ocr_models, add job_results.description
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('tesseract_ocr_models',
+    sa.Column('creation_date', sa.DateTime(), nullable=True),
+    sa.Column('filename', sa.String(length=255), nullable=True),
+    sa.Column('last_edited_date', sa.DateTime(), nullable=True),
+    sa.Column('mimetype', sa.String(length=255), nullable=True),
+    sa.Column('id', sa.Integer(), nullable=False),  # primary key, see PrimaryKeyConstraint below
+    sa.Column('user_id', sa.Integer(), nullable=True),  # foreign key to users.id, see ForeignKeyConstraint below
+    sa.Column('compatible_service_versions', sa.String(length=255), nullable=True),
+    sa.Column('description', sa.String(length=255), nullable=True),
+    sa.Column('publisher', sa.String(length=128), nullable=True),
+    sa.Column('publishing_year', sa.Integer(), nullable=True),
+    sa.Column('title', sa.String(length=64), nullable=True),
+    sa.Column('version', sa.String(length=16), nullable=True),
+    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+    sa.PrimaryKeyConstraint('id')
+    )
+    op.add_column('job_results', sa.Column('description', sa.String(length=255), nullable=True))  # nullable: existing rows get NULL
+    # ### end Alembic commands ###
+
+
+def downgrade():  # revert revision: undo upgrade() in reverse order
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('job_results', 'description')  # removes the column added by upgrade()
+    op.drop_table('tesseract_ocr_models')  # removes the table created by upgrade()
+    # ### end Alembic commands ###
diff --git a/nopaque.py b/nopaque.py
index 0045d02b..ab8db5a6 100644
--- a/nopaque.py
+++ b/nopaque.py
@@ -3,10 +3,9 @@
 import eventlet
 eventlet.monkey_patch()
 
-
-from app import db, cli, create_app  # noqa
 from app.models import (Corpus, CorpusFile, Job, JobInput, JobResult,
-                        Permission, QueryResult, Role, User)  # noqa
+                        Permission, QueryResult, Role, TesseractOCRModel, User)  # noqa
+from app import db, cli, create_app  # noqa
 from flask import Flask  # noqa
 from typing import Any, Dict  # noqa
 
@@ -34,5 +33,6 @@ def make_shell_context() -> Dict[str, Any]:
         'Permission': Permission,
         'QueryResult': QueryResult,
         'Role': Role,
+        'TesseractOCRModel': TesseractOCRModel,
         'User': User
     }
diff --git a/requirements.txt b/requirements.txt
index 202121fd..52770c57 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,5 +19,7 @@ hiredis
 jsonschema
 psycopg2
 python-dotenv
+pyyaml
 redis
+tqdm
 wtforms[email]