Big update, corpus analysis reworked, versioned services, preliminary work for contributions

2026-08-03 05:51:28 +00:00 · 2022-02-03 12:39:16 +01:00
parent 0647537192
commit fe938c0ca2
36 changed files with 1552 additions and 431 deletions
@@ -0,0 +1,816 @@
 # - title: 'Afrikaans'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/afr.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Amharic'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 - title: 'Arabic'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 # - title: 'Assamese'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Azerbaijani'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Azerbaijani - Cyrillic'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Belarusian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Bengali'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Tibetan'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Bosnian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Bulgarian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Catalan; Valencian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Cebuano'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Czech'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Chinese - Simplified'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 - title: 'Chinese - Traditional'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 # - title: 'Cherokee'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Welsh'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 - title: 'Danish'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 - title: 'German'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 # - title: 'Dzongkha'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 - title: 'Greek, Modern (1453-)'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 - title: 'English'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 - title: 'English, Middle (1100-1500)'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 # - title: 'Esperanto'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Estonian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Basque'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Persian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Finnish'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 - title: 'French'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 - title: 'German Fraktur'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 - title: 'French, Middle (ca. 1400-1600)'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 # - title: 'Irish'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Galician'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 - title: 'Greek, Ancient (-1453)'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 # - title: 'Gujarati'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Haitian; Haitian Creole'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Hebrew'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Hindi'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Croatian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Hungarian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Inuktitut'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Indonesian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Icelandic'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 - title: 'Italian'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 - title: 'Italian - Old'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 # - title: 'Javanese'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Japanese'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Kannada'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Georgian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Georgian - Old'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Kazakh'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Central Khmer'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Kirghiz; Kyrgyz'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Korean'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Kurdish'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Lao'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Latin'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Latvian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Lithuanian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Malayalam'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Marathi'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Macedonian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Maltese'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Malay'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Burmese'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Nepali'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Dutch; Flemish'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Norwegian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Oriya'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Panjabi; Punjabi'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Polish'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 - title: 'Portuguese'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 # - title: 'Pushto; Pashto'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Romanian; Moldavian; Moldovan'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 - title: 'Russian'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 # - title: 'Sanskrit'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Sinhala; Sinhalese'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Slovak'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Slovenian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 - title: 'Spanish; Castilian'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 - title: 'Spanish; Castilian - Old'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
 # - title: 'Albanian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Serbian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Serbian - Latin'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Swahili'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Swedish'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Syriac'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Tamil'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Telugu'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Tajik'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Tagalog'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Thai'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Tigrinya'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Turkish'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Uighur; Uyghur'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Ukrainian'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Urdu'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Uzbek'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Uzbek - Cyrillic'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Vietnamese'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
 # - title: 'Yiddish'
 #   description: ''
 #   url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata'
 #   publisher: 'tesseract-ocr'
 #   publishing_year: 2021
 #   version: '4.1.0'
 #   compatible_service_versions:
 #     - '0.1.0'
@@ -39,9 +39,6 @@ def create_app(config: Config = Config) -> Flask:
    socketio.init_app(
        app, message_queue=app.config['NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI'])
    # from .utils import HashidConverter
    # app.url_map.converters['hashid'] = HashidConverter
    from .events import socketio as socketio_events
    from .events import sqlalchemy as sqlalchemy_events
@@ -54,6 +51,9 @@ def create_app(config: Config = Config) -> Flask:
    from .auth import bp as auth_blueprint
    app.register_blueprint(auth_blueprint, url_prefix='/auth')
    from .contribute import bp as contribute_blueprint
    app.register_blueprint(contribute_blueprint, url_prefix='/contribute')
    from .corpora import bp as corpora_blueprint
    app.register_blueprint(corpora_blueprint, url_prefix='/corpora')
@@ -1,7 +1,6 @@
 from flask import Blueprint
 from flask_restx import Api
 from .jobs import ns as jobs_ns
 from .tokens import ns as tokens_ns
 bp = Blueprint('api', __name__)
@@ -23,5 +22,4 @@ api = Api(
    version='1.0'
 )
 api.add_namespace(jobs_ns)
 api.add_namespace(tokens_ns)
@@ -9,8 +9,12 @@ token_auth = HTTPTokenAuth()
@basic_auth.verify_password
 def verify_password(email_or_username, password):
-    user = User.query.filter(or_(User.username == email_or_username,
+    user = User.query.filter(
-                                 User.email == email_or_username.lower())).first()
+        or_(
            User.username == email_or_username,
            User.email == email_or_username.lower()
        )
    ).first()
    if user and user.verify_password(password):
        return user
@@ -1,48 +0,0 @@
 from flask_restx import Namespace, Resource
 from .auth import token_auth
 from ..jobs import tasks
 from ..models import Job
 ns = Namespace('jobs', description='Job operations')
@ns.route('')
 class API_Jobs(Resource):
    '''Collection resource for jobs: GET lists jobs, POST is meant to add one.

    Both handlers are wrapped in token_auth.login_required and are documented
    with the 'apiKey' security scheme.
    '''
    @ns.doc(security='apiKey')
    @token_auth.login_required
    def get(self):
        '''List all jobs.

        Returns a list with one job.to_dict(include_relationships=False)
        entry per Job row.
        '''
        # TODO: Implement the correct get_jobs functionality
        # NOTE(review): Job.query.all() is not filtered in any way, so every
        # job in the table is returned to whoever passes token authentication.
        jobs = Job.query.all()
        return [job.to_dict(include_relationships=False) for job in jobs]
    @ns.doc(security='apiKey')
    @token_auth.login_required
    def post(self):
        '''Create a new job (not implemented, currently returns None).'''
        # TODO: Implement this
        pass
@ns.route('/<hashid:id>')
 class API_Job(Resource):
    '''Item resource for a single job: GET returns it, DELETE removes it.

    The route uses a custom 'hashid' URL converter.
    '''
    # NOTE(review): the 'hashid' converter is not registered in this module;
    # confirm that the application registers it on its url_map.
    @ns.doc(security='apiKey')
    @token_auth.login_required
    def get(self, id):
        '''Get a job by id.

        Returns job.to_dict(include_relationships=False) for the job looked
        up with get_or_404(id).
        '''
        # NOTE(review): no check that the job belongs to the authenticated
        # user is visible here.
        job = Job.query.get_or_404(id)
        return job.to_dict(include_relationships=False)
    @ns.doc(security='apiKey')
    @token_auth.login_required
    def delete(self, id):
        '''Delete a job by id.

        Hands the deletion over to tasks.delete_job and answers with an empty
        body and status code 204.
        '''
        job = Job.query.get_or_404(id)
        # We use this imported task because it will run in the background
        tasks.delete_job(job.id)
        return '', 204
@@ -60,28 +60,37 @@ def register():
        return redirect(url_for('main.dashboard'))
    form = RegistrationForm(prefix='registration-form')
    if form.validate_on_submit():
-        user = User(email=form.email.data.lower(),
+        user = User(
            email=form.email.data.lower(),
            password=form.password.data,
-                    username=form.username.data)
+            username=form.username.data
        )
        db.session.add(user)
-        db.session.commit()
+        db.session.flush(objects=[user])
        db.session.refresh(user)
        try:
-            os.makedirs(user.path)
+            user.makedirs()
-        except OSError:
+        except OSError as e:
-            current_app.logger.error(
+            current_app.logger.error(e)
-                f'Make dir {user.path} led to an OSError!')
+            db.session.rollback()
            db.session.delete(user)
            db.session.commit()
            abort(500)
        else:
            token = user.generate_confirmation_token()
-            msg = create_message(user.email, 'Confirm Your Account',
+            msg = create_message(
-                                 'auth/email/confirm', token=token, user=user)
+                user.email,
                'Confirm Your Account',
                'auth/email/confirm',
                token=token,
                user=user
            )
            send(msg)
            flash('A confirmation email has been sent to you by email.')
            return redirect(url_for('.login'))
-    return render_template('auth/register.html.j2', form=form,
+    return render_template(
-                           title='Register')
+        'auth/register.html.j2',
        form=form,
        title='Register'
    )
@bp.route('/confirm/<token>')
@@ -1,16 +1,44 @@
-from . import db
+from flask import current_app
 from .models import Corpus, Role
 from flask_migrate import upgrade
 from . import db
 from .models import Corpus, Job, Role, User, TesseractOCRModel
 import json
 import os
 import re
 def _make_default_dirs():
    ''' Create the default data directories if they do not exist yet.

    The directories ``tmp`` and ``users`` are created directly below the
    directory configured as ``NOPAQUE_DATA_DIR``.

    Raises:
        NotADirectoryError: If one of the paths already exists but is not a
            directory.
        OSError: Any other error raised by ``os.mkdir``, e.g. when the base
            directory itself is missing (only one level is created).
    '''
    base_dir = current_app.config['NOPAQUE_DATA_DIR']
    for name in ('tmp', 'users'):
        directory = os.path.join(base_dir, name)
        try:
            # Create first and inspect afterwards: this leaves no gap between
            # an existence check and the creation itself.
            os.mkdir(directory)
        except FileExistsError:
            # An already existing directory is fine, anything else is not.
            if not os.path.isdir(directory):
                raise NotADirectoryError(f'{directory} is not a directory')
 def register(app):
    @app.cli.command()
    def deploy():
        ''' Run deployment tasks. '''
        # Make default directories
        _make_default_dirs()
        # migrate database to latest revision
        upgrade()
-        # create or update user roles
+
-        Role.insert_roles()
+        # Insert/Update default database values
        current_app.logger.info('Insert/Update default roles')
        Role.insert_defaults()
        current_app.logger.info('Insert/Update default users')
        User.insert_defaults()
        current_app.logger.info('Insert/Update default tesseract ocr models')
        TesseractOCRModel.insert_defaults()
    @app.cli.group()
    def daemon():
@@ -40,3 +68,55 @@ def register(app):
        from unittest.suite import TestSuite
        tests: TestSuite = TestLoader().discover('tests')
        TextTestRunner(verbosity=2).run(tests)
    @app.cli.group()
    def convert():
        ''' Database convert commands. '''
        # The docstring above is what the command line interface shows as
        # help text for this group, so keep it short and user facing.
    @convert.command()
    def nlp_jobs():
        # Rewrite legacy "nlp" jobs: rename the service to "spacy-nlp" and
        # turn the old command line style argument list into a mapping.
        # NOTE(review): only 'encoding_detection' and 'language' are written
        # here - confirm that consumers of spacy-nlp jobs do not require
        # further keys.
        language_flag = r'-l ([a-z]{2})'
        legacy_jobs = Job.query.filter_by(service='nlp').all()
        for legacy_job in legacy_jobs:
            legacy_job.service = 'spacy-nlp'
            converted_args = {}
            for legacy_arg in json.loads(legacy_job.service_args):
                if legacy_arg == '--check-encoding':
                    converted_args['encoding_detection'] = True
                    continue
                # A successful match is anchored at the start of the
                # argument, so its first group is the language code.
                language_match = re.match(language_flag, legacy_arg)
                if language_match:
                    converted_args['language'] = language_match.group(1)
            legacy_job.service_args = json.dumps(converted_args)
        # One commit for the whole conversion
        db.session.commit()
    @convert.command()
    def ocr_jobs():
        ''' Convert legacy "ocr" jobs to the "tesseract-ocr" service.

        The old command line style argument list of each job is replaced by
        a mapping. Nothing is committed if any job can not be converted.

        Raises ValueError if a job uses a language code that has no entry in
        the lookup table or no matching TesseractOCRModel in the database.
        '''
        # Language code to TesseractOCRModel.title lookup
        language_code_lookup = {
            'ara': 'Arabic',
            'chi_tra': 'Chinese - Traditional',
            'dan': 'Danish',
            'eng': 'English',
            'enm': 'English, Middle (1100-1500)',
            'fra': 'French',
            'frm': 'French, Middle (ca. 1400-1600)',
            'deu': 'German',
            'frk': 'German Fraktur',
            'ell': 'Greek, Modern (1453-)',
            'ita': 'Italian',
            'por': 'Portuguese',
            'rus': 'Russian',
            'spa': 'Spanish; Castilian'
        }
        # Language codes may carry a suffix (e.g. "chi_tra"). A plain three
        # letter pattern would cut that down to "chi", which is not part of
        # the lookup table.
        language_arg_pattern = re.compile(r'-l ([a-z]{3}(?:_[a-z]+)?)')
        for job in Job.query.filter_by(service='ocr').all():
            job.service = 'tesseract-ocr'
            new_service_args = {}
            for service_arg in json.loads(job.service_args):
                if service_arg == '--binarize':
                    new_service_args['binarization'] = True
                    continue
                language_match = language_arg_pattern.match(service_arg)
                if language_match is None:
                    continue
                language_code = language_match.group(1)
                title = language_code_lookup.get(language_code)
                if title is None:
                    # Fail loudly (before the commit) instead of with a bare
                    # KeyError that does not name the job
                    raise ValueError(
                        f'Job {job.id}: no TesseractOCRModel title known '
                        f'for language code "{language_code}"'
                    )
                tesseract_ocr_model = TesseractOCRModel.query.filter_by(title=title).first()  # noqa
                if tesseract_ocr_model is None:
                    raise ValueError(
                        f'Job {job.id}: no TesseractOCRModel with title '
                        f'"{title}" found'
                    )
                new_service_args['model'] = tesseract_ocr_model.id
            job.service_args = json.dumps(new_service_args)
        # One commit for the whole conversion
        db.session.commit()
@@ -0,0 +1,5 @@
 from flask import Blueprint
 bp = Blueprint('contribute', __name__)
 from . import routes
@@ -0,0 +1,19 @@
 from flask import flash, redirect, render_template, url_for
 from flask_login import login_required
 from . import bp
 from .. import db
 from ..decorators import permission_required
 from ..models import Permission, Role, User
 from ..settings import tasks as settings_tasks
@bp.before_request
@login_required
@permission_required(Permission.CONTRIBUTE)
 def before_request():
    # Intentionally empty: this hook is registered for the blueprint only to
    # apply the decorators above (login and, presumably, a check for the
    # CONTRIBUTE permission) before its views run.
    pass
@bp.route('/')
 def index():
    # NOTE(review): placeholder view - it returns None, which Flask does not
    # accept as a response, so this route still needs an implementation.
    pass
@@ -93,12 +93,12 @@ def connect(auth):
@socketio.on('disconnect', namespace=NAMESPACE)
 def disconnect():
    if 'd' not in session:
        return
    session['d']['cqi_client_lock'].acquire()
    try:
        session['d']['cqi_client'].disconnect()
-    except cqi.errors.CQiException:
+    except (BrokenPipeError, cqi.errors.CQiException):
        pass
    except BrokenPipeError:
        pass
    session['d']['cqi_client_lock'].release()
    corpus = Corpus.query.get(session['d']['corpus_id'])
@@ -12,7 +12,10 @@ def cqi_over_socketio(f):
        f_args = {}
        # Check for missing args and if all provided args are of the right type
        for param in signature(f).parameters.values():
-            if param.annotation == cqi.CQiClient:
+            if param.name == 'corpus_name':
                f_args[param.name] = f'NOPAQUE_{session["d"]["corpus_id"]}'
                continue
            if param.name == 'cqi_client':
                f_args[param.name] = session['d']['cqi_client']
                continue
            if param.default is param.empty:
@@ -1,6 +1,7 @@
 from flask import (abort, current_app, flash, make_response, redirect,
                   render_template, url_for, send_from_directory)
 from flask_login import current_user, login_required
 from werkzeug.utils import secure_filename
 from . import bp
 from . import tasks
 from .forms import (AddCorpusFileForm, AddCorpusForm, EditCorpusFileForm,
@@ -29,18 +30,20 @@ def add_corpus():
        db.session.flush()
        db.session.refresh(corpus)
        try:
-            os.makedirs(corpus.path)
+            corpus.makedirs()
        except OSError as e:
-            current_app.logger.error(f'Could not add corpus: {e}')
+            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            abort(500)
        else:
        db.session.commit()
-            flash(f'Corpus "{corpus.title}" added!', 'corpus')
+        flash(f'Corpus "{corpus.title}" added', 'corpus')
        return redirect(url_for('.corpus', corpus_id=corpus.id))
-    return render_template('corpora/add_corpus.html.j2', form=form,
+    return render_template(
-                           title='Add corpus')
+        'corpora/add_corpus.html.j2',
        form=form,
        title='Add corpus'
    )
@bp.route('/import', methods=['GET', 'POST'])
@@ -174,7 +177,7 @@ def add_corpus_file(corpus_id):
        if not form.validate():
            return make_response(form.errors, 400)
        # Save the file
-        form.file.data.save(os.path.join(corpus.path, form.file.data.filename))
+        filename = secure_filename(form.file.data.filename)
        corpus_file = CorpusFile(
            address=form.address.data,
            author=form.author.data,
@@ -182,9 +185,10 @@ def add_corpus_file(corpus_id):
            chapter=form.chapter.data,
            corpus=corpus,
            editor=form.editor.data,
-            filename=form.file.data.filename,
+            filename=filename,
            institution=form.institution.data,
            journal=form.journal.data,
            mimetype='application/vrt+xml',
            pages=form.pages.data,
            publisher=form.publisher.data,
            publishing_year=form.publishing_year.data,
@@ -192,12 +196,25 @@ def add_corpus_file(corpus_id):
            title=form.title.data
        )
        db.session.add(corpus_file)
        db.session.flush(objects=[corpus_file])
        db.session.refresh(corpus_file)
        try:
            form.file.data.save(corpus_file.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
            return make_response({'redirect_url': url_for('.add_corpus_file', corpus_id=corpus.id)}, 500)  # noqa
        corpus.status = 'unprepared'
        db.session.commit()
-        flash(f'Corpus file "{corpus_file.filename}" added!', 'corpus')
+        flash(f'Corpus file "{corpus_file.title}" added!', 'corpus')
        return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)  # noqa
-    return render_template('corpora/add_corpus_file.html.j2', corpus=corpus,
+    return render_template(
-                           form=form, title='Add corpus file')
+        'corpora/add_corpus_file.html.j2',
        corpus=corpus,
        form=form,
        title='Add corpus file'
    )
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/delete')
@@ -17,11 +17,7 @@ class Daemon(CheckCorporaMixin, CheckJobsMixin):
    def run(self):
        while True:
            try:
            self.check_corpora()
            self.check_jobs()
            db.session.commit()
            except Exception as e:
                current_app.logger.warning(e)
                pass
            sleep(1.5)
@@ -26,37 +26,55 @@ class CheckCorporaMixin:
    def create_build_corpus_service(self, corpus):
        ''' # Docker service settings # '''
        ''' ## Command ## '''
-        command = 'docker-entrypoint.sh build-corpus'
+        command = ['bash', '-c']
        command.append(
            f'mkdir /corpora/data/nopaque_{corpus.id}'
            ' && '
            'cwb-encode'
            ' -c utf8'
            f' -d /corpora/data/nopaque_{corpus.id}'
            ' -f /root/files/corpus.vrt'
            f' -R /usr/local/share/cwb/registry/nopaque_{corpus.id}'
            ' -P pos -P lemma -P simple_pos'
            ' -S ent:0+type -S s:0'
            ' -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title'  # noqa
            ' -xsB -9'
            ' && '
            f'cwb-make -V NOPAQUE_{corpus.id}'
        )
        ''' ## Constraints ## '''
        constraints = ['node.role==worker']
        ''' ## Image ## '''
-        image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674'  # noqa
+        image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
        ''' ## Labels ## '''
        labels = {
            'origin': current_app.config['SERVER_NAME'],
-            'type': 'build-corpus',
+            'type': 'corpus.build',
            'corpus_id': str(corpus.id)
        }
        ''' ## Mounts ## '''
-        ''' ### Corpus file mount ### '''
+        mounts = []
-        corpus_file_source = os.path.join(corpus.path, 'merged', 'corpus.vrt')
+        ''' ### Data mount ### '''
-        corpus_file_target = '/root/files/corpus.vrt'
+        data_mount_source = os.path.join(corpus.path, 'cwb', 'data')
-        corpus_file_mount = f'{corpus_file_source}:{corpus_file_target}:ro'
+        data_mount_target = '/corpora/data'
-        ''' ### Corpus data mount ### '''
+        data_mount = f'{data_mount_source}:{data_mount_target}:rw'
-        corpus_data_source = os.path.join(corpus.path, 'data')
+        # Make sure that there is no data in the data directory
-        corpus_data_target = '/corpora/data'
+        shutil.rmtree(data_mount_source, ignore_errors=True)
-        corpus_data_mount = f'{corpus_data_source}:{corpus_data_target}:rw'
+        os.makedirs(data_mount_source)
-        # Make sure that their is no data in the corpus data directory
+        mounts.append(data_mount)
-        shutil.rmtree(corpus_data_source, ignore_errors=True)
+        ''' ### File mount ### '''
-        os.mkdir(corpus_data_source)
+        file_mount_source = os.path.join(corpus.path, 'cwb', 'corpus.vrt')
-        ''' ### Corpus registry mount ### '''
+        file_mount_target = '/root/files/corpus.vrt'
-        corpus_registry_source = os.path.join(corpus.path, 'registry')
+        file_mount = f'{file_mount_source}:{file_mount_target}:ro'
-        corpus_registry_target = '/usr/local/share/cwb/registry'
+        mounts.append(file_mount)
-        corpus_registry_mount = f'{corpus_registry_source}:{corpus_registry_target}:rw'  # noqa
+        ''' ### Registry mount ### '''
-        # Make sure that their is no data in the corpus registry directory
+        registry_mount_source = os.path.join(corpus.path, 'cwb', 'registry')
-        shutil.rmtree(corpus_registry_source, ignore_errors=True)
+        registry_mount_target = '/usr/local/share/cwb/registry'
-        os.mkdir(corpus_registry_source)
+        registry_mount = f'{registry_mount_source}:{registry_mount_target}:rw'
-        mounts = [corpus_file_mount, corpus_data_mount, corpus_registry_mount]
+        # Make sure that there is no data in the registry directory
        shutil.rmtree(registry_mount_source, ignore_errors=True)
        os.makedirs(registry_mount_source)
        mounts.append(registry_mount)
        ''' ## Name ## '''
        name = f'build-corpus_{corpus.id}'
        ''' ## Restart policy ## '''
@@ -74,7 +92,7 @@ class CheckCorporaMixin:
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Create service "{name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
            return
        corpus.status = 'queued'
@@ -86,14 +104,14 @@ class CheckCorporaMixin:
        except docker.errors.NotFound as e:
            current_app.logger.error(
                f'Get service "{service_name}" failed '
-                + f'due to "docker.errors.NotFound": {e}'
+                f'due to "docker.errors.NotFound": {e}'
            )
            corpus.status = 'failed'
            return
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Get service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
        service_tasks = service.tasks()
        if not service_tasks:
@@ -108,36 +126,47 @@ class CheckCorporaMixin:
            corpus.status = 'failed'
        else:
            return
-        try:
+        # try:
-            service.remove()
+        #     service.remove()
-        except docker.errors.APIError as e:
+        # except docker.errors.APIError as e:
-            current_app.logger.error(
+        #     current_app.logger.error(
-                f'Remove service "{service_name}" failed '
+        #         f'Remove service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+        #         f'due to "docker.errors.APIError": {e}'
-            )
+        #     )
    def create_cqpserver_container(self, corpus):
        ''' # Docker container settings # '''
        ''' ## Command ## '''
-        command = 'cqpserver'
+        command = []
        command.append(
            'echo "host *;" > cqpserver.init'
            ' && '
            'echo "user anonymous \\"\\";" >> cqpserver.init'
            ' && '
            'cqpserver -I cqpserver.init'
        )
        ''' ## Detach ## '''
        detach = True
        ''' ## Entrypoint ## '''
        entrypoint = ['bash', '-c']
        ''' ## Image ## '''
-        image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674'  # noqa
+        image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
        ''' ## Name ## '''
        name = f'cqpserver_{corpus.id}'
        ''' ## Network ## '''
        network = 'nopaque_default'
        ''' ## Volumes ## '''
        volumes = []
        ''' ### Corpus data volume ### '''
-        corpus_data_source = os.path.join(corpus.path, 'data')
+        data_volume_source = os.path.join(corpus.path, 'cwb', 'data')
-        corpus_data_target = '/corpora/data'
+        data_volume_target = '/corpora/data'
-        corpus_data_volume = f'{corpus_data_source}:{corpus_data_target}:rw'
+        data_volume = f'{data_volume_source}:{data_volume_target}:rw'
        volumes.append(data_volume)
        ''' ### Corpus registry volume ### '''
-        corpus_registry_source = os.path.join(corpus.path, 'registry')
+        registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry')
-        corpus_registry_target = '/usr/local/share/cwb/registry'
+        registry_volume_target = '/usr/local/share/cwb/registry'
-        corpus_registry_volume = f'{corpus_registry_source}:{corpus_registry_target}:rw'  # noqa
+        registry_volume = f'{registry_volume_source}:{registry_volume_target}:rw'  # noqa
-        volumes = [corpus_data_volume, corpus_registry_volume]
+        volumes.append(registry_volume)
        # Check if a cqpserver container already exists. If this is the case,
        # remove it and create a new one
        try:
@@ -147,7 +176,7 @@ class CheckCorporaMixin:
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Get container "{name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
            return
        else:
@@ -156,7 +185,7 @@ class CheckCorporaMixin:
            except docker.errors.APIError as e:
                current_app.logger.error(
                    f'Remove container "{name}" failed '
-                    + f'due to "docker.errors.APIError": {e}'
+                    f'due to "docker.errors.APIError": {e}'
                )
                return
        try:
@@ -164,6 +193,7 @@ class CheckCorporaMixin:
                image,
                command=command,
                detach=detach,
                entrypoint=entrypoint,
                volumes=volumes,
                name=name,
                network=network
@@ -171,14 +201,14 @@ class CheckCorporaMixin:
        except docker.errors.ImageNotFound as e:
            current_app.logger.error(
                f'Run container "{name}" failed '
-                + f'due to "docker.errors.ImageNotFound" error: {e}'
+                f'due to "docker.errors.ImageNotFound" error: {e}'
            )
            corpus.status = 'failed'
            return
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Run container "{name}" failed '
-                + f'due to "docker.errors.APIError" error: {e}'
+                f'due to "docker.errors.APIError" error: {e}'
            )
            return
        corpus.status = 'analysing'
@@ -190,14 +220,14 @@ class CheckCorporaMixin:
        except docker.errors.NotFound as e:
            current_app.logger.error(
                f'Get container "{container_name}" failed '
-                + f'due to "docker.errors.NotFound": {e}'
+                f'due to "docker.errors.NotFound": {e}'
            )
            corpus.num_analysis_sessions = 0
            corpus.status = 'prepared'
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Get container "{container_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
    def remove_cqpserver_container(self, corpus):
@@ -210,7 +240,7 @@ class CheckCorporaMixin:
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Get container "{container_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
            return
        try:
@@ -218,5 +248,5 @@ class CheckCorporaMixin:
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Remove container "{container_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
@@ -2,7 +2,7 @@ from datetime import datetime
 from flask import current_app
 from werkzeug.utils import secure_filename
 from .. import db
-from ..models import Job, JobResult
+from ..models import Job, JobResult, TesseractOCRModel
 import docker
 import json
 import os
@@ -23,27 +23,34 @@ class CheckJobsMixin:
        ''' # Docker service settings # '''
        ''' ## Service specific settings ## '''
        if job.service == 'file-setup':
-            mem_mb = 2048
+            mem_mb = 512
            n_cores = 2
            executable = 'file-setup'
-            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:{job.service_version}'  # noqa
+            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:v{job.service_version}'  # noqa
-        elif job.service == 'ocr':
+        elif job.service == 'tesseract-ocr':
-            mem_mb = 4096
+            mem_mb = 2048
            n_cores = 4
            executable = 'ocr'
-            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:{job.service_version}'  # noqa
+            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:v{job.service_version}'  # noqa
-        elif job.service == 'nlp':
+        elif job.service == 'spacy-nlp':
-            mem_mb = 2048
+            mem_mb = 1024
-            n_cores = 2
+            n_cores = 1
            executable = 'nlp'
-            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:{job.service_version}'  # noqa
+            image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:v{job.service_version}'  # noqa
        ''' ## Command ## '''
        command = f'{executable} -i /input -o /output'
-        command += ' --log-dir /input'
+        command += ' --log-dir /logs'
        command += f' --mem-mb {mem_mb}'
        command += f' --n-cores {n_cores}'
-        command += f' --zip [{job.service}]_{secure_filename(job.title)}'
+        service_args = json.loads(job.service_args)
-        command += ' ' + ' '.join(json.loads(job.service_args))
+        if job.service == 'spacy-nlp':
            command += f' -m {service_args["model"]}'
            if 'encoding_detection' in service_args and service_args['encoding_detection']:  # noqa
                command += ' --check-encoding'
        elif job.service == 'tesseract-ocr':
            command += f' -m {service_args["model"]}'
            if 'binarization' in service_args and service_args['binarization']:
                command += ' --binarize'
        ''' ## Constraints ## '''
        constraints = ['node.role==worker']
        ''' ## Labels ## '''
@@ -53,20 +60,42 @@ class CheckJobsMixin:
            'job_id': str(job.id)
        }
        ''' ## Mounts ## '''
-        ''' ### Input mount ### '''
+        mounts = []
-        input_mount_source = job.path
+        ''' ### Input mount(s) ### '''
-        input_mount_target = '/input'
+        input_mount_target_base = '/input'
        if job.service == 'file-setup':
-            input_mount_target += f'/{secure_filename(job.title)}'
+            input_mount_target_base += f'/{secure_filename(job.title)}'
-        input_mount = f'{input_mount_source}:{input_mount_target}:rw'
+        for job_input in job.inputs:
            input_mount_source = job_input.path
            input_mount_target = f'/{input_mount_target_base}/{job_input.filename}'  # noqa
            input_mount = f'{input_mount_source}:{input_mount_target}:ro'
            mounts.append(input_mount)
        if job.service == 'tesseract-ocr':
            service_args = json.loads(job.service_args)
            model = TesseractOCRModel.query.get(service_args['model'])
            if model is None:
                job.status = 'failed'
                return
            models_mount_source = model.path
            models_mount_target = f'/usr/local/share/tessdata/{model.filename}'
            models_mount = f'{models_mount_source}:{models_mount_target}:ro'
            mounts.append(models_mount)
        ''' ### Output mount ### '''
-        output_mount_source = os.path.join(job.path, 'output')
+        output_mount_source = os.path.join(job.path, 'results')
        output_mount_target = '/output'
        output_mount = f'{output_mount_source}:{output_mount_target}:rw'
        # Make sure that there is no data in the output directory
        shutil.rmtree(output_mount_source, ignore_errors=True)
        os.makedirs(output_mount_source)
-        mounts = [input_mount, output_mount]
+        mounts.append(output_mount)
        ''' ### Pipeline data mount ### '''
        pyflow_data_mount_source = os.path.join(job.path, 'pipeline_data')
        pyflow_data_mount_target = '/logs/pyflow.data'
        pyflow_data_mount = f'{pyflow_data_mount_source}:{pyflow_data_mount_target}:rw'  # noqa
        # Make sure that there is no data in the pipeline data directory
        shutil.rmtree(pyflow_data_mount_source, ignore_errors=True)
        os.makedirs(pyflow_data_mount_source)
        mounts.append(pyflow_data_mount)
        ''' ## Name ## '''
        name = f'job_{job.id}'
        ''' ## Resources ## '''
@@ -90,7 +119,7 @@ class CheckJobsMixin:
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Create service "{name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
            return
        job.status = 'queued'
@@ -102,14 +131,14 @@ class CheckJobsMixin:
        except docker.errors.NotFound as e:
            current_app.logger.error(
                f'Get service "{service_name}" failed '
-                + f'due to "docker.errors.NotFound": {e}'
+                f'due to "docker.errors.NotFound": {e}'
            )
            job.status = 'failed'
            return
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Get service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
            return
        service_tasks = service.tasks()
@@ -121,13 +150,25 @@ class CheckJobsMixin:
            return
        elif job.status == 'running' and task_state == 'complete':
            job.status = 'complete'
-            results_dir = os.path.join(job.path, 'output')
+            results_dir = os.path.join(job.path, 'results')
-            result_files = [x for x in os.listdir(results_dir) if x.endswith('.zip')]  # noqa
+            with open(os.path.join(results_dir, 'outputs.json')) as f:
-            for result_file in result_files:
+                outputs = json.load(f)
-                job_result = JobResult(filename=result_file, job=job)
+            for output in outputs:
                filename = os.path.basename(output['file'])
                job_result = JobResult(
                    filename=filename,
                    job=job,
                    mimetype=output['mimetype']
                )
                if 'description' in output:
                    job_result.description = output['description']
                db.session.add(job_result)
-                db.session.flush()
+                db.session.flush(objects=[job_result])
                db.session.refresh(job_result)
                os.rename(
                    os.path.join(results_dir, output['file']),
                    job_result.path
                )
        elif job.status == 'running' and task_state == 'failed':
            job.status = 'failed'
        else:
@@ -138,7 +179,7 @@ class CheckJobsMixin:
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Remove service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
    def remove_job_service(self, job):
@@ -151,7 +192,7 @@ class CheckJobsMixin:
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Get service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
            return
        try:
@@ -159,7 +200,7 @@ class CheckJobsMixin:
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Update service "{service_name}" failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
            return
        try:
@@ -167,5 +208,5 @@ class CheckJobsMixin:
        except docker.errors.APIError as e:
            current_app.logger.error(
                f'Remove "{service_name}" service failed '
-                + f'due to "docker.errors.APIError": {e}'
+                f'due to "docker.errors.APIError": {e}'
            )
@@ -34,12 +34,14 @@ def delete_job(job_id):
@login_required
 def download_job_input(job_id, job_input_id):
    job_input = JobInput.query.filter(JobInput.job_id == job_id, JobInput.id == job_input_id).first_or_404()  # noqa
-    if not (job_input.job.user == current_user
+    if not (job_input.job.user == current_user or current_user.is_administrator()):  # noqa
            or current_user.is_administrator()):
        abort(403)
-    return send_from_directory(as_attachment=True,
+    return send_from_directory(
        as_attachment=True,
        attachment_filename=job_input.filename,
        directory=os.path.dirname(job_input.path),
-                               filename=job_input.filename)
+        filename=os.path.basename(job_input.path)
    )
@bp.route('/<hashid:job_id>/restart')
@@ -59,9 +61,11 @@ def restart(job_id):
@login_required
 def download_job_result(job_id, job_result_id):
    job_result = JobResult.query.filter(JobResult.job_id == job_id, JobResult.id == job_result_id).first_or_404()  # noqa
-    if not (job_result.job.user == current_user
+    if not (job_result.job.user == current_user or current_user.is_administrator()):  # noqa
            or current_user.is_administrator()):
        abort(403)
-    return send_from_directory(as_attachment=True,
+    return send_from_directory(
        as_attachment=True,
        attachment_filename=job_result.filename,
        directory=os.path.dirname(job_result.path),
-                               filename=job_result.filename)
+        filename=os.path.basename(job_result.path)
    )
@@ -4,13 +4,17 @@ from flask_hashids import HashidMixin
 from flask_login import UserMixin
 from itsdangerous import BadSignature, TimedJSONWebSignatureSerializer
 from time import sleep
 from tqdm import tqdm
 from werkzeug.security import generate_password_hash, check_password_hash
 import xml.etree.ElementTree as ET
 from . import db, login
 import base64
 import enum
 import json
 import os
 import requests
 import shutil
 import xml.etree.ElementTree as ET
 import yaml
 class Permission(enum.IntEnum):
@@ -25,7 +29,7 @@ class Permission(enum.IntEnum):
 class FileMixin:
    creation_date = db.Column(db.DateTime, default=datetime.utcnow)
-    filename = db.Column(db.String(256))
+    filename = db.Column(db.String(255))
    last_edited_date = db.Column(db.DateTime, default=datetime.utcnow)
    mimetype = db.Column(db.String(255))
@@ -86,7 +90,7 @@ class Role(HashidMixin, db.Model):
        return dict_role
    @staticmethod
-    def insert_roles():
+    def insert_defaults():
        roles = {
            'User': [],
            'API user': [Permission.USE_API],
@@ -132,6 +136,12 @@ class User(HashidMixin, UserMixin, db.Model):
        db.String(16), default='all')
    # Backrefs: role: Role
    # Relationships
    tesseract_ocr_models = db.relationship(
        'TesseractOCRModel',
        backref='user',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    corpora = db.relationship(
        'Corpus',
        backref='user',
@@ -221,6 +231,12 @@ class User(HashidMixin, UserMixin, db.Model):
    def is_administrator(self):
        return self.can(Permission.ADMINISTRATE)
    def makedirs(self):
        ''' Create this user's directory and its default subdirectories. '''
        user_root = self.path
        os.mkdir(user_root)
        for subdirectory in ('tesseract_ocr_models', 'corpora', 'jobs'):
            os.mkdir(os.path.join(user_root, subdirectory))
    def revoke_token(self):
        self.token_expiration = datetime.utcnow() - timedelta(seconds=1)
@@ -269,6 +285,21 @@ class User(HashidMixin, UserMixin, db.Model):
            return None
        return user
    @staticmethod
    def insert_defaults():
        ''' Create the default "nopaque" user together with its directories.

        Does nothing if the user already exists. If the directories can not
        be created, the error is logged and the new user is rolled back.
        '''
        if User.query.filter_by(username='nopaque').first() is not None:
            return
        user = User(username='nopaque')
        db.session.add(user)
        # Flush and refresh first, so that the database generated values are
        # loaded before user.makedirs() works with user.path
        db.session.flush(objects=[user])
        db.session.refresh(user)
        try:
            user.makedirs()
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
        else:
            # Only commit if the directories have been created
            db.session.commit()
    @staticmethod
    def reset_password(token, new_password):
        s = TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY'])
@@ -284,6 +315,72 @@ class User(HashidMixin, UserMixin, db.Model):
        return True
 class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
    '''
    Database record describing a Tesseract OCR model file owned by a user.

    The model file itself is stored on disk at the location returned by the
    path property.
    '''
    __tablename__ = 'tesseract_ocr_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    # compatible_service_versions holds a JSON formatted list (it is written
    # with json.dumps in insert_defaults)
    compatible_service_versions = db.Column(db.String(255))
    description = db.Column(db.String(255))
    publisher = db.Column(db.String(128))
    publishing_year = db.Column(db.Integer)
    title = db.Column(db.String(64))
    version = db.Column(db.String(16))
    # Backrefs: user: User
    @property
    def path(self):
        '''
        File system location of the model file:
        <user path>/tesseract_ocr_models/<id>
        '''
        return os.path.join(
            self.user.path,
            'tesseract_ocr_models',
            str(self.id)
        )
    @staticmethod
    def insert_defaults():
        '''
        Insert the models listed in TesseractOCRModel.defaults.yml.

        Each entry that is not yet present (same title and version) is added
        for the user 'nopaque' and its file is downloaded from the entry's
        url to the model's path. Because path is derived from the owning
        user, the 'nopaque' user has to exist before this is called.
        All new records are committed together after the last download.

        Raises requests.HTTPError if a download answers with an HTTP error
        status, so that an error page is never stored as a model file.
        '''
        user = User.query.filter_by(username='nopaque').first()
        defaults_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'TesseractOCRModel.defaults.yml'
        )
        with open(defaults_file, 'r') as f:
            # safe_load returns None for a file without any active entry
            defaults = yaml.safe_load(f) or []
        for m in defaults:
            if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None:  # noqa
                continue
            tesseract_ocr_model = TesseractOCRModel(
                compatible_service_versions=json.dumps(m['compatible_service_versions']),  # noqa
                description=m['description'],
                publisher=m['publisher'],
                publishing_year=m['publishing_year'],
                title=m['title'],
                user=user,
                version=m['version']
            )
            db.session.add(tesseract_ocr_model)
            # The id is needed for the filename and the path below
            db.session.flush(objects=[tesseract_ocr_model])
            db.session.refresh(tesseract_ocr_model)
            tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata'  # noqa
            # The context managers release the streamed connection, the
            # progress bar and the file even if the download fails midway
            with requests.get(m['url'], stream=True) as r:
                r.raise_for_status()
                # Content-Length is optional (e.g. chunked transfer); tqdm
                # accepts total=None and then shows a bar without a total
                content_length = r.headers.get('Content-Length')
                total = None if content_length is None else int(content_length)
                with tqdm(
                    desc=f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})',  # noqa
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    total=total
                ) as pbar:
                    pbar.clear()
                    with open(tesseract_ocr_model.path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive new chunks
                                pbar.update(len(chunk))
                                f.write(chunk)
        db.session.commit()
 class JobInput(FileMixin, HashidMixin, db.Model):
    __tablename__ = 'job_inputs'
    # Primary key
@@ -309,7 +406,7 @@ class JobInput(FileMixin, HashidMixin, db.Model):
    @property
    def path(self):
-        return os.path.join(self.job.path, self.filename)
+        return os.path.join(self.job.path, 'inputs', str(self.id))
    def to_dict(self, backrefs=False, relationships=False):
        dict_job_input = {
@@ -347,6 +444,8 @@ class JobResult(FileMixin, HashidMixin, db.Model):
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
    # Fields
    description = db.Column(db.String(255))
    # Backrefs: job: Job
    def __repr__(self):
@@ -366,12 +465,13 @@ class JobResult(FileMixin, HashidMixin, db.Model):
    @property
    def path(self):
-        return os.path.join(self.job.path, 'output', self.filename)
+        return os.path.join(self.job.path, 'results', str(self.id))
    def to_dict(self, backrefs=False, relationships=False):
        dict_job_result = {
            'id': self.hashid,
            'job_id': self.job.hashid,
            'description': self.description,
            'download_url': self.download_url,
            'url': self.url,
            **self.file_mixin_to_dict(
@@ -414,8 +514,8 @@ class Job(HashidMixin, db.Model):
    end_date = db.Column(db.DateTime())
    service = db.Column(db.String(64))
    '''
-    ' Service specific arguments as string list.
+    ' Dictionary as JSON formatted string.
-    ' Example: ["-l eng", "--binarize"]
+    ' Example: {"binarization": true}
    '''
    service_args = db.Column(db.String(255))
    service_version = db.Column(db.String(16))
@@ -472,6 +572,12 @@ class Job(HashidMixin, db.Model):
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)
    def makedirs(self):
        '''
        Create the job directory and the subdirectories it is made up of.

        os.mkdir is used for every directory, so an OSError is raised when a
        directory already exists or its parent directory is missing.
        '''
        job_dir = self.path
        os.mkdir(job_dir)
        for subdir in ('inputs', 'pipeline_data', 'results'):
            os.mkdir(os.path.join(job_dir, subdir))
    def restart(self):
        '''
        Restart a job - only if the status is complete or failed
@@ -479,7 +585,7 @@ class Job(HashidMixin, db.Model):
        if self.status not in ['complete', 'failed']:
            raise Exception('Could not restart job: status is not "complete/failed"')  # noqa
-        shutil.rmtree(os.path.join(self.path, 'output'), ignore_errors=True)
+        shutil.rmtree(os.path.join(self.path, 'results'), ignore_errors=True)
        shutil.rmtree(os.path.join(self.path, 'pyflow.data'), ignore_errors=True)  # noqa
        for result in self.results:
            db.session.delete(result)
@@ -487,6 +593,10 @@ class Job(HashidMixin, db.Model):
        self.status = 'submitted'
    def to_dict(self, backrefs=False, relationships=False):
        service_args = json.loads(self.service_args)
        if self.service == 'tesseract-ocr' and 'model' in service_args:
            tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model'])  # noqa
            service_args['model'] = tesseract_ocr_pipeline_model.title
        dict_job = {
            'id': self.hashid,
            'user_id': self.user.hashid,
@@ -494,7 +604,7 @@ class Job(HashidMixin, db.Model):
            'description': self.description,
            'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z',  # noqa
            'service': self.service,
-            'service_args': self.service_args,
+            'service_args': service_args,
            'service_version': self.service_version,
            'status': self.status,
            'title': self.title,
@@ -550,7 +660,7 @@ class CorpusFile(FileMixin, HashidMixin, db.Model):
    @property
    def path(self):
-        return os.path.join(self.corpus.path, self.filename)
+        return os.path.join(self.corpus.path, 'files', str(self.id))
    @property
    def url(self):
@@ -659,28 +769,27 @@ class Corpus(HashidMixin, db.Model):
        return self.user.hashid
    def build(self):
        output_dir = os.path.join(self.path, 'merged')
        shutil.rmtree(output_dir, ignore_errors=True)
        os.mkdir(output_dir)
        output_file = os.path.join(output_dir, 'corpus.vrt')
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        for corpus_file in self.files:
            element_tree = ET.parse(corpus_file.path)
-            text_node = element_tree.find('text')
+            text_element = element_tree.getroot()
-            text_node.set('address', corpus_file.address or 'NULL')
+            text_element.set('address', corpus_file.address or 'NULL')
-            text_node.set('author', corpus_file.author)
+            text_element.set('author', corpus_file.author)
-            text_node.set('booktitle', corpus_file.booktitle or 'NULL')
+            text_element.set('booktitle', corpus_file.booktitle or 'NULL')
-            text_node.set('chapter', corpus_file.chapter or 'NULL')
+            text_element.set('chapter', corpus_file.chapter or 'NULL')
-            text_node.set('editor', corpus_file.editor or 'NULL')
+            text_element.set('editor', corpus_file.editor or 'NULL')
-            text_node.set('institution', corpus_file.institution or 'NULL')
+            text_element.set('institution', corpus_file.institution or 'NULL')
-            text_node.set('journal', corpus_file.journal or 'NULL')
+            text_element.set('journal', corpus_file.journal or 'NULL')
-            text_node.set('pages', corpus_file.pages or 'NULL')
+            text_element.set('pages', corpus_file.pages or 'NULL')
-            text_node.set('publisher', corpus_file.publisher or 'NULL')
+            text_element.set('publisher', corpus_file.publisher or 'NULL')
-            text_node.set('publishing_year', str(corpus_file.publishing_year))
+            text_element.set('publishing_year', str(corpus_file.publishing_year))  # noqa
-            text_node.set('school', corpus_file.school or 'NULL')
+            text_element.set('school', corpus_file.school or 'NULL')
-            text_node.set('title', corpus_file.title)
+            text_element.set('title', corpus_file.title)
-            corpus_element.insert(1, text_node)
+            corpus_element.insert(1, text_element)
-        ET.ElementTree(corpus_element).write(output_file, encoding='utf-8')
+        ET.ElementTree(corpus_element).write(
            os.path.join(self.path, 'cwb', 'corpus.vrt'),
            encoding='utf-8'
        )
        self.last_edited_date = datetime.utcnow()
        self.status = 'submitted'
@@ -688,6 +797,13 @@ class Corpus(HashidMixin, db.Model):
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)
    def makedirs(self):
        '''
        Create the corpus directory tree: the corpus directory itself, files,
        cwb, cwb/data and cwb/registry.

        Parents are listed before their children because os.mkdir does not
        create intermediate directories; any failure raises an OSError.
        '''
        corpus_dir = self.path
        layout = (
            (),
            ('files',),
            ('cwb',),
            ('cwb', 'data'),
            ('cwb', 'registry')
        )
        for parts in layout:
            # os.path.join with no extra parts returns corpus_dir unchanged
            os.mkdir(os.path.join(corpus_dir, *parts))
    def to_dict(self, backrefs=False, relationships=False):
        dict_corpus = {
            'id': self.hashid,
@@ -1,77 +1,13 @@
 from flask import Blueprint
 import os
 import yaml
-SERVICES = {
+services_file = os.path.join(
-    'file-setup': {
+    os.path.dirname(os.path.abspath(__file__)), 'services.yml')
-        'name': 'File setup',
+with open(services_file, 'r') as f:
-        'versions': {
+    SERVICES = yaml.safe_load(f)
            'latest': '1.0.0b',
            '1.0.0b': {
                'publishing_data': {
                    'date': None,
                    'title': 'nopaque File setup service',
                    'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/tree/1.0.0b',  # noqa
                    'version': '1.0.0'
                }
            }
        }
    },
    'nlp': {
        'name': 'Natural Language Processing',
        'versions': {
            'latest': '1.0.0b',
            '1.0.0b': {
                'check_encoding': True,
                'models': {
                    'de': 'German',
                    'en': 'English',
                    'it': 'Italian',
                    'nl': 'Dutch',
                    'pl': 'Polish',
                    'zh': 'Chinese'
                },
                'publishing_data': {
                    'date': None,
                    'title': 'nopaque NLP service',
                    'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/tree/1.0.0b',  # noqa
                    'version': '1.0.0'
                }
            }
        }
    },
    'ocr': {
        'name': 'Optical Character Recognition',
        'versions': {
            'latest': '1.0.0b',
            '1.0.0b': {
                'binarization': True,
                'models': {
                    'ara': 'Arabic',
                    'chi_tra': 'Chinese - Traditional',
                    'dan': 'Danish',
                    'eng': 'English',
                    'enm': 'English, Middle 1100-1500',
                    'fra': 'French',
                    'frm': 'French, Middle ca. 1400-1600',
                    'deu': 'German',
                    'frk': 'German Fraktur',
                    'ell': 'Greek, Modern (1453-)',
                    'ita': 'Italian',
                    'por': 'Portuguese',
                    'rus': 'Russian',
                    'spa': 'Spanish; Castilian',
                },
                'publishing_data': {
                    'date': None,
                    'title': 'nopaque OCR service',
                    'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/tree/1.0.0b',  # noqa
                    'version': '1.0.0'
                }
            }
        }
    }
 }
 bp = Blueprint('services', __name__)
-from . import routes
+from . import routes  # noqa
@@ -1,3 +1,4 @@
 from app.models import TesseractOCRModel
 from flask_wtf import FlaskForm
 from wtforms import (BooleanField, MultipleFileField, SelectField, StringField,
                     SubmitField, ValidationError)
@@ -6,85 +7,105 @@ from . import SERVICES
 class AddJobForm(FlaskForm):
-    description = StringField('Description',
+    description = StringField('Description', validators=[DataRequired(), Length(1, 255)])  # noqa
                              validators=[DataRequired(), Length(1, 255)])
    submit = SubmitField()
    title = StringField('Title', validators=[DataRequired(), Length(1, 32)])
    version = SelectField('Version', validators=[DataRequired()])
-class AddNLPJobForm(AddJobForm):
+class AddSpacyNLPJobForm(AddJobForm):
-    check_encoding = BooleanField('Check encoding')
+    encoding_detection = BooleanField('Encoding detection')
    files = MultipleFileField('Files', validators=[DataRequired()])
-    language = SelectField('Language',  choices=[('', 'Choose your option')],
+    model = SelectField(
-                           default='', validators=[DataRequired()])
+        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )
-    def validate_check_encoding(self, field):
+    def validate_encoding_detection(self, field):
-        if field.data and 'check_encoding' not in SERVICES['nlp']['versions'][self.version.data]:  # noqa
+        service_info = SERVICES['spacy-nlp']['versions'][self.version.data]
-            raise ValidationError('Check encoding is not available in this version')  # noqa
+        if field.data and 'encoding_detection' not in service_info:
            raise ValidationError('Encoding detection is not available')
    def validate_files(form, field):
        valid_extensions = ['.txt']
        for file in field.data:
-            if not file.filename.lower().endswith('.txt'):
+            if not file.filename.lower().endswith(tuple(valid_extensions)):
-                raise ValidationError('File does not have an approved '
+                raise ValidationError(
-                                      'extension: .txt')
+                    'File does not have an approved extension: '
                    '/'.join(valid_extensions)
                )
    def __init__(self, *args, **kwargs):
-        version = kwargs.pop('version', SERVICES['nlp']['versions']['latest'])
+        version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version'])  # noqa
        super().__init__(*args, **kwargs)
-        if 'check_encoding' not in SERVICES['nlp']['versions'][version]:
+        service_info = SERVICES['spacy-nlp']['versions'][version]
-            self.check_encoding.render_kw = {'disabled': True}
+        if 'check_encoding' not in service_info['methods']:
-        self.language.choices += [(x, y) for x, y in SERVICES['nlp']['versions'][version]['models'].items()]  # noqa
+            self.encoding_detection.render_kw = {'disabled': True}
-        self.version.choices = [(x, x) for x in SERVICES['nlp']['versions'] if x != 'latest']  # noqa
+        self.model.choices += [(x, y) for x, y in service_info['models'].items()]  # noqa
        self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']]  # noqa
        self.version.default = version
-class AddOCRJobForm(AddJobForm):
+class AddTesseractOCRJobForm(AddJobForm):
-    binarization = BooleanField('Binarazation')
+    binarization = BooleanField('Binarization')
    files = MultipleFileField('Files', validators=[DataRequired()])
-    language = SelectField('Language', choices=[('', 'Choose your option')],
+    model = SelectField(
-                           default='', validators=[DataRequired()])
+        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )
    def validate_binarization(self, field):
-        if field.data and 'binarization' not in SERVICES['ocr']['versions'][self.version.data]:  # noqa
+        service_info = SERVICES['tesseract-ocr']['versions'][self.version.data]
-            raise ValidationError('Binarization is not available in this version')  # noqa
+        if field.data and 'binarization' not in service_info:
            raise ValidationError('Binarization is not available')
    def validate_files(self, field):
        valid_extensions = ['.pdf']
        for file in field.data:
-            if not file.filename.lower().endswith('.pdf'):
+            if not file.filename.lower().endswith(tuple(valid_extensions)):
-                raise ValidationError('File does not have an approved '
+                raise ValidationError(
-                                      'extension: .pdf')
+                    'File does not have an approved extension: '
                    '/'.join(valid_extensions)
                )
    def __init__(self, *args, **kwargs):
-        version = kwargs.pop('version', SERVICES['ocr']['versions']['latest'])
+        version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version'])  # noqa
        super().__init__(*args, **kwargs)
-        if 'binarization' not in SERVICES['ocr']['versions'][version]:
+        service_info = SERVICES['tesseract-ocr']['versions'][version]
        if 'binarization' not in service_info['methods']:
            self.binarization.render_kw = {'disabled': True}
-        self.language.choices += [(x, y) for x, y in SERVICES['ocr']['versions'][version]['models'].items()]  # noqa
+        self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()]  # noqa
-        self.version.choices = [(x, x) for x in SERVICES['ocr']['versions'] if x != 'latest']  # noqa
+        self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']]  # noqa
-        self.version.default = version
+        self.version.data = version
        self.version.default = SERVICES['tesseract-ocr']['latest_version']
 class AddFileSetupJobForm(AddJobForm):
    files = MultipleFileField('Files', validators=[DataRequired()])
    def validate_files(form, field):
        valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif']
        for file in field.data:
-            if not file.filename.lower().endswith(('.jpeg', '.jpg', '.png',
+            if not file.filename.lower().endswith(tuple(valid_extensions)):
-                                                   '.tiff', '.tif')):
+                raise ValidationError(
-                raise ValidationError('File does not have an approved '
+                    'File does not have an approved extension: '
-                                      'extension: .jpeg | .jpg | .png | .tiff '
+                    '/'.join(valid_extensions)
-                                      '| .tif')
+                )
    def __init__(self, *args, **kwargs):
-        version = kwargs.pop('version', SERVICES['file-setup']['versions']['latest'])
+        version = kwargs.pop('version', SERVICES['file-setup']['latest_version'])  # noqa
        super().__init__(*args, **kwargs)
-        self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions'] if x != 'latest']  # noqa
+        self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']]  # noqa
-        self.version.default = version
+        self.version.data = version
        self.version.default = SERVICES['file-setup']['latest_version']
 AddJobForms = {
    'file-setup': AddFileSetupJobForm,
-    'ocr': AddOCRJobForm,
+    'tesseract-ocr': AddTesseractOCRJobForm,
-    'nlp': AddNLPJobForm
+    'spacy-nlp': AddSpacyNLPJobForm
 }
@@ -1,3 +1,4 @@
 from app import hashids
 from flask import (abort, current_app, flash, make_response, render_template,
                   request, url_for)
 from flask_login import current_user, login_required
@@ -8,7 +9,6 @@ from .. import db
 from .forms import AddJobForms
 from ..models import Job, JobInput
 import json
 import os
@bp.route('/corpus-analysis')
@@ -24,57 +24,65 @@ def service(service):
    # Check if the requested service exist
    if service not in SERVICES or service not in AddJobForms:
        abort(404)
-    version = request.args.get(
+    version = request.args.get('version', SERVICES[service]['latest_version'])
        'version', SERVICES[service]['versions']['latest'])
    if version not in SERVICES[service]['versions']:
        abort(404)
    form = AddJobForms[service](prefix='add-job-form', version=version)
    form.version.data = version
    title = SERVICES[service]['name']
    versions = SERVICES[service]['versions']
    if form.is_submitted():
        if not form.validate():
            return make_response(form.errors, 400)
-        service_args = []
+        service_args = {}
-        if service == 'nlp':
+        if service == 'spacy-nlp':
-            service_args.append(f'-l {form.language.data}')
+            service_args['model'] = form.model.data
-            if form.check_encoding.data:
+            if form.encoding_detection.data:
-                service_args.append('--check-encoding')
+                service_args['encoding_detection'] = True
-        if service == 'ocr':
+        if service == 'tesseract-ocr':
-            service_args.append(f'-l {form.language.data}')
+            service_args['model'] = hashids.decode(form.model.data)
            if form.binarization.data:
-                service_args.append('--binarize')
+                service_args['binarization'] = True
-        job = Job(user=current_user,
+        job = Job(
            user=current_user,
            description=form.description.data,
-                  service=service, service_args=json.dumps(service_args),
+            service=service,
            service_args=json.dumps(service_args),
            service_version=form.version.data,
-                  status='preparing', title=form.title.data)
+            status='preparing',
            title=form.title.data
        )
        db.session.add(job)
-        db.session.flush()
+        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
-            os.makedirs(job.path)
+            job.makedirs()
-        except OSError:
+        except OSError as e:
-            current_app.logger.error(f'Make dir {job.path} led to an OSError!')
+            current_app.logger.error(e)
            db.session.rollback()
            flash('Internal Server Error', 'error')
-            return make_response(
+            return make_response({'redirect_url': url_for('.service', service=service)}, 500)  # noqa
                {'redirect_url': url_for('.service', service=service)}, 500)
        else:
        for file in form.files.data:
            filename = secure_filename(file.filename)
            job_input = JobInput(
-                    filename=filename, job=job, mimetype=file.mimetype)
+                filename=filename,
-                file.save(job_input.path)
+                job=job,
                mimetype=file.mimetype
            )
            db.session.add(job_input)
            db.session.flush(objects=[job_input])
            db.session.refresh(job_input)
            try:
                file.save(job_input.path)
            except OSError as e:
                current_app.logger.error(e)
                db.session.rollback()
                flash('Internal Server Error', 'error')
                return make_response({'redirect_url': url_for('.service', service=service)}, 500)  # noqa
        job.status = 'submitted'
        db.session.commit()
        flash(f'Job "{job.title}" added', 'job')
-            return make_response(
+        return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)  # noqa
                {'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)
    return render_template(
        f'services/{service.replace("-", "_")}.html.j2',
        form=form,
-        title=title,
+        title=title
        versions=versions
    )
@@ -0,0 +1,38 @@
 # Registry of the available services and their published versions.
 # NOTE(review): presumably this is the services.yml that the services
 # blueprint loads into SERVICES - confirm.
 #
 # Layout, as read by the job forms and the service route:
 #   <service id>:         key used for SERVICES[...] lookups and in URLs
 #     name:               display name, used as the page title
 #     latest_version:     version used when a request does not name one
 #     versions:
 #       <version>:        every key is offered as a choice in the job form
 #         methods:        optional features of this version; the forms
 #                         disable an option whose name is not listed here
 #         models:         model code -> label shown in the model select
 #         publisher, publishing_year, url: publishing information
 # TODO: This could also be done via GitLab/GitHub APIs
 #file-setup-pipeline:
 file-setup:
  name: 'File setup pipeline'
  latest_version: '0.1.0'
  versions:
    0.1.0:
      publisher: 'Bielefeld University - CRC 1288 - INF'
      publishing_year: 2022
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0'
 #spacy-nlp-pipeline:
 spacy-nlp:
  name: 'spaCy NLP'
  latest_version: '0.1.0'
  versions:
    0.1.0:
      methods:
        - 'encoding_detection'
      # Model codes with the labels shown to the user
      models:
        de: 'German'
        en: 'English'
        it: 'Italian'
        pl: 'Polish'
        zh: 'Chinese'
      publisher: 'Bielefeld University - CRC 1288 - INF'
      publishing_year: 2022
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0'
 #tesseract-ocr-pipeline:
 # No models key here: the OCR job form fills its model choices from the
 # TesseractOCRModel database table instead.
 tesseract-ocr:
  name: 'Tesseract OCR'
  latest_version: '0.1.0'
  versions:
    0.1.0:
      methods:
        - 'binarization'
      publisher: 'Bielefeld University - CRC 1288 - INF'
      publishing_year: 2022
      url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0'
@@ -50,8 +50,8 @@ h1 .nopaque-icons, h2 .nopaque-icons, h3 .nopaque-icons, h4 .nopaque-icons,
 }
 .nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";}
 .nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";}
-.nopaque-icons.service-icon[data-service="nlp"]:empty:before {content: "G";}
+.nopaque-icons.service-icon[data-service="spacy-nlp"]:empty:before {content: "G";}
-.nopaque-icons.service-icon[data-service="ocr"]:empty:before {content: "F";}
+.nopaque-icons.service-icon[data-service="tesseract-ocr"]:empty:before {content: "F";}
 .status-text[data-status]:empty:before {content: attr(data-status);}
@@ -53,7 +53,7 @@ class CorpusAnalysisApp {
    this.data.cQiClient = new CQiClient(this.settings.corpusId);
    this.data.cQiClient.connect()
      .then(cQiStatus => {
-        return this.data.cQiClient.corpora.get('CORPUS');
+        return this.data.cQiClient.corpora.get(`NOPAQUE_${this.settings.corpusId}`);
      })
      .then(
        cQiCorpus => {
@@ -100,7 +100,10 @@ class JobDisplay extends RessourceDisplay {
  }
  setServiceArgs(serviceArgs) {
-    this.setElements(this.displayElement.querySelectorAll('.job-service-args'), serviceArgs);
+    this.setElements(
      this.displayElement.querySelectorAll('.job-service-args'),
      JSON.stringify(serviceArgs)
    );
  }
  setServiceVersion(serviceVersion) {
@@ -10,25 +10,10 @@ class JobResultList extends RessourceList {
      </tr>
    `.trim(),
    ressourceMapper: jobResult => {
      let description;
      if (jobResult.filename.endsWith('.pdf.zip')) {
        description = 'PDF files with text layer';
      } else if (jobResult.filename.endsWith('.txt.zip')) {
        description = 'Raw text files';
      } else if (jobResult.filename.endsWith('.vrt.zip')) {
        description = 'VRT compliant files including the NLP data';
      } else if (jobResult.filename.endsWith('.xml.zip')) {
        description = 'TEI compliant files';
      } else if (jobResult.filename.endsWith('.poco.zip')) {
        description = 'HOCR and image files for post correction (PoCo)';
      } else {
        description = 'All result files created during this job';
      }
      return {
        id: jobResult.id,
        creationDate: jobResult.creation_date,
-        description: description,
+        description: jobResult.description,
        filename: jobResult.filename
      };
    },
@@ -19,12 +19,12 @@
    'darken': '#a1b300',
    'lighten': '#f2f3e1'
  },
-  'nlp': {
+  'spacy-nlp': {
    'base': '#98acd2',
    'darken': '#0064a3',
    'lighten': '#e5e8f5'
  },
-  'ocr': {
+  'tesseract-ocr': {
    'base': '#a9d8c8',
    'darken': '#00a58b',
    'lighten': '#e7f4f1'
@@ -15,8 +15,8 @@
  <li><div class="divider"></div></li>
  <li><a class="subheader">Processes & Services</a></li>
  <li class="service-color service-color-border border-darken" data-service="file-setup" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='file-setup') }}"><i class="nopaque-icons service-icon" data-service="file-setup"></i>File setup</a></li>
-  <li class="service-color service-color-border border-darken" data-service="ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='ocr') }}"><i class="nopaque-icons service-icon" data-service="ocr"></i>OCR</a></li>
+  <li class="service-color service-color-border border-darken" data-service="tesseract-ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='tesseract-ocr') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr"></i>OCR</a></li>
-  <li class="service-color service-color-border border-darken" data-service="nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='nlp') }}"><i class="nopaque-icons service-icon" data-service="nlp"></i>NLP</a></li>
+  <li class="service-color service-color-border border-darken" data-service="spacy-nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='spacy-nlp') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp"></i>NLP</a></li>
  <li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='corpus-analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li>
  <li><div class="divider"></div></li>
  <li><a class="subheader">Account</a></li>
@@ -28,6 +28,9 @@
  {% if current_user.can(Permission.ADMINISTRATE) %}
  <li><a href="{{ url_for('admin.index') }}"><i class="material-icons">admin_panel_settings</i>Administration</a></li>
  {% endif %}
  {% if current_user.can(Permission.CONTRIBUTE) %}
  <li><a href="{{ url_for('contribute.index') }}"><i class="material-icons">new_label</i>Contribute</a></li>
  {% endif %}
  {% if current_user.can(Permission.USE_API) %}
  <li><a href="{{ url_for('api.doc') }}"><i class="material-icons">api</i>API</a></li>
  {% endif %}
@@ -120,32 +120,32 @@
            </a>
            <br><br>
            <p class="service-color-text darken" data-service="file-setup"><b>File setup</b></p>
-            <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p>
+            <p class="light">Digital copies of text-based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.</p>
            <a href="{{ url_for('services.service', service='file-setup') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup">Create Job</a>
          </div>
      </div>
      <div class="col s12 m4">
          <div class="card-panel center-align hoverable">
            <br>
-            <a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
+            <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
-              <i class="nopaque-icons service-color darken service-icon" data-service="ocr" style="font-size: 2.5rem;"></i>
+              <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr" style="font-size: 2.5rem;"></i>
            </a>
            <br><br>
-            <p class="service-color-text darken" data-service="ocr"><b>Optical Character Recognition</b></p>
+            <p class="service-color-text darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
            <p class="light">nopaque converts your image data – like photos or scans – into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.</p>
-            <a href="{{ url_for('services.service', service='ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="ocr">Create Job</a>
+            <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr">Create Job</a>
          </div>
      </div>
      <div class="col s12 m4">
          <div class="card-panel center-align hoverable">
            <br>
-            <a href="{{ url_for('services.service', service='nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
+            <a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
-              <i class="nopaque-icons service-color darken service-icon" data-service="nlp" style="font-size: 2.5rem;"></i>
+              <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp" style="font-size: 2.5rem;"></i>
            </a>
            <br><br>
-            <p class="service-color-text darken" data-service="nlp"><b>Natural Language Processing</b></p>
+            <p class="service-color-text darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p>
            <p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p>
-            <a href="{{ url_for('services.service', service='nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="nlp">Create Job</a>
+            <a href="{{ url_for('services.service', service='spacy-nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="spacy-nlp">Create Job</a>
          </div>
      </div>
    </div>
@@ -84,11 +84,11 @@
              <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p>
            </div>
            <div class="col s12 m6 l3 center-align">
-              <a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
+              <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
-                <i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i>
+                <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
              </a>
              <br><br>
-              <p class="service-color-text text-darken" data-service="ocr"><b>Optical Character Recognition</b></p>
+              <p class="service-color-text text-darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
              <p class="light">nopaque converts your image data – like photos or scans – into text data through OCR, making it machine-readable. This step enables you to proceed with further computational analysis of your documents.</p>
            </div>
            <div class="col s12 m6 l3 center-align">
@@ -2,7 +2,7 @@
 {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
 {% import "materialize/wtf.html.j2" as wtf %}
-{% block main_attribs %} class="service-scheme" data-service="nlp"{% endblock main_attribs %}
+{% block main_attribs %} class="service-scheme" data-service="spacy-nlp"{% endblock main_attribs %}
 {% block page_content %}
 <div class="container">
@@ -16,13 +16,13 @@
        <p class="hide-on-small-only">&nbsp;</p>
        <p class="hide-on-small-only">&nbsp;</p>
        <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
-          <i class="nopaque-icons service-color darken service-icon" data-service="nlp"></i>
+          <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i>
        </a>
      </div>
    </div>
    <div class="col s12 m9 pull-m3">
-      <div class="card service-color-border border-darken" data-service="nlp" style="border-top: 10px solid;">
+      <div class="card service-color-border border-darken" data-service="spacy-nlp" style="border-top: 10px solid;">
        <div class="card-content">
          <div class="row">
            <div class="col s12 m6">
@@ -71,7 +71,7 @@
                {{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }}
              </div>
              <div class="col s12 l4">
-                {{ wtf.render_field(form.language, material_icon='language') }}
+                {{ wtf.render_field(form.model, material_icon='language') }}
              </div>
              <div class="col s12 l3">
                {{ wtf.render_field(form.version, material_icon='apps') }}
@@ -80,13 +80,13 @@
                <span class="card-title">Preprocessing</span>
              </div>
              <div class="col s9">
-                <p>{{ form.check_encoding.label.text }}</p>
+                <p>{{ form.encoding_detection.label.text }}</p>
                <p class="light">If the input files are not created with the nopaque OCR service or you do not know if your text files are UTF-8 encoded, check this switch. We will try to automatically determine the right encoding for your texts to process them.</p>
              </div>
              <div class="col s3 right-align">
                <div class="switch">
                  <label>
-                    {{ form.check_encoding() }}
+                    {{ form.encoding_detection() }}
                    <span class="lever"></span>
                  </label>
                </div>
@@ -2,7 +2,7 @@
 {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
 {% import "materialize/wtf.html.j2" as wtf %}
-{% block main_attribs %} class="service-scheme" data-service="ocr"{% endblock main_attribs %}
+{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr"{% endblock main_attribs %}
 {% block page_content %}
 <div class="container">
@@ -16,13 +16,13 @@
        <p class="hide-on-small-only">&nbsp;</p>
        <p class="hide-on-small-only">&nbsp;</p>
        <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
-          <i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i>
+          <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
        </a>
      </div>
    </div>
    <div class="col s12 m9 pull-m3">
-      <div class="card service-color-border border-darken" data-service="ocr" style="border-top: 10px solid;">
+      <div class="card service-color-border border-darken" data-service="tesseract-ocr" style="border-top: 10px solid;">
        <div class="card-content">
          <div class="row">
            <div class="col s12">
@@ -50,10 +50,10 @@
                {{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
              </div>
              <div class="col s12 l5">
-                {{ wtf.render_field(form.files, accept='application/pdf', color=ocr_color_darken, placeholder='Choose your .pdf files') }}
+                {{ wtf.render_field(form.files, accept='application/pdf', placeholder='Choose your .pdf files') }}
              </div>
              <div class="col s12 l4">
-                {{ wtf.render_field(form.language, material_icon='language') }}
+                {{ wtf.render_field(form.model, material_icon='language') }}
              </div>
              <div class="col s12 l3">
                {{ wtf.render_field(form.version, material_icon='apps') }}
@@ -127,7 +127,7 @@
            </div>
          </div>
          <div class="card-action right-align">
-            {{ wtf.render_field(form.submit, color=ocr_color_darken, material_icon='send') }}
+            {{ wtf.render_field(form.submit, material_icon='send') }}
          </div>
        </form>
      </div>
@@ -1,10 +0,0 @@
 from app import hashids
 from werkzeug.routing import BaseConverter
 class HashidConverter(BaseConverter):
    """URL converter mapping hashid strings in URLs to integer ids and back.

    Encoding and decoding are delegated to the ``hashids`` object imported
    from ``app``.
    """

    def to_python(self, value: str) -> int:
        """Decode the hashid taken from the URL into an integer id.

        Raises:
            ValidationError: if ``value`` does not decode to any integer.
                Werkzeug then treats the rule as not matching the URL
                instead of the request failing with an ``IndexError``.
        """
        # Imported locally so this block does not depend on changes to the
        # module-level import list.
        from werkzeug.routing import ValidationError

        decoded = hashids.decode(value)
        # An undecodable string yields an empty result; indexing it would
        # raise IndexError and end up as a server error.
        if not decoded:
            raise ValidationError()
        return decoded[0]

    def to_url(self, value: int) -> str:
        """Encode the integer id as a hashid string for use in a URL."""
        return hashids.encode(value)
@@ -5,14 +5,14 @@
 version: "3.5"
 networks:
-  reverse-proxy:
+  traefik:
-    external:
+    external: true
-      name: reverse-proxy
+    name: "traefik"
 services:
  nopaque:
    labels:
-      - "traefik.docker.network=reverse-proxy"
+      - "traefik.docker.network=traefik"
      - "traefik.enable=true"
      ### <http> ###
      - "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http"
@@ -0,0 +1,45 @@
 """Add tesseract_ocr_models table and job_results.description column
 Revision ID: ad0d835fe5b1
 Revises: 68ed092ffe5e
 Create Date: 2022-01-18 16:23:45.673993
 """
 from alembic import op
 import sqlalchemy as sa
 # revision identifiers, used by Alembic.
 revision = 'ad0d835fe5b1'
 down_revision = '68ed092ffe5e'
 branch_labels = None
 depends_on = None
 def upgrade():
    """Create ``tesseract_ocr_models`` and add ``job_results.description``."""
    # Columns are listed in the order they are passed to create_table.
    model_columns = [
        sa.Column('creation_date', sa.DateTime(), nullable=True),
        sa.Column('filename', sa.String(length=255), nullable=True),
        sa.Column('last_edited_date', sa.DateTime(), nullable=True),
        sa.Column('mimetype', sa.String(length=255), nullable=True),
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('user_id', sa.Integer(), nullable=True),
        sa.Column('compatible_service_versions', sa.String(length=255), nullable=True),
        sa.Column('description', sa.String(length=255), nullable=True),
        sa.Column('publisher', sa.String(length=128), nullable=True),
        sa.Column('publishing_year', sa.Integer(), nullable=True),
        sa.Column('title', sa.String(length=64), nullable=True),
        sa.Column('version', sa.String(length=16), nullable=True),
    ]
    # user_id references users.id; id is the primary key.
    model_constraints = [
        sa.ForeignKeyConstraint(['user_id'], ['users.id']),
        sa.PrimaryKeyConstraint('id'),
    ]
    op.create_table('tesseract_ocr_models', *model_columns, *model_constraints)

    job_result_description = sa.Column('description', sa.String(length=255), nullable=True)
    op.add_column('job_results', job_result_description)
 def downgrade():
    """Revert this revision.

    Drops the ``description`` column from ``job_results`` and then drops the
    ``tesseract_ocr_models`` table, undoing what ``upgrade`` adds.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_column('job_results', 'description')
    op.drop_table('tesseract_ocr_models')
    # ### end Alembic commands ###
@@ -3,10 +3,9 @@
 import eventlet
 eventlet.monkey_patch()
 from app import db, cli, create_app  # noqa
 from app.models import (Corpus, CorpusFile, Job, JobInput, JobResult,
-                        Permission, QueryResult, Role, User)  # noqa
+                        Permission, QueryResult, Role, TesseractOCRModel, User)  # noqa
 from app import db, cli, create_app  # noqa
 from flask import Flask  # noqa
 from typing import Any, Dict  # noqa
@@ -34,5 +33,6 @@ def make_shell_context() -> Dict[str, Any]:
        'Permission': Permission,
        'QueryResult': QueryResult,
        'Role': Role,
        'TesseractOCRModel': TesseractOCRModel,
        'User': User
    }
@@ -19,5 +19,7 @@ hiredis
 jsonschema
 psycopg2
 python-dotenv
 pyyaml
 redis
 tqdm
 wtforms[email]