Big update, corpus analysis reworked, versioned services, preliminary work for contributions

This commit is contained in:
Patrick Jentsch 2022-02-03 12:39:16 +01:00
parent 0647537192
commit fe938c0ca2
36 changed files with 1552 additions and 431 deletions

View File

@ -0,0 +1,816 @@
# - title: 'Afrikaans'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/afr.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Amharic'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Arabic'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Assamese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Azerbaijani'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Azerbaijani - Cyrillic'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Belarusian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Bengali'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Tibetan'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Bosnian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Bulgarian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Catalan; Valencian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Cebuano'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Czech'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Chinese - Simplified'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Chinese - Traditional'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Cherokee'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Welsh'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Danish'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'German'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Dzongkha'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Greek, Modern (1453-)'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'English'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'English, Middle (1100-1500)'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Esperanto'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Estonian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Basque'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Persian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Finnish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'French'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'German Fraktur'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'French, Middle (ca. 1400-1600)'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Irish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Galician'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Greek, Ancient (-1453)'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Gujarati'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Haitian; Haitian Creole'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Hebrew'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Hindi'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Croatian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Hungarian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Inuktitut'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Indonesian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Icelandic'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Italian'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'Italian - Old'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Javanese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Japanese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Kannada'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Georgian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Georgian - Old'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Kazakh'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Central Khmer'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Kirghiz; Kyrgyz'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Korean'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Kurdish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Lao'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Latin'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Latvian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Lithuanian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Malayalam'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Marathi'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Macedonian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Maltese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Malay'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Burmese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Nepali'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Dutch; Flemish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Norwegian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Oriya'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Panjabi; Punjabi'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Polish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Portuguese'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Pushto; Pashto'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Romanian; Moldavian; Moldovan'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Russian'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Sanskrit'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Sinhala; Sinhalese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Slovak'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Slovenian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Spanish; Castilian'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'Spanish; Castilian - Old'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Albanian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Serbian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Serbian - Latin'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Swahili'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Swedish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Syriac'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Tamil'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Telugu'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Tajik'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Tagalog'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Thai'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Tigrinya'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Turkish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Uighur; Uyghur'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Ukrainian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Urdu'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Uzbek'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Uzbek - Cyrillic'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Vietnamese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Yiddish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'

View File

@ -39,9 +39,6 @@ def create_app(config: Config = Config) -> Flask:
socketio.init_app( socketio.init_app(
app, message_queue=app.config['NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI']) app, message_queue=app.config['NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI'])
# from .utils import HashidConverter
# app.url_map.converters['hashid'] = HashidConverter
from .events import socketio as socketio_events from .events import socketio as socketio_events
from .events import sqlalchemy as sqlalchemy_events from .events import sqlalchemy as sqlalchemy_events
@ -54,6 +51,9 @@ def create_app(config: Config = Config) -> Flask:
from .auth import bp as auth_blueprint from .auth import bp as auth_blueprint
app.register_blueprint(auth_blueprint, url_prefix='/auth') app.register_blueprint(auth_blueprint, url_prefix='/auth')
from .contribute import bp as contribute_blueprint
app.register_blueprint(contribute_blueprint, url_prefix='/contribute')
from .corpora import bp as corpora_blueprint from .corpora import bp as corpora_blueprint
app.register_blueprint(corpora_blueprint, url_prefix='/corpora') app.register_blueprint(corpora_blueprint, url_prefix='/corpora')

View File

@ -1,7 +1,6 @@
from flask import Blueprint from flask import Blueprint
from flask_restx import Api from flask_restx import Api
from .jobs import ns as jobs_ns
from .tokens import ns as tokens_ns from .tokens import ns as tokens_ns
bp = Blueprint('api', __name__) bp = Blueprint('api', __name__)
@ -23,5 +22,4 @@ api = Api(
version='1.0' version='1.0'
) )
api.add_namespace(jobs_ns)
api.add_namespace(tokens_ns) api.add_namespace(tokens_ns)

View File

@ -9,8 +9,12 @@ token_auth = HTTPTokenAuth()
@basic_auth.verify_password @basic_auth.verify_password
def verify_password(email_or_username, password): def verify_password(email_or_username, password):
user = User.query.filter(or_(User.username == email_or_username, user = User.query.filter(
User.email == email_or_username.lower())).first() or_(
User.username == email_or_username,
User.email == email_or_username.lower()
)
).first()
if user and user.verify_password(password): if user and user.verify_password(password):
return user return user

View File

@ -1,48 +0,0 @@
from flask_restx import Namespace, Resource
from .auth import token_auth
from ..jobs import tasks
from ..models import Job
ns = Namespace('jobs', description='Job operations')
@ns.route('')
class API_Jobs(Resource):
    '''Shows a list of all jobs and lets you POST to add new job'''

    @ns.doc(security='apiKey')
    @token_auth.login_required
    def get(self):
        '''List all jobs'''
        # TODO: Implement the correct get_jobs functionality
        # Every job in the database is serialized, without its
        # relationships, into one list.
        serialized_jobs = []
        for job in Job.query.all():
            serialized_jobs.append(job.to_dict(include_relationships=False))
        return serialized_jobs

    @ns.doc(security='apiKey')
    @token_auth.login_required
    def post(self):
        '''Create a new job'''
        # TODO: Implement this
        # Placeholder: nothing is created and nothing is returned yet.
        return None
@ns.route('/<hashid:id>')
class API_Job(Resource):
    '''Show a single job and lets you delete it'''

    @ns.doc(security='apiKey')
    @token_auth.login_required
    def get(self, id):
        '''Get a job by id'''
        # get_or_404 answers with HTTP 404 when no job has this id.
        requested_job = Job.query.get_or_404(id)
        return requested_job.to_dict(include_relationships=False)

    @ns.doc(security='apiKey')
    @token_auth.login_required
    def delete(self, id):
        '''Delete a job by id'''
        doomed_job = Job.query.get_or_404(id)
        # We use this imported task because it will run in the background
        tasks.delete_job(doomed_job.id)
        # Empty body with "204 No Content".
        return '', 204

View File

@ -60,28 +60,37 @@ def register():
return redirect(url_for('main.dashboard')) return redirect(url_for('main.dashboard'))
form = RegistrationForm(prefix='registration-form') form = RegistrationForm(prefix='registration-form')
if form.validate_on_submit(): if form.validate_on_submit():
user = User(email=form.email.data.lower(), user = User(
email=form.email.data.lower(),
password=form.password.data, password=form.password.data,
username=form.username.data) username=form.username.data
)
db.session.add(user) db.session.add(user)
db.session.commit() db.session.flush(objects=[user])
db.session.refresh(user)
try: try:
os.makedirs(user.path) user.makedirs()
except OSError: except OSError as e:
current_app.logger.error( current_app.logger.error(e)
f'Make dir {user.path} led to an OSError!') db.session.rollback()
db.session.delete(user)
db.session.commit()
abort(500) abort(500)
else: else:
token = user.generate_confirmation_token() token = user.generate_confirmation_token()
msg = create_message(user.email, 'Confirm Your Account', msg = create_message(
'auth/email/confirm', token=token, user=user) user.email,
'Confirm Your Account',
'auth/email/confirm',
token=token,
user=user
)
send(msg) send(msg)
flash('A confirmation email has been sent to you by email.') flash('A confirmation email has been sent to you by email.')
return redirect(url_for('.login')) return redirect(url_for('.login'))
return render_template('auth/register.html.j2', form=form, return render_template(
title='Register') 'auth/register.html.j2',
form=form,
title='Register'
)
@bp.route('/confirm/<token>') @bp.route('/confirm/<token>')

View File

@ -1,16 +1,44 @@
from . import db from flask import current_app
from .models import Corpus, Role
from flask_migrate import upgrade from flask_migrate import upgrade
from . import db
from .models import Corpus, Job, Role, User, TesseractOCRModel
import json
import os
import re
def _make_default_dirs():
    '''
    Create the default data directories below NOPAQUE_DATA_DIR.

    The directories "tmp" and "users" are created directly inside the
    directory configured as NOPAQUE_DATA_DIR. Directories that already
    exist are left untouched.

    Raises:
        NotADirectoryError: If one of the paths exists but is not a
            directory.
        OSError: Any other error raised by os.mkdir, e.g. when the base
            directory itself is missing.
    '''
    base_dir = current_app.config['NOPAQUE_DATA_DIR']
    for name in ('tmp', 'users'):
        directory = os.path.join(base_dir, name)
        # Try to create first and inspect afterwards: this avoids the race
        # between an existence check and the mkdir call when two deploy
        # processes run at the same time.
        try:
            os.mkdir(directory)
        except FileExistsError:
            if not os.path.isdir(directory):
                raise NotADirectoryError(f'{directory} is not a directory')
def register(app): def register(app):
@app.cli.command() @app.cli.command()
def deploy(): def deploy():
''' Run deployment tasks. ''' ''' Run deployment tasks. '''
# Make default directories
_make_default_dirs()
# migrate database to latest revision # migrate database to latest revision
upgrade() upgrade()
# create or update user roles
Role.insert_roles() # Insert/Update default database values
current_app.logger.info('Insert/Update default roles')
Role.insert_defaults()
current_app.logger.info('Insert/Update default users')
User.insert_defaults()
current_app.logger.info('Insert/Update default tesseract ocr models')
TesseractOCRModel.insert_defaults()
@app.cli.group() @app.cli.group()
def daemon(): def daemon():
@ -40,3 +68,55 @@ def register(app):
from unittest.suite import TestSuite from unittest.suite import TestSuite
tests: TestSuite = TestLoader().discover('tests') tests: TestSuite = TestLoader().discover('tests')
TextTestRunner(verbosity=2).run(tests) TextTestRunner(verbosity=2).run(tests)
@app.cli.group()
def convert():
    ''' Database convert commands. '''
@convert.command()
def nlp_jobs():
    # Converts jobs of the legacy 'nlp' service to the 'spacy-nlp' service.
    # The stored service_args (a JSON encoded list of command line style
    # strings) are rewritten into a JSON encoded dict:
    #   '--check-encoding' -> {'encoding_detection': True}
    #   '-l xx'            -> {'language': 'xx'}
    # Any other argument is dropped.
    language_pattern = re.compile(r'-l ([a-z]{2})')
    legacy_jobs = Job.query.filter_by(service='nlp').all()
    for job in legacy_jobs:
        job.service = 'spacy-nlp'
        converted_args = {}
        for legacy_arg in json.loads(job.service_args):
            if legacy_arg == '--check-encoding':
                converted_args['encoding_detection'] = True
                continue
            language_match = language_pattern.match(legacy_arg)
            if language_match:
                converted_args['language'] = language_match.group(1)
        job.service_args = json.dumps(converted_args)
    db.session.commit()
@convert.command()
def ocr_jobs():
    # Converts jobs of the legacy 'ocr' service to the 'tesseract-ocr'
    # service. The stored service_args (a JSON encoded list of command line
    # style strings) are rewritten into a JSON encoded dict:
    #   '--binarize' -> {'binarization': True}
    #   '-l xxx'     -> {'model': <id of the matching TesseractOCRModel>}
    # Any other argument is dropped.

    # Language code to TesseractOCRModel.title lookup
    language_code_lookup = {
        'ara': 'Arabic',
        'chi_tra': 'Chinese - Traditional',
        'dan': 'Danish',
        'eng': 'English',
        'enm': 'English, Middle (1100-1500)',
        'fra': 'French',
        'frm': 'French, Middle (ca. 1400-1600)',
        'deu': 'German',
        'frk': 'German Fraktur',
        'ell': 'Greek, Modern (1453-)',
        'ita': 'Italian',
        'por': 'Portuguese',
        'rus': 'Russian',
        'spa': 'Spanish; Castilian'
    }
    # Three letter code with an optional '_suffix', so that codes such as
    # 'chi_tra' are captured completely instead of being cut down to 'chi'.
    language_pattern = re.compile(r'-l ([a-z]{3}(?:_[a-z]+)?)')
    for job in Job.query.filter_by(service='ocr').all():
        job.service = 'tesseract-ocr'
        service_args = json.loads(job.service_args)
        new_service_args = {}
        for service_arg in service_args:
            if service_arg == '--binarize':
                new_service_args['binarization'] = True
                continue
            language_match = language_pattern.match(service_arg)
            if language_match is None:
                continue
            language_code = language_match.group(1)
            if language_code not in language_code_lookup:
                # Fall back to the plain three letter prefix, which is what
                # the previous pattern captured.
                language_code = language_code[:3]
            if language_code not in language_code_lookup:
                raise KeyError(
                    f'Job {job.id}: unknown language code in "{service_arg}"'
                )
            model_title = language_code_lookup[language_code]
            tesseract_ocr_model = TesseractOCRModel.query.filter_by(
                title=model_title
            ).first()
            if tesseract_ocr_model is None:
                # Fail with a clear message instead of an AttributeError on
                # None. Nothing has been committed at this point.
                raise RuntimeError(
                    f'Job {job.id}: no TesseractOCRModel with title '
                    f'"{model_title}" found'
                )
            new_service_args['model'] = tesseract_ocr_model.id
        job.service_args = json.dumps(new_service_args)
    db.session.commit()

View File

@ -0,0 +1,5 @@
from flask import Blueprint
# Blueprint object that the views of this package register themselves on.
bp = Blueprint('contribute', __name__)
# Kept below the bp assignment: the routes module itself does
# `from . import bp`, so bp has to exist before routes is loaded.
from . import routes

19
app/contribute/routes.py Normal file
View File

@ -0,0 +1,19 @@
from flask import flash, redirect, render_template, url_for
from flask_login import login_required
from . import bp
from .. import db
from ..decorators import permission_required
from ..models import Permission, Role, User
from ..settings import tasks as settings_tasks
@bp.before_request
@login_required
@permission_required(Permission.CONTRIBUTE)
def before_request():
    '''
    Blueprint-wide hook without a body of its own.

    It is registered via bp.before_request and wrapped by login_required
    and permission_required(Permission.CONTRIBUTE), so those decorators are
    what act on a request here (presumably rejecting users that are not
    logged in or lack the CONTRIBUTE permission - confirm in ..decorators).
    '''
@bp.route('/')
def index():
    ''' Placeholder view for the root URL of the contribute blueprint. '''
    # NOTE(review): this stub returns None. Flask does not accept None as a
    # view return value, so a real response is needed before this blueprint
    # is reachable by users.
    return None

View File

@ -93,12 +93,12 @@ def connect(auth):
@socketio.on('disconnect', namespace=NAMESPACE) @socketio.on('disconnect', namespace=NAMESPACE)
def disconnect(): def disconnect():
if 'd' not in session:
return
session['d']['cqi_client_lock'].acquire() session['d']['cqi_client_lock'].acquire()
try: try:
session['d']['cqi_client'].disconnect() session['d']['cqi_client'].disconnect()
except cqi.errors.CQiException: except (BrokenPipeError, cqi.errors.CQiException):
pass
except BrokenPipeError:
pass pass
session['d']['cqi_client_lock'].release() session['d']['cqi_client_lock'].release()
corpus = Corpus.query.get(session['d']['corpus_id']) corpus = Corpus.query.get(session['d']['corpus_id'])

View File

@ -12,7 +12,10 @@ def cqi_over_socketio(f):
f_args = {} f_args = {}
# Check for missing args and if all provided args are of the right type # Check for missing args and if all provided args are of the right type
for param in signature(f).parameters.values(): for param in signature(f).parameters.values():
if param.annotation == cqi.CQiClient: if param.name == 'corpus_name':
f_args[param.name] = f'NOPAQUE_{session["d"]["corpus_id"]}'
continue
if param.name == 'cqi_client':
f_args[param.name] = session['d']['cqi_client'] f_args[param.name] = session['d']['cqi_client']
continue continue
if param.default is param.empty: if param.default is param.empty:

View File

@ -1,6 +1,7 @@
from flask import (abort, current_app, flash, make_response, redirect, from flask import (abort, current_app, flash, make_response, redirect,
render_template, url_for, send_from_directory) render_template, url_for, send_from_directory)
from flask_login import current_user, login_required from flask_login import current_user, login_required
from werkzeug.utils import secure_filename
from . import bp from . import bp
from . import tasks from . import tasks
from .forms import (AddCorpusFileForm, AddCorpusForm, EditCorpusFileForm, from .forms import (AddCorpusFileForm, AddCorpusForm, EditCorpusFileForm,
@ -29,18 +30,20 @@ def add_corpus():
db.session.flush() db.session.flush()
db.session.refresh(corpus) db.session.refresh(corpus)
try: try:
os.makedirs(corpus.path) corpus.makedirs()
except OSError as e: except OSError as e:
current_app.logger.error(f'Could not add corpus: {e}') current_app.logger.error(e)
db.session.rollback() db.session.rollback()
flash('Internal Server Error', 'error') flash('Internal Server Error', 'error')
abort(500) abort(500)
else:
db.session.commit() db.session.commit()
flash(f'Corpus "{corpus.title}" added!', 'corpus') flash(f'Corpus "{corpus.title}" added', 'corpus')
return redirect(url_for('.corpus', corpus_id=corpus.id)) return redirect(url_for('.corpus', corpus_id=corpus.id))
return render_template('corpora/add_corpus.html.j2', form=form, return render_template(
title='Add corpus') 'corpora/add_corpus.html.j2',
form=form,
title='Add corpus'
)
@bp.route('/import', methods=['GET', 'POST']) @bp.route('/import', methods=['GET', 'POST'])
@ -174,7 +177,7 @@ def add_corpus_file(corpus_id):
if not form.validate(): if not form.validate():
return make_response(form.errors, 400) return make_response(form.errors, 400)
# Save the file # Save the file
form.file.data.save(os.path.join(corpus.path, form.file.data.filename)) filename = secure_filename(form.file.data.filename)
corpus_file = CorpusFile( corpus_file = CorpusFile(
address=form.address.data, address=form.address.data,
author=form.author.data, author=form.author.data,
@ -182,9 +185,10 @@ def add_corpus_file(corpus_id):
chapter=form.chapter.data, chapter=form.chapter.data,
corpus=corpus, corpus=corpus,
editor=form.editor.data, editor=form.editor.data,
filename=form.file.data.filename, filename=filename,
institution=form.institution.data, institution=form.institution.data,
journal=form.journal.data, journal=form.journal.data,
mimetype='application/vrt+xml',
pages=form.pages.data, pages=form.pages.data,
publisher=form.publisher.data, publisher=form.publisher.data,
publishing_year=form.publishing_year.data, publishing_year=form.publishing_year.data,
@ -192,12 +196,25 @@ def add_corpus_file(corpus_id):
title=form.title.data title=form.title.data
) )
db.session.add(corpus_file) db.session.add(corpus_file)
db.session.flush(objects=[corpus_file])
db.session.refresh(corpus_file)
try:
form.file.data.save(corpus_file.path)
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
flash('Internal Server Error', 'error')
return make_response({'redirect_url': url_for('.add_corpus_file', corpus_id=corpus.id)}, 500) # noqa
corpus.status = 'unprepared' corpus.status = 'unprepared'
db.session.commit() db.session.commit()
flash(f'Corpus file "{corpus_file.filename}" added!', 'corpus') flash(f'Corpus file "{corpus_file.title}" added!', 'corpus')
return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201) # noqa return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201) # noqa
return render_template('corpora/add_corpus_file.html.j2', corpus=corpus, return render_template(
form=form, title='Add corpus file') 'corpora/add_corpus_file.html.j2',
corpus=corpus,
form=form,
title='Add corpus file'
)
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/delete') @bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/delete')

View File

@ -17,11 +17,7 @@ class Daemon(CheckCorporaMixin, CheckJobsMixin):
def run(self): def run(self):
while True: while True:
try:
self.check_corpora() self.check_corpora()
self.check_jobs() self.check_jobs()
db.session.commit() db.session.commit()
except Exception as e:
current_app.logger.warning(e)
pass
sleep(1.5) sleep(1.5)

View File

@ -26,37 +26,55 @@ class CheckCorporaMixin:
def create_build_corpus_service(self, corpus): def create_build_corpus_service(self, corpus):
''' # Docker service settings # ''' ''' # Docker service settings # '''
''' ## Command ## ''' ''' ## Command ## '''
command = 'docker-entrypoint.sh build-corpus' command = ['bash', '-c']
command.append(
f'mkdir /corpora/data/nopaque_{corpus.id}'
' && '
'cwb-encode'
' -c utf8'
f' -d /corpora/data/nopaque_{corpus.id}'
' -f /root/files/corpus.vrt'
f' -R /usr/local/share/cwb/registry/nopaque_{corpus.id}'
' -P pos -P lemma -P simple_pos'
' -S ent:0+type -S s:0'
' -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title' # noqa
' -xsB -9'
' && '
f'cwb-make -V NOPAQUE_{corpus.id}'
)
''' ## Constraints ## ''' ''' ## Constraints ## '''
constraints = ['node.role==worker'] constraints = ['node.role==worker']
''' ## Image ## ''' ''' ## Image ## '''
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674' # noqa image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
''' ## Labels ## ''' ''' ## Labels ## '''
labels = { labels = {
'origin': current_app.config['SERVER_NAME'], 'origin': current_app.config['SERVER_NAME'],
'type': 'build-corpus', 'type': 'corpus.build',
'corpus_id': str(corpus.id) 'corpus_id': str(corpus.id)
} }
''' ## Mounts ## ''' ''' ## Mounts ## '''
''' ### Corpus file mount ### ''' mounts = []
corpus_file_source = os.path.join(corpus.path, 'merged', 'corpus.vrt') ''' ### Data mount ### '''
corpus_file_target = '/root/files/corpus.vrt' data_mount_source = os.path.join(corpus.path, 'cwb', 'data')
corpus_file_mount = f'{corpus_file_source}:{corpus_file_target}:ro' data_mount_target = '/corpora/data'
''' ### Corpus data mount ### ''' data_mount = f'{data_mount_source}:{data_mount_target}:rw'
corpus_data_source = os.path.join(corpus.path, 'data') # Make sure that their is no data in the data directory
corpus_data_target = '/corpora/data' shutil.rmtree(data_mount_source, ignore_errors=True)
corpus_data_mount = f'{corpus_data_source}:{corpus_data_target}:rw' os.makedirs(data_mount_source)
# Make sure that their is no data in the corpus data directory mounts.append(data_mount)
shutil.rmtree(corpus_data_source, ignore_errors=True) ''' ### File mount ### '''
os.mkdir(corpus_data_source) file_mount_source = os.path.join(corpus.path, 'cwb', 'corpus.vrt')
''' ### Corpus registry mount ### ''' file_mount_target = '/root/files/corpus.vrt'
corpus_registry_source = os.path.join(corpus.path, 'registry') file_mount = f'{file_mount_source}:{file_mount_target}:ro'
corpus_registry_target = '/usr/local/share/cwb/registry' mounts.append(file_mount)
corpus_registry_mount = f'{corpus_registry_source}:{corpus_registry_target}:rw' # noqa ''' ### Registry mount ### '''
# Make sure that their is no data in the corpus registry directory registry_mount_source = os.path.join(corpus.path, 'cwb', 'registry')
shutil.rmtree(corpus_registry_source, ignore_errors=True) registry_mount_target = '/usr/local/share/cwb/registry'
os.mkdir(corpus_registry_source) registry_mount = f'{registry_mount_source}:{registry_mount_target}:rw'
mounts = [corpus_file_mount, corpus_data_mount, corpus_registry_mount] # Make sure that their is no data in the registry directory
shutil.rmtree(registry_mount_source, ignore_errors=True)
os.makedirs(registry_mount_source)
mounts.append(registry_mount)
''' ## Name ## ''' ''' ## Name ## '''
name = f'build-corpus_{corpus.id}' name = f'build-corpus_{corpus.id}'
''' ## Restart policy ## ''' ''' ## Restart policy ## '''
@ -74,7 +92,7 @@ class CheckCorporaMixin:
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Create service "{name}" failed ' f'Create service "{name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )
return return
corpus.status = 'queued' corpus.status = 'queued'
@ -86,14 +104,14 @@ class CheckCorporaMixin:
except docker.errors.NotFound as e: except docker.errors.NotFound as e:
current_app.logger.error( current_app.logger.error(
f'Get service "{service_name}" failed ' f'Get service "{service_name}" failed '
+ f'due to "docker.errors.NotFound": {e}' f'due to "docker.errors.NotFound": {e}'
) )
corpus.status = 'failed' corpus.status = 'failed'
return return
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Get service "{service_name}" failed ' f'Get service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )
service_tasks = service.tasks() service_tasks = service.tasks()
if not service_tasks: if not service_tasks:
@ -108,36 +126,47 @@ class CheckCorporaMixin:
corpus.status = 'failed' corpus.status = 'failed'
else: else:
return return
try: # try:
service.remove() # service.remove()
except docker.errors.APIError as e: # except docker.errors.APIError as e:
current_app.logger.error( # current_app.logger.error(
f'Remove service "{service_name}" failed ' # f'Remove service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}' # f'due to "docker.errors.APIError": {e}'
) # )
def create_cqpserver_container(self, corpus): def create_cqpserver_container(self, corpus):
''' # Docker container settings # ''' ''' # Docker container settings # '''
''' ## Command ## ''' ''' ## Command ## '''
command = 'cqpserver' command = []
command.append(
'echo "host *;" > cqpserver.init'
' && '
'echo "user anonymous \\"\\";" >> cqpserver.init'
' && '
'cqpserver -I cqpserver.init'
)
''' ## Detach ## ''' ''' ## Detach ## '''
detach = True detach = True
''' ## Entrypoint ## '''
entrypoint = ['bash', '-c']
''' ## Image ## ''' ''' ## Image ## '''
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674' # noqa image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
''' ## Name ## ''' ''' ## Name ## '''
name = f'cqpserver_{corpus.id}' name = f'cqpserver_{corpus.id}'
''' ## Network ## ''' ''' ## Network ## '''
network = 'nopaque_default' network = 'nopaque_default'
''' ## Volumes ## ''' ''' ## Volumes ## '''
volumes = []
''' ### Corpus data volume ### ''' ''' ### Corpus data volume ### '''
corpus_data_source = os.path.join(corpus.path, 'data') data_volume_source = os.path.join(corpus.path, 'cwb', 'data')
corpus_data_target = '/corpora/data' data_volume_target = '/corpora/data'
corpus_data_volume = f'{corpus_data_source}:{corpus_data_target}:rw' data_volume = f'{data_volume_source}:{data_volume_target}:rw'
volumes.append(data_volume)
''' ### Corpus registry volume ### ''' ''' ### Corpus registry volume ### '''
corpus_registry_source = os.path.join(corpus.path, 'registry') registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry')
corpus_registry_target = '/usr/local/share/cwb/registry' registry_volume_target = '/usr/local/share/cwb/registry'
corpus_registry_volume = f'{corpus_registry_source}:{corpus_registry_target}:rw' # noqa registry_volume = f'{registry_volume_source}:{registry_volume_target}:rw' # noqa
volumes = [corpus_data_volume, corpus_registry_volume] volumes.append(registry_volume)
# Check if a cqpserver container already exists. If this is the case, # Check if a cqpserver container already exists. If this is the case,
# remove it and create a new one # remove it and create a new one
try: try:
@ -147,7 +176,7 @@ class CheckCorporaMixin:
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Get container "{name}" failed ' f'Get container "{name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )
return return
else: else:
@ -156,7 +185,7 @@ class CheckCorporaMixin:
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Remove container "{name}" failed ' f'Remove container "{name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )
return return
try: try:
@ -164,6 +193,7 @@ class CheckCorporaMixin:
image, image,
command=command, command=command,
detach=detach, detach=detach,
entrypoint=entrypoint,
volumes=volumes, volumes=volumes,
name=name, name=name,
network=network network=network
@ -171,14 +201,14 @@ class CheckCorporaMixin:
except docker.errors.ImageNotFound as e: except docker.errors.ImageNotFound as e:
current_app.logger.error( current_app.logger.error(
f'Run container "{name}" failed ' f'Run container "{name}" failed '
+ f'due to "docker.errors.ImageNotFound" error: {e}' f'due to "docker.errors.ImageNotFound" error: {e}'
) )
corpus.status = 'failed' corpus.status = 'failed'
return return
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Run container "{name}" failed ' f'Run container "{name}" failed '
+ f'due to "docker.errors.APIError" error: {e}' f'due to "docker.errors.APIError" error: {e}'
) )
return return
corpus.status = 'analysing' corpus.status = 'analysing'
@ -190,14 +220,14 @@ class CheckCorporaMixin:
except docker.errors.NotFound as e: except docker.errors.NotFound as e:
current_app.logger.error( current_app.logger.error(
f'Get container "{container_name}" failed ' f'Get container "{container_name}" failed '
+ f'due to "docker.errors.NotFound": {e}' f'due to "docker.errors.NotFound": {e}'
) )
corpus.num_analysis_sessions = 0 corpus.num_analysis_sessions = 0
corpus.status = 'prepared' corpus.status = 'prepared'
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Get container "{container_name}" failed ' f'Get container "{container_name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )
def remove_cqpserver_container(self, corpus): def remove_cqpserver_container(self, corpus):
@ -210,7 +240,7 @@ class CheckCorporaMixin:
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Get container "{container_name}" failed ' f'Get container "{container_name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )
return return
try: try:
@ -218,5 +248,5 @@ class CheckCorporaMixin:
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Remove container "{container_name}" failed ' f'Remove container "{container_name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )

View File

@ -2,7 +2,7 @@ from datetime import datetime
from flask import current_app from flask import current_app
from werkzeug.utils import secure_filename from werkzeug.utils import secure_filename
from .. import db from .. import db
from ..models import Job, JobResult from ..models import Job, JobResult, TesseractOCRModel
import docker import docker
import json import json
import os import os
@ -23,27 +23,34 @@ class CheckJobsMixin:
''' # Docker service settings # ''' ''' # Docker service settings # '''
''' ## Service specific settings ## ''' ''' ## Service specific settings ## '''
if job.service == 'file-setup': if job.service == 'file-setup':
mem_mb = 2048 mem_mb = 512
n_cores = 2 n_cores = 2
executable = 'file-setup' executable = 'file-setup'
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:{job.service_version}' # noqa image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:v{job.service_version}' # noqa
elif job.service == 'ocr': elif job.service == 'tesseract-ocr':
mem_mb = 4096 mem_mb = 2048
n_cores = 4 n_cores = 4
executable = 'ocr' executable = 'ocr'
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:{job.service_version}' # noqa image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:v{job.service_version}' # noqa
elif job.service == 'nlp': elif job.service == 'spacy-nlp':
mem_mb = 2048 mem_mb = 1024
n_cores = 2 n_cores = 1
executable = 'nlp' executable = 'nlp'
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:{job.service_version}' # noqa image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:v{job.service_version}' # noqa
''' ## Command ## ''' ''' ## Command ## '''
command = f'{executable} -i /input -o /output' command = f'{executable} -i /input -o /output'
command += ' --log-dir /input' command += ' --log-dir /logs'
command += f' --mem-mb {mem_mb}' command += f' --mem-mb {mem_mb}'
command += f' --n-cores {n_cores}' command += f' --n-cores {n_cores}'
command += f' --zip [{job.service}]_{secure_filename(job.title)}' service_args = json.loads(job.service_args)
command += ' ' + ' '.join(json.loads(job.service_args)) if job.service == 'spacy-nlp':
command += f' -m {service_args["model"]}'
if 'encoding_detection' in service_args and service_args['encoding_detection']: # noqa
command += ' --check-encoding'
elif job.service == 'tesseract-ocr':
command += f' -m {service_args["model"]}'
if 'binarization' in service_args and service_args['binarization']:
command += ' --binarize'
''' ## Constraints ## ''' ''' ## Constraints ## '''
constraints = ['node.role==worker'] constraints = ['node.role==worker']
''' ## Labels ## ''' ''' ## Labels ## '''
@ -53,20 +60,42 @@ class CheckJobsMixin:
'job_id': str(job.id) 'job_id': str(job.id)
} }
''' ## Mounts ## ''' ''' ## Mounts ## '''
''' ### Input mount ### ''' mounts = []
input_mount_source = job.path ''' ### Input mount(s) ### '''
input_mount_target = '/input' input_mount_target_base = '/input'
if job.service == 'file-setup': if job.service == 'file-setup':
input_mount_target += f'/{secure_filename(job.title)}' input_mount_target_base += f'/{secure_filename(job.title)}'
input_mount = f'{input_mount_source}:{input_mount_target}:rw' for job_input in job.inputs:
input_mount_source = job_input.path
input_mount_target = f'/{input_mount_target_base}/{job_input.filename}' # noqa
input_mount = f'{input_mount_source}:{input_mount_target}:ro'
mounts.append(input_mount)
if job.service == 'tesseract-ocr':
service_args = json.loads(job.service_args)
model = TesseractOCRModel.query.get(service_args['model'])
if model is None:
job.status = 'failed'
return
models_mount_source = model.path
models_mount_target = f'/usr/local/share/tessdata/{model.filename}'
models_mount = f'{models_mount_source}:{models_mount_target}:ro'
mounts.append(models_mount)
''' ### Output mount ### ''' ''' ### Output mount ### '''
output_mount_source = os.path.join(job.path, 'output') output_mount_source = os.path.join(job.path, 'results')
output_mount_target = '/output' output_mount_target = '/output'
output_mount = f'{output_mount_source}:{output_mount_target}:rw' output_mount = f'{output_mount_source}:{output_mount_target}:rw'
# Make sure that their is no data in the output directory # Make sure that their is no data in the output directory
shutil.rmtree(output_mount_source, ignore_errors=True) shutil.rmtree(output_mount_source, ignore_errors=True)
os.makedirs(output_mount_source) os.makedirs(output_mount_source)
mounts = [input_mount, output_mount] mounts.append(output_mount)
''' ### Pipeline data mount ### '''
pyflow_data_mount_source = os.path.join(job.path, 'pipeline_data')
pyflow_data_mount_target = '/logs/pyflow.data'
pyflow_data_mount = f'{pyflow_data_mount_source}:{pyflow_data_mount_target}:rw' # noqa
# Make sure that there is no data in the pipeline data directory
shutil.rmtree(pyflow_data_mount_source, ignore_errors=True)
os.makedirs(pyflow_data_mount_source)
mounts.append(pyflow_data_mount)
''' ## Name ## ''' ''' ## Name ## '''
name = f'job_{job.id}' name = f'job_{job.id}'
''' ## Resources ## ''' ''' ## Resources ## '''
@ -90,7 +119,7 @@ class CheckJobsMixin:
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Create service "{name}" failed ' f'Create service "{name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )
return return
job.status = 'queued' job.status = 'queued'
@ -102,14 +131,14 @@ class CheckJobsMixin:
except docker.errors.NotFound as e: except docker.errors.NotFound as e:
current_app.logger.error( current_app.logger.error(
f'Get service "{service_name}" failed ' f'Get service "{service_name}" failed '
+ f'due to "docker.errors.NotFound": {e}' f'due to "docker.errors.NotFound": {e}'
) )
job.status = 'failed' job.status = 'failed'
return return
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Get service "{service_name}" failed ' f'Get service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )
return return
service_tasks = service.tasks() service_tasks = service.tasks()
@ -121,13 +150,25 @@ class CheckJobsMixin:
return return
elif job.status == 'running' and task_state == 'complete': elif job.status == 'running' and task_state == 'complete':
job.status = 'complete' job.status = 'complete'
results_dir = os.path.join(job.path, 'output') results_dir = os.path.join(job.path, 'results')
result_files = [x for x in os.listdir(results_dir) if x.endswith('.zip')] # noqa with open(os.path.join(results_dir, 'outputs.json')) as f:
for result_file in result_files: outputs = json.load(f)
job_result = JobResult(filename=result_file, job=job) for output in outputs:
filename = os.path.basename(output['file'])
job_result = JobResult(
filename=filename,
job=job,
mimetype=output['mimetype']
)
if 'description' in output:
job_result.description = output['description']
db.session.add(job_result) db.session.add(job_result)
db.session.flush() db.session.flush(objects=[job_result])
db.session.refresh(job_result) db.session.refresh(job_result)
os.rename(
os.path.join(results_dir, output['file']),
job_result.path
)
elif job.status == 'running' and task_state == 'failed': elif job.status == 'running' and task_state == 'failed':
job.status = 'failed' job.status = 'failed'
else: else:
@ -138,7 +179,7 @@ class CheckJobsMixin:
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Remove service "{service_name}" failed ' f'Remove service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )
def remove_job_service(self, job): def remove_job_service(self, job):
@ -151,7 +192,7 @@ class CheckJobsMixin:
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Get service "{service_name}" failed ' f'Get service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )
return return
try: try:
@ -159,7 +200,7 @@ class CheckJobsMixin:
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Update service "{service_name}" failed ' f'Update service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )
return return
try: try:
@ -167,5 +208,5 @@ class CheckJobsMixin:
except docker.errors.APIError as e: except docker.errors.APIError as e:
current_app.logger.error( current_app.logger.error(
f'Remove "{service_name}" service failed ' f'Remove "{service_name}" service failed '
+ f'due to "docker.errors.APIError": {e}' f'due to "docker.errors.APIError": {e}'
) )

View File

@ -34,12 +34,14 @@ def delete_job(job_id):
@login_required @login_required
def download_job_input(job_id, job_input_id): def download_job_input(job_id, job_input_id):
job_input = JobInput.query.filter(JobInput.job_id == job_id, JobInput.id == job_input_id).first_or_404() # noqa job_input = JobInput.query.filter(JobInput.job_id == job_id, JobInput.id == job_input_id).first_or_404() # noqa
if not (job_input.job.user == current_user if not (job_input.job.user == current_user or current_user.is_administrator()): # noqa
or current_user.is_administrator()):
abort(403) abort(403)
return send_from_directory(as_attachment=True, return send_from_directory(
as_attachment=True,
attachment_filename=job_input.filename,
directory=os.path.dirname(job_input.path), directory=os.path.dirname(job_input.path),
filename=job_input.filename) filename=os.path.basename(job_input.path)
)
@bp.route('/<hashid:job_id>/restart') @bp.route('/<hashid:job_id>/restart')
@ -59,9 +61,11 @@ def restart(job_id):
@login_required @login_required
def download_job_result(job_id, job_result_id): def download_job_result(job_id, job_result_id):
job_result = JobResult.query.filter(JobResult.job_id == job_id, JobResult.id == job_result_id).first_or_404() # noqa job_result = JobResult.query.filter(JobResult.job_id == job_id, JobResult.id == job_result_id).first_or_404() # noqa
if not (job_result.job.user == current_user if not (job_result.job.user == current_user or current_user.is_administrator()): # noqa
or current_user.is_administrator()):
abort(403) abort(403)
return send_from_directory(as_attachment=True, return send_from_directory(
as_attachment=True,
attachment_filename=job_result.filename,
directory=os.path.dirname(job_result.path), directory=os.path.dirname(job_result.path),
filename=job_result.filename) filename=os.path.basename(job_result.path)
)

View File

@ -4,13 +4,17 @@ from flask_hashids import HashidMixin
from flask_login import UserMixin from flask_login import UserMixin
from itsdangerous import BadSignature, TimedJSONWebSignatureSerializer from itsdangerous import BadSignature, TimedJSONWebSignatureSerializer
from time import sleep from time import sleep
from tqdm import tqdm
from werkzeug.security import generate_password_hash, check_password_hash from werkzeug.security import generate_password_hash, check_password_hash
import xml.etree.ElementTree as ET
from . import db, login from . import db, login
import base64 import base64
import enum import enum
import json
import os import os
import requests
import shutil import shutil
import xml.etree.ElementTree as ET
import yaml
class Permission(enum.IntEnum): class Permission(enum.IntEnum):
@ -25,7 +29,7 @@ class Permission(enum.IntEnum):
class FileMixin: class FileMixin:
creation_date = db.Column(db.DateTime, default=datetime.utcnow) creation_date = db.Column(db.DateTime, default=datetime.utcnow)
filename = db.Column(db.String(256)) filename = db.Column(db.String(255))
last_edited_date = db.Column(db.DateTime, default=datetime.utcnow) last_edited_date = db.Column(db.DateTime, default=datetime.utcnow)
mimetype = db.Column(db.String(255)) mimetype = db.Column(db.String(255))
@ -86,7 +90,7 @@ class Role(HashidMixin, db.Model):
return dict_role return dict_role
@staticmethod @staticmethod
def insert_roles(): def insert_defaults():
roles = { roles = {
'User': [], 'User': [],
'API user': [Permission.USE_API], 'API user': [Permission.USE_API],
@ -132,6 +136,12 @@ class User(HashidMixin, UserMixin, db.Model):
db.String(16), default='all') db.String(16), default='all')
# Backrefs: role: Role # Backrefs: role: Role
# Relationships # Relationships
tesseract_ocr_models = db.relationship(
'TesseractOCRModel',
backref='user',
cascade='all, delete-orphan',
lazy='dynamic'
)
corpora = db.relationship( corpora = db.relationship(
'Corpus', 'Corpus',
backref='user', backref='user',
@ -221,6 +231,12 @@ class User(HashidMixin, UserMixin, db.Model):
def is_administrator(self): def is_administrator(self):
return self.can(Permission.ADMINISTRATE) return self.can(Permission.ADMINISTRATE)
def makedirs(self):
    '''
    Create the directory at self.path and its fixed subdirectories.

    The subdirectories are 'tesseract_ocr_models', 'corpora' and 'jobs'.
    os.mkdir is used, so an already existing directory or a missing parent
    raises OSError.
    '''
    os.mkdir(self.path)
    for subdirectory in ('tesseract_ocr_models', 'corpora', 'jobs'):
        os.mkdir(os.path.join(self.path, subdirectory))
def revoke_token(self): def revoke_token(self):
self.token_expiration = datetime.utcnow() - timedelta(seconds=1) self.token_expiration = datetime.utcnow() - timedelta(seconds=1)
@ -269,6 +285,21 @@ class User(HashidMixin, UserMixin, db.Model):
return None return None
return user return user
@staticmethod
def insert_defaults():
    '''
    Create the user named 'nopaque' together with its directories.

    Does nothing if a user with that username already exists. If creating
    the directories fails with OSError, the error is logged and the session
    is rolled back.
    '''
    existing_user = User.query.filter_by(username='nopaque').first()
    if existing_user is not None:
        return
    nopaque_user = User(username='nopaque')
    db.session.add(nopaque_user)
    # Flush and refresh before touching the file system, as makedirs works
    # on the user's path
    db.session.flush(objects=[nopaque_user])
    db.session.refresh(nopaque_user)
    try:
        nopaque_user.makedirs()
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
    # NOTE(review): also reached after the rollback above
    db.session.commit()
@staticmethod @staticmethod
def reset_password(token, new_password): def reset_password(token, new_password):
s = TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY']) s = TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY'])
@ -284,6 +315,72 @@ class User(HashidMixin, UserMixin, db.Model):
return True return True
class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
    '''
    Database record describing a Tesseract OCR model file owned by a user.

    The file itself is stored at the location given by the path property.
    '''
    __tablename__ = 'tesseract_ocr_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    # insert_defaults stores a JSON encoded list of version strings here
    compatible_service_versions = db.Column(db.String(255))
    description = db.Column(db.String(255))
    publisher = db.Column(db.String(128))
    publishing_year = db.Column(db.Integer)
    title = db.Column(db.String(64))
    version = db.Column(db.String(16))
    # Backrefs: user: User

    @property
    def path(self):
        ''' File location: <user.path>/tesseract_ocr_models/<id> '''
        return os.path.join(
            self.user.path,
            'tesseract_ocr_models',
            str(self.id)
        )

    @staticmethod
    def insert_defaults():
        '''
        Insert and download the models listed in
        TesseractOCRModel.defaults.yml (located next to this module).

        Entries for which a model with the same title and version already
        exists are skipped. Every new model is assigned to the user named
        'nopaque' and its file is downloaded from the entry's url to the
        model's path. All new records are committed together at the end.

        Raises requests.HTTPError if a download url answers with an error
        status, so that an error page is never stored as a model file.
        '''
        user = User.query.filter_by(username='nopaque').first()
        defaults_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'TesseractOCRModel.defaults.yml'
        )
        with open(defaults_file, 'r') as f:
            # A file without any active entry is parsed as None
            defaults = yaml.safe_load(f) or []
        for m in defaults:
            if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None:  # noqa
                continue
            tesseract_ocr_model = TesseractOCRModel(
                compatible_service_versions=json.dumps(m['compatible_service_versions']),  # noqa
                description=m['description'],
                publisher=m['publisher'],
                publishing_year=m['publishing_year'],
                title=m['title'],
                user=user,
                version=m['version']
            )
            db.session.add(tesseract_ocr_model)
            # The id is needed for the filename and the path
            db.session.flush(objects=[tesseract_ocr_model])
            db.session.refresh(tesseract_ocr_model)
            tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata'  # noqa
            r = requests.get(m['url'], stream=True)
            pbar = None
            try:
                # Stop before the target file is opened for writing
                r.raise_for_status()
                # The header is optional, tqdm accepts total=None
                content_length = r.headers.get('Content-Length')
                pbar = tqdm(
                    desc=f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})',  # noqa
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    total=None if content_length is None else int(content_length)  # noqa
                )
                pbar.clear()
                with open(tesseract_ocr_model.path, 'wb') as model_file:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            pbar.update(len(chunk))
                            model_file.write(chunk)
            finally:
                # Release progress bar and connection on failure, too
                if pbar is not None:
                    pbar.close()
                r.close()
        db.session.commit()
class JobInput(FileMixin, HashidMixin, db.Model): class JobInput(FileMixin, HashidMixin, db.Model):
__tablename__ = 'job_inputs' __tablename__ = 'job_inputs'
# Primary key # Primary key
@ -309,7 +406,7 @@ class JobInput(FileMixin, HashidMixin, db.Model):
@property @property
def path(self): def path(self):
return os.path.join(self.job.path, self.filename) return os.path.join(self.job.path, 'inputs', str(self.id))
def to_dict(self, backrefs=False, relationships=False): def to_dict(self, backrefs=False, relationships=False):
dict_job_input = { dict_job_input = {
@ -347,6 +444,8 @@ class JobResult(FileMixin, HashidMixin, db.Model):
id = db.Column(db.Integer, primary_key=True) id = db.Column(db.Integer, primary_key=True)
# Foreign keys # Foreign keys
job_id = db.Column(db.Integer, db.ForeignKey('jobs.id')) job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
# Fields
description = db.Column(db.String(255))
# Backrefs: job: Job # Backrefs: job: Job
def __repr__(self): def __repr__(self):
@ -366,12 +465,13 @@ class JobResult(FileMixin, HashidMixin, db.Model):
@property @property
def path(self): def path(self):
return os.path.join(self.job.path, 'output', self.filename) return os.path.join(self.job.path, 'results', str(self.id))
def to_dict(self, backrefs=False, relationships=False): def to_dict(self, backrefs=False, relationships=False):
dict_job_result = { dict_job_result = {
'id': self.hashid, 'id': self.hashid,
'job_id': self.job.hashid, 'job_id': self.job.hashid,
'description': self.description,
'download_url': self.download_url, 'download_url': self.download_url,
'url': self.url, 'url': self.url,
**self.file_mixin_to_dict( **self.file_mixin_to_dict(
@ -414,8 +514,8 @@ class Job(HashidMixin, db.Model):
end_date = db.Column(db.DateTime()) end_date = db.Column(db.DateTime())
service = db.Column(db.String(64)) service = db.Column(db.String(64))
''' '''
' Service specific arguments as string list. ' Dictionary as JSON formatted string.
' Example: ["-l eng", "--binarize"] ' Example: {"binarization": true}
''' '''
service_args = db.Column(db.String(255)) service_args = db.Column(db.String(255))
service_version = db.Column(db.String(16)) service_version = db.Column(db.String(16))
@ -472,6 +572,12 @@ class Job(HashidMixin, db.Model):
shutil.rmtree(self.path, ignore_errors=True) shutil.rmtree(self.path, ignore_errors=True)
db.session.delete(self) db.session.delete(self)
def makedirs(self):
    """Create this job's base directory and its fixed subdirectories.

    Raises:
        OSError: Propagated from ``os.mkdir`` when a directory cannot be
            created.
    """
    base_dir = self.path
    os.mkdir(base_dir)
    for subdirectory in ('inputs', 'pipeline_data', 'results'):
        os.mkdir(os.path.join(base_dir, subdirectory))
def restart(self): def restart(self):
''' '''
Restart a job - only if the status is complete or failed Restart a job - only if the status is complete or failed
@ -479,7 +585,7 @@ class Job(HashidMixin, db.Model):
if self.status not in ['complete', 'failed']: if self.status not in ['complete', 'failed']:
raise Exception('Could not restart job: status is not "complete/failed"') # noqa raise Exception('Could not restart job: status is not "complete/failed"') # noqa
shutil.rmtree(os.path.join(self.path, 'output'), ignore_errors=True) shutil.rmtree(os.path.join(self.path, 'results'), ignore_errors=True)
shutil.rmtree(os.path.join(self.path, 'pyflow.data'), ignore_errors=True) # noqa shutil.rmtree(os.path.join(self.path, 'pyflow.data'), ignore_errors=True) # noqa
for result in self.results: for result in self.results:
db.session.delete(result) db.session.delete(result)
@ -487,6 +593,10 @@ class Job(HashidMixin, db.Model):
self.status = 'submitted' self.status = 'submitted'
def to_dict(self, backrefs=False, relationships=False): def to_dict(self, backrefs=False, relationships=False):
service_args = json.loads(self.service_args)
if self.service == 'tesseract-ocr' and 'model' in service_args:
tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model']) # noqa
service_args['model'] = tesseract_ocr_pipeline_model.title
dict_job = { dict_job = {
'id': self.hashid, 'id': self.hashid,
'user_id': self.user.hashid, 'user_id': self.user.hashid,
@ -494,7 +604,7 @@ class Job(HashidMixin, db.Model):
'description': self.description, 'description': self.description,
'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z', # noqa 'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z', # noqa
'service': self.service, 'service': self.service,
'service_args': self.service_args, 'service_args': service_args,
'service_version': self.service_version, 'service_version': self.service_version,
'status': self.status, 'status': self.status,
'title': self.title, 'title': self.title,
@ -550,7 +660,7 @@ class CorpusFile(FileMixin, HashidMixin, db.Model):
@property @property
def path(self): def path(self):
return os.path.join(self.corpus.path, self.filename) return os.path.join(self.corpus.path, 'files', str(self.id))
@property @property
def url(self): def url(self):
@ -659,28 +769,27 @@ class Corpus(HashidMixin, db.Model):
return self.user.hashid return self.user.hashid
def build(self):
    """Merge all corpus files into ``cwb/corpus.vrt`` and submit the corpus.

    The root element of every file in ``self.files`` is annotated with the
    file's bibliographic metadata and added to a single ``<corpus>``
    element, in the order of ``self.files``. Missing optional metadata is
    written as the string ``'NULL'``. Afterwards ``last_edited_date`` is
    set to the current UTC time and ``status`` to ``'submitted'``.
    """
    corpus_element = ET.fromstring('<corpus>\n</corpus>')
    for corpus_file in self.files:
        text_element = ET.parse(corpus_file.path).getroot()
        metadata = (
            ('address', corpus_file.address or 'NULL'),
            ('author', corpus_file.author),
            ('booktitle', corpus_file.booktitle or 'NULL'),
            ('chapter', corpus_file.chapter or 'NULL'),
            ('editor', corpus_file.editor or 'NULL'),
            ('institution', corpus_file.institution or 'NULL'),
            ('journal', corpus_file.journal or 'NULL'),
            ('pages', corpus_file.pages or 'NULL'),
            ('publisher', corpus_file.publisher or 'NULL'),
            ('publishing_year', str(corpus_file.publishing_year)),
            ('school', corpus_file.school or 'NULL'),
            ('title', corpus_file.title),
        )
        for attribute, value in metadata:
            text_element.set(attribute, value)
        # append keeps the texts in file order; insert(1, ...) placed every
        # later file directly behind the first one and so reversed the rest.
        corpus_element.append(text_element)
    ET.ElementTree(corpus_element).write(
        os.path.join(self.path, 'cwb', 'corpus.vrt'),
        encoding='utf-8'
    )
    self.last_edited_date = datetime.utcnow()
    self.status = 'submitted'
@ -688,6 +797,13 @@ class Corpus(HashidMixin, db.Model):
shutil.rmtree(self.path, ignore_errors=True) shutil.rmtree(self.path, ignore_errors=True)
db.session.delete(self) db.session.delete(self)
def makedirs(self):
    """Create this corpus' base directory and its fixed directory tree.

    Raises:
        OSError: Propagated from ``os.mkdir`` when a directory cannot be
            created.
    """
    base_dir = self.path
    os.mkdir(base_dir)
    # Parents are listed before their children because os.mkdir does not
    # create intermediate directories.
    for parts in (('files',), ('cwb',), ('cwb', 'data'), ('cwb', 'registry')):
        os.mkdir(os.path.join(base_dir, *parts))
def to_dict(self, backrefs=False, relationships=False): def to_dict(self, backrefs=False, relationships=False):
dict_corpus = { dict_corpus = {
'id': self.hashid, 'id': self.hashid,

View File

@ -1,77 +1,13 @@
from flask import Blueprint from flask import Blueprint
import os
import yaml
SERVICES = { services_file = os.path.join(
'file-setup': { os.path.dirname(os.path.abspath(__file__)), 'services.yml')
'name': 'File setup', with open(services_file, 'r') as f:
'versions': { SERVICES = yaml.safe_load(f)
'latest': '1.0.0b',
'1.0.0b': {
'publishing_data': {
'date': None,
'title': 'nopaque File setup service',
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/tree/1.0.0b', # noqa
'version': '1.0.0'
}
}
}
},
'nlp': {
'name': 'Natural Language Processing',
'versions': {
'latest': '1.0.0b',
'1.0.0b': {
'check_encoding': True,
'models': {
'de': 'German',
'en': 'English',
'it': 'Italian',
'nl': 'Dutch',
'pl': 'Polish',
'zh': 'Chinese'
},
'publishing_data': {
'date': None,
'title': 'nopaque NLP service',
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/tree/1.0.0b', # noqa
'version': '1.0.0'
}
}
}
},
'ocr': {
'name': 'Optical Character Recognition',
'versions': {
'latest': '1.0.0b',
'1.0.0b': {
'binarization': True,
'models': {
'ara': 'Arabic',
'chi_tra': 'Chinese - Traditional',
'dan': 'Danish',
'eng': 'English',
'enm': 'English, Middle 1100-1500',
'fra': 'French',
'frm': 'French, Middle ca. 1400-1600',
'deu': 'German',
'frk': 'German Fraktur',
'ell': 'Greek, Modern (1453-)',
'ita': 'Italian',
'por': 'Portuguese',
'rus': 'Russian',
'spa': 'Spanish; Castilian',
},
'publishing_data': {
'date': None,
'title': 'nopaque OCR service',
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/tree/1.0.0b', # noqa
'version': '1.0.0'
}
}
}
}
}
bp = Blueprint('services', __name__) bp = Blueprint('services', __name__)
from . import routes from . import routes # noqa

View File

@ -1,3 +1,4 @@
from app.models import TesseractOCRModel
from flask_wtf import FlaskForm from flask_wtf import FlaskForm
from wtforms import (BooleanField, MultipleFileField, SelectField, StringField, from wtforms import (BooleanField, MultipleFileField, SelectField, StringField,
SubmitField, ValidationError) SubmitField, ValidationError)
@ -6,85 +7,105 @@ from . import SERVICES
class AddJobForm(FlaskForm): class AddJobForm(FlaskForm):
description = StringField('Description', description = StringField('Description', validators=[DataRequired(), Length(1, 255)]) # noqa
validators=[DataRequired(), Length(1, 255)])
submit = SubmitField() submit = SubmitField()
title = StringField('Title', validators=[DataRequired(), Length(1, 32)]) title = StringField('Title', validators=[DataRequired(), Length(1, 32)])
version = SelectField('Version', validators=[DataRequired()]) version = SelectField('Version', validators=[DataRequired()])
class AddSpacyNLPJobForm(AddJobForm):
    """Form for adding a job of the ``spacy-nlp`` service."""
    encoding_detection = BooleanField('Encoding detection')
    files = MultipleFileField('Files', validators=[DataRequired()])
    model = SelectField(
        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )

    def validate_encoding_detection(self, field):
        """Reject encoding detection if the chosen version lacks the method."""
        # Fall back to an empty entry so an invalid version is reported by
        # the version field instead of raising a KeyError here.
        service_info = SERVICES['spacy-nlp']['versions'].get(self.version.data, {})  # noqa
        # Optional features are listed under the version's "methods" key.
        if field.data and 'encoding_detection' not in service_info.get('methods', []):  # noqa
            raise ValidationError('Encoding detection is not available')

    def validate_files(form, field):
        """Accept only files whose name ends with an approved extension."""
        valid_extensions = ['.txt']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # The explicit "+" matters: adjacent literals are merged
                # first, which would turn the sentence into the separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Set up the choices for the service version given as ``version``.

        The optional keyword argument ``version`` defaults to the latest
        version of the service.
        """
        version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version'])  # noqa
        super().__init__(*args, **kwargs)
        service_info = SERVICES['spacy-nlp']['versions'][version]
        # Must use the same method name as validate_encoding_detection.
        if 'encoding_detection' not in service_info.get('methods', []):
            self.encoding_detection.render_kw = {'disabled': True}
        self.model.choices += list(service_info['models'].items())
        self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']]  # noqa
        self.version.default = version
class AddTesseractOCRJobForm(AddJobForm):
    """Form for adding a job of the ``tesseract-ocr`` service."""
    binarization = BooleanField('Binarization')
    files = MultipleFileField('Files', validators=[DataRequired()])
    model = SelectField(
        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )

    def validate_binarization(self, field):
        """Reject binarization if the chosen version lacks the method."""
        # Fall back to an empty entry so an invalid version is reported by
        # the version field instead of raising a KeyError here.
        service_info = SERVICES['tesseract-ocr']['versions'].get(self.version.data, {})  # noqa
        # Optional features are listed under the version's "methods" key.
        if field.data and 'binarization' not in service_info.get('methods', []):  # noqa
            raise ValidationError('Binarization is not available')

    def validate_files(self, field):
        """Accept only files whose name ends with an approved extension."""
        valid_extensions = ['.pdf']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # The explicit "+" matters: adjacent literals are merged
                # first, which would turn the sentence into the separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Set up the choices for the service version given as ``version``.

        The optional keyword argument ``version`` defaults to the latest
        version of the service. The model choices are the hashid and title
        of every TesseractOCRModel record.
        """
        version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version'])  # noqa
        super().__init__(*args, **kwargs)
        service_info = SERVICES['tesseract-ocr']['versions'][version]
        if 'binarization' not in service_info.get('methods', []):
            self.binarization.render_kw = {'disabled': True}
        self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()]  # noqa
        self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']]  # noqa
        self.version.data = version
        self.version.default = SERVICES['tesseract-ocr']['latest_version']
class AddFileSetupJobForm(AddJobForm):
    """Form for adding a job of the ``file-setup`` service."""
    files = MultipleFileField('Files', validators=[DataRequired()])

    def validate_files(form, field):
        """Accept only files whose name ends with an approved extension."""
        valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif']
        for file in field.data:
            if not file.filename.lower().endswith(tuple(valid_extensions)):
                # The explicit "+" matters: adjacent literals are merged
                # first, which would turn the sentence into the separator
                # that is repeated between the extensions.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Set up the version choices.

        The optional keyword argument ``version`` selects the version shown
        as chosen and defaults to the latest version of the service.
        """
        version = kwargs.pop('version', SERVICES['file-setup']['latest_version'])  # noqa
        super().__init__(*args, **kwargs)
        self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']]  # noqa
        self.version.data = version
        self.version.default = SERVICES['file-setup']['latest_version']
AddJobForms = { AddJobForms = {
'file-setup': AddFileSetupJobForm, 'file-setup': AddFileSetupJobForm,
'ocr': AddOCRJobForm, 'tesseract-ocr': AddTesseractOCRJobForm,
'nlp': AddNLPJobForm 'spacy-nlp': AddSpacyNLPJobForm
} }

View File

@ -1,3 +1,4 @@
from app import hashids
from flask import (abort, current_app, flash, make_response, render_template, from flask import (abort, current_app, flash, make_response, render_template,
request, url_for) request, url_for)
from flask_login import current_user, login_required from flask_login import current_user, login_required
@ -8,7 +9,6 @@ from .. import db
from .forms import AddJobForms from .forms import AddJobForms
from ..models import Job, JobInput from ..models import Job, JobInput
import json import json
import os
@bp.route('/corpus-analysis') @bp.route('/corpus-analysis')
@ -24,57 +24,65 @@ def service(service):
# Check if the requested service exist # Check if the requested service exist
if service not in SERVICES or service not in AddJobForms: if service not in SERVICES or service not in AddJobForms:
abort(404) abort(404)
version = request.args.get( version = request.args.get('version', SERVICES[service]['latest_version'])
'version', SERVICES[service]['versions']['latest'])
if version not in SERVICES[service]['versions']: if version not in SERVICES[service]['versions']:
abort(404) abort(404)
form = AddJobForms[service](prefix='add-job-form', version=version) form = AddJobForms[service](prefix='add-job-form', version=version)
form.version.data = version
title = SERVICES[service]['name'] title = SERVICES[service]['name']
versions = SERVICES[service]['versions']
if form.is_submitted(): if form.is_submitted():
if not form.validate(): if not form.validate():
return make_response(form.errors, 400) return make_response(form.errors, 400)
service_args = [] service_args = {}
if service == 'nlp': if service == 'spacy-nlp':
service_args.append(f'-l {form.language.data}') service_args['model'] = form.model.data
if form.check_encoding.data: if form.encoding_detection.data:
service_args.append('--check-encoding') service_args['encoding_detection'] = True
if service == 'ocr': if service == 'tesseract-ocr':
service_args.append(f'-l {form.language.data}') service_args['model'] = hashids.decode(form.model.data)
if form.binarization.data: if form.binarization.data:
service_args.append('--binarize') service_args['binarization'] = True
job = Job(user=current_user, job = Job(
user=current_user,
description=form.description.data, description=form.description.data,
service=service, service_args=json.dumps(service_args), service=service,
service_args=json.dumps(service_args),
service_version=form.version.data, service_version=form.version.data,
status='preparing', title=form.title.data) status='preparing',
title=form.title.data
)
db.session.add(job) db.session.add(job)
db.session.flush() db.session.flush(objects=[job])
db.session.refresh(job) db.session.refresh(job)
try: try:
os.makedirs(job.path) job.makedirs()
except OSError: except OSError as e:
current_app.logger.error(f'Make dir {job.path} led to an OSError!') current_app.logger.error(e)
db.session.rollback() db.session.rollback()
flash('Internal Server Error', 'error') flash('Internal Server Error', 'error')
return make_response( return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa
{'redirect_url': url_for('.service', service=service)}, 500)
else:
for file in form.files.data: for file in form.files.data:
filename = secure_filename(file.filename) filename = secure_filename(file.filename)
job_input = JobInput( job_input = JobInput(
filename=filename, job=job, mimetype=file.mimetype) filename=filename,
file.save(job_input.path) job=job,
mimetype=file.mimetype
)
db.session.add(job_input) db.session.add(job_input)
db.session.flush(objects=[job_input])
db.session.refresh(job_input)
try:
file.save(job_input.path)
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
flash('Internal Server Error', 'error')
return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa
job.status = 'submitted' job.status = 'submitted'
db.session.commit() db.session.commit()
flash(f'Job "{job.title}" added', 'job') flash(f'Job "{job.title}" added', 'job')
return make_response( return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa
{'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)
return render_template( return render_template(
f'services/{service.replace("-", "_")}.html.j2', f'services/{service.replace("-", "_")}.html.j2',
form=form, form=form,
title=title, title=title
versions=versions
) )

38
app/services/services.yml Normal file
View File

@ -0,0 +1,38 @@
# TODO: This could also be done via GitLab/GitHub APIs
#file-setup-pipeline:
file-setup:
name: 'File setup pipeline'
latest_version: '0.1.0'
versions:
0.1.0:
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0'
#spacy-nlp-pipeline:
spacy-nlp:
name: 'spaCy NLP'
latest_version: '0.1.0'
versions:
0.1.0:
methods:
- 'encoding_detection'
models:
de: 'German'
en: 'English'
it: 'Italian'
pl: 'Polish'
zh: 'Chinese'
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0'
#tesseract-ocr-pipeline:
tesseract-ocr:
name: 'Tesseract OCR'
latest_version: '0.1.0'
versions:
0.1.0:
methods:
- 'binarization'
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0'

View File

@ -50,8 +50,8 @@ h1 .nopaque-icons, h2 .nopaque-icons, h3 .nopaque-icons, h4 .nopaque-icons,
} }
.nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";} .nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";}
.nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";} .nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";}
.nopaque-icons.service-icon[data-service="nlp"]:empty:before {content: "G";} .nopaque-icons.service-icon[data-service="spacy-nlp"]:empty:before {content: "G";}
.nopaque-icons.service-icon[data-service="ocr"]:empty:before {content: "F";} .nopaque-icons.service-icon[data-service="tesseract-ocr"]:empty:before {content: "F";}
.status-text[data-status]:empty:before {content: attr(data-status);} .status-text[data-status]:empty:before {content: attr(data-status);}

View File

@ -53,7 +53,7 @@ class CorpusAnalysisApp {
this.data.cQiClient = new CQiClient(this.settings.corpusId); this.data.cQiClient = new CQiClient(this.settings.corpusId);
this.data.cQiClient.connect() this.data.cQiClient.connect()
.then(cQiStatus => { .then(cQiStatus => {
return this.data.cQiClient.corpora.get('CORPUS'); return this.data.cQiClient.corpora.get(`NOPAQUE_${this.settings.corpusId}`);
}) })
.then( .then(
cQiCorpus => { cQiCorpus => {

View File

@ -100,7 +100,10 @@ class JobDisplay extends RessourceDisplay {
} }
setServiceArgs(serviceArgs) { setServiceArgs(serviceArgs) {
this.setElements(this.displayElement.querySelectorAll('.job-service-args'), serviceArgs); this.setElements(
this.displayElement.querySelectorAll('.job-service-args'),
JSON.stringify(serviceArgs)
);
} }
setServiceVersion(serviceVersion) { setServiceVersion(serviceVersion) {

View File

@ -10,25 +10,10 @@ class JobResultList extends RessourceList {
</tr> </tr>
`.trim(), `.trim(),
ressourceMapper: jobResult => { ressourceMapper: jobResult => {
let description;
if (jobResult.filename.endsWith('.pdf.zip')) {
description = 'PDF files with text layer';
} else if (jobResult.filename.endsWith('.txt.zip')) {
description = 'Raw text files';
} else if (jobResult.filename.endsWith('.vrt.zip')) {
description = 'VRT compliant files including the NLP data';
} else if (jobResult.filename.endsWith('.xml.zip')) {
description = 'TEI compliant files';
} else if (jobResult.filename.endsWith('.poco.zip')) {
description = 'HOCR and image files for post correction (PoCo)';
} else {
description = 'All result files created during this job';
}
return { return {
id: jobResult.id, id: jobResult.id,
creationDate: jobResult.creation_date, creationDate: jobResult.creation_date,
description: description, description: jobResult.description,
filename: jobResult.filename filename: jobResult.filename
}; };
}, },

View File

@ -19,12 +19,12 @@
'darken': '#a1b300', 'darken': '#a1b300',
'lighten': '#f2f3e1' 'lighten': '#f2f3e1'
}, },
'nlp': { 'spacy-nlp': {
'base': '#98acd2', 'base': '#98acd2',
'darken': '#0064a3', 'darken': '#0064a3',
'lighten': '#e5e8f5' 'lighten': '#e5e8f5'
}, },
'ocr': { 'tesseract-ocr': {
'base': '#a9d8c8', 'base': '#a9d8c8',
'darken': '#00a58b', 'darken': '#00a58b',
'lighten': '#e7f4f1' 'lighten': '#e7f4f1'

View File

@ -15,8 +15,8 @@
<li><div class="divider"></div></li> <li><div class="divider"></div></li>
<li><a class="subheader">Processes & Services</a></li> <li><a class="subheader">Processes & Services</a></li>
<li class="service-color service-color-border border-darken" data-service="file-setup" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='file-setup') }}"><i class="nopaque-icons service-icon" data-service="file-setup"></i>File setup</a></li> <li class="service-color service-color-border border-darken" data-service="file-setup" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='file-setup') }}"><i class="nopaque-icons service-icon" data-service="file-setup"></i>File setup</a></li>
<li class="service-color service-color-border border-darken" data-service="ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='ocr') }}"><i class="nopaque-icons service-icon" data-service="ocr"></i>OCR</a></li> <li class="service-color service-color-border border-darken" data-service="tesseract-ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='tesseract-ocr') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr"></i>OCR</a></li>
<li class="service-color service-color-border border-darken" data-service="nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='nlp') }}"><i class="nopaque-icons service-icon" data-service="nlp"></i>NLP</a></li> <li class="service-color service-color-border border-darken" data-service="spacy-nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='spacy-nlp') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp"></i>NLP</a></li>
<li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='corpus-analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li> <li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='corpus-analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li>
<li><div class="divider"></div></li> <li><div class="divider"></div></li>
<li><a class="subheader">Account</a></li> <li><a class="subheader">Account</a></li>
@ -28,6 +28,9 @@
{% if current_user.can(Permission.ADMINISTRATE) %} {% if current_user.can(Permission.ADMINISTRATE) %}
<li><a href="{{ url_for('admin.index') }}"><i class="material-icons">admin_panel_settings</i>Administration</a></li> <li><a href="{{ url_for('admin.index') }}"><i class="material-icons">admin_panel_settings</i>Administration</a></li>
{% endif %} {% endif %}
{% if current_user.can(Permission.CONTRIBUTE) %}
<li><a href="{{ url_for('contribute.index') }}"><i class="material-icons">new_label</i>Contribute</a></li>
{% endif %}
{% if current_user.can(Permission.USE_API) %} {% if current_user.can(Permission.USE_API) %}
<li><a href="{{ url_for('api.doc') }}"><i class="material-icons">api</i>API</a></li> <li><a href="{{ url_for('api.doc') }}"><i class="material-icons">api</i>API</a></li>
{% endif %} {% endif %}

View File

@ -120,32 +120,32 @@
</a> </a>
<br><br> <br><br>
<p class="service-color-text darken" data-service="file-setup"><b>File setup</b></p> <p class="service-color-text darken" data-service="file-setup"><b>File setup</b></p>
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p> <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.</p>
<a href="{{ url_for('services.service', service='file-setup') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup">Create Job</a> <a href="{{ url_for('services.service', service='file-setup') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup">Create Job</a>
</div> </div>
</div> </div>
<div class="col s12 m4"> <div class="col s12 m4">
<div class="card-panel center-align hoverable"> <div class="card-panel center-align hoverable">
<br> <br>
<a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
<i class="nopaque-icons service-color darken service-icon" data-service="ocr" style="font-size: 2.5rem;"></i> <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr" style="font-size: 2.5rem;"></i>
</a> </a>
<br><br> <br><br>
<p class="service-color-text darken" data-service="ocr"><b>Optical Character Recognition</b></p> <p class="service-color-text darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
<p class="light">nopaque converts your image data like photos or scans into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.</p> <p class="light">nopaque converts your image data like photos or scans into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.</p>
<a href="{{ url_for('services.service', service='ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="ocr">Create Job</a> <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr">Create Job</a>
</div> </div>
</div> </div>
<div class="col s12 m4"> <div class="col s12 m4">
<div class="card-panel center-align hoverable"> <div class="card-panel center-align hoverable">
<br> <br>
<a href="{{ url_for('services.service', service='nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> <a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
<i class="nopaque-icons service-color darken service-icon" data-service="nlp" style="font-size: 2.5rem;"></i> <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp" style="font-size: 2.5rem;"></i>
</a> </a>
<br><br> <br><br>
<p class="service-color-text darken" data-service="nlp"><b>Natural Language Processing</b></p> <p class="service-color-text darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p>
<p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p> <p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p>
<a href="{{ url_for('services.service', service='nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="nlp">Create Job</a> <a href="{{ url_for('services.service', service='spacy-nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="spacy-nlp">Create Job</a>
</div> </div>
</div> </div>
</div> </div>

View File

@ -84,11 +84,11 @@
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p> <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p>
</div> </div>
<div class="col s12 m6 l3 center-align"> <div class="col s12 m6 l3 center-align">
<a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i> <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
</a> </a>
<br><br> <br><br>
<p class="service-color-text text-darken" data-service="ocr"><b>Optical Character Recognition</b></p> <p class="service-color-text text-darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
<p class="light">nopaque converts your image data like photos or scans into text data through OCR making it machine readable. This step enables you to proceed with further computational analysis of your documents.</p> <p class="light">nopaque converts your image data like photos or scans into text data through OCR making it machine readable. This step enables you to proceed with further computational analysis of your documents.</p>
</div> </div>
<div class="col s12 m6 l3 center-align"> <div class="col s12 m6 l3 center-align">

View File

@ -2,7 +2,7 @@
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
{% import "materialize/wtf.html.j2" as wtf %} {% import "materialize/wtf.html.j2" as wtf %}
{% block main_attribs %} class="service-scheme" data-service="nlp"{% endblock main_attribs %} {% block main_attribs %} class="service-scheme" data-service="spacy-nlp"{% endblock main_attribs %}
{% block page_content %} {% block page_content %}
<div class="container"> <div class="container">
@ -16,13 +16,13 @@
<p class="hide-on-small-only">&nbsp;</p> <p class="hide-on-small-only">&nbsp;</p>
<p class="hide-on-small-only">&nbsp;</p> <p class="hide-on-small-only">&nbsp;</p>
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="nlp"></i> <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i>
</a> </a>
</div> </div>
</div> </div>
<div class="col s12 m9 pull-m3"> <div class="col s12 m9 pull-m3">
<div class="card service-color-border border-darken" data-service="nlp" style="border-top: 10px solid;"> <div class="card service-color-border border-darken" data-service="spacy-nlp" style="border-top: 10px solid;">
<div class="card-content"> <div class="card-content">
<div class="row"> <div class="row">
<div class="col s12 m6"> <div class="col s12 m6">
@ -71,7 +71,7 @@
{{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }} {{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }}
</div> </div>
<div class="col s12 l4"> <div class="col s12 l4">
{{ wtf.render_field(form.language, material_icon='language') }} {{ wtf.render_field(form.model, material_icon='language') }}
</div> </div>
<div class="col s12 l3"> <div class="col s12 l3">
{{ wtf.render_field(form.version, material_icon='apps') }} {{ wtf.render_field(form.version, material_icon='apps') }}
@ -80,13 +80,13 @@
<span class="card-title">Preprocessing</span> <span class="card-title">Preprocessing</span>
</div> </div>
<div class="col s9"> <div class="col s9">
<p>{{ form.check_encoding.label.text }}</p> <p>{{ form.encoding_detection.label.text }}</p>
<p class="light">If the input files are not created with the nopaque OCR service or you do not know if your text files are UTF-8 encoded, check this switch. We will try to automatically determine the right encoding for your texts to process them.</p> <p class="light">If the input files are not created with the nopaque OCR service or you do not know if your text files are UTF-8 encoded, check this switch. We will try to automatically determine the right encoding for your texts to process them.</p>
</div> </div>
<div class="col s3 right-align"> <div class="col s3 right-align">
<div class="switch"> <div class="switch">
<label> <label>
{{ form.check_encoding() }} {{ form.encoding_detection() }}
<span class="lever"></span> <span class="lever"></span>
</label> </label>
</div> </div>

View File

@ -2,7 +2,7 @@
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
{% import "materialize/wtf.html.j2" as wtf %} {% import "materialize/wtf.html.j2" as wtf %}
{% block main_attribs %} class="service-scheme" data-service="ocr"{% endblock main_attribs %} {% block main_attribs %} class="service-scheme" data-service="tesseract-ocr"{% endblock main_attribs %}
{% block page_content %} {% block page_content %}
<div class="container"> <div class="container">
@ -16,13 +16,13 @@
<p class="hide-on-small-only">&nbsp;</p> <p class="hide-on-small-only">&nbsp;</p>
<p class="hide-on-small-only">&nbsp;</p> <p class="hide-on-small-only">&nbsp;</p>
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i> <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
</a> </a>
</div> </div>
</div> </div>
<div class="col s12 m9 pull-m3"> <div class="col s12 m9 pull-m3">
<div class="card service-color-border border-darken" data-service="ocr" style="border-top: 10px solid;"> <div class="card service-color-border border-darken" data-service="tesseract-ocr" style="border-top: 10px solid;">
<div class="card-content"> <div class="card-content">
<div class="row"> <div class="row">
<div class="col s12"> <div class="col s12">
@ -50,10 +50,10 @@
{{ wtf.render_field(form.description, data_length='255', material_icon='description') }} {{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
</div> </div>
<div class="col s12 l5"> <div class="col s12 l5">
{{ wtf.render_field(form.files, accept='application/pdf', color=ocr_color_darken, placeholder='Choose your .pdf files') }} {{ wtf.render_field(form.files, accept='application/pdf', placeholder='Choose your .pdf files') }}
</div> </div>
<div class="col s12 l4"> <div class="col s12 l4">
{{ wtf.render_field(form.language, material_icon='language') }} {{ wtf.render_field(form.model, material_icon='language') }}
</div> </div>
<div class="col s12 l3"> <div class="col s12 l3">
{{ wtf.render_field(form.version, material_icon='apps') }} {{ wtf.render_field(form.version, material_icon='apps') }}
@ -127,7 +127,7 @@
</div> </div>
</div> </div>
<div class="card-action right-align"> <div class="card-action right-align">
{{ wtf.render_field(form.submit, color=ocr_color_darken, material_icon='send') }} {{ wtf.render_field(form.submit, material_icon='send') }}
</div> </div>
</form> </form>
</div> </div>

View File

@ -1,10 +0,0 @@
from app import hashids
from werkzeug.routing import BaseConverter
class HashidConverter(BaseConverter):
    """URL converter translating between hashid strings and integer ids.

    ``to_python`` is applied to the matched URL segment and ``to_url`` to the
    value passed when building a URL. Both delegate to the application-wide
    ``hashids`` object imported from ``app``.
    """

    def to_python(self, value: str) -> int:
        """Decode the hashid found in the URL into its integer id.

        Raises:
            ValidationError: if decoding yields no numbers, so the rule is
                treated as non-matching instead of failing with an
                ``IndexError`` on the empty result.
        """
        # Local import: only BaseConverter is imported at module level, and
        # ValidationError lives in the same already-used package.
        from werkzeug.routing import ValidationError

        # NOTE(review): assumes ``hashids.decode`` returns an empty sequence
        # for strings it cannot decode (the hashids library's documented
        # behaviour) -- confirm against the object created in ``app``.
        decoded = hashids.decode(value)
        if not decoded:
            raise ValidationError()
        return decoded[0]

    def to_url(self, value: int) -> str:
        """Encode the integer id as the hashid string used in URLs."""
        return hashids.encode(value)

View File

@ -5,14 +5,14 @@
version: "3.5" version: "3.5"
networks: networks:
reverse-proxy: traefik:
external: external: true
name: reverse-proxy name: "traefik"
services: services:
nopaque: nopaque:
labels: labels:
- "traefik.docker.network=reverse-proxy" - "traefik.docker.network=traefik"
- "traefik.enable=true" - "traefik.enable=true"
### <http> ### ### <http> ###
- "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http" - "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http"

View File

@ -0,0 +1,45 @@
"""Add tesseract_ocr_models table and job_results.description column
Revision ID: ad0d835fe5b1
Revises: 68ed092ffe5e
Create Date: 2022-01-18 16:23:45.673993
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = 'ad0d835fe5b1'
down_revision = '68ed092ffe5e'
branch_labels = None
depends_on = None
def upgrade():
    """Create ``tesseract_ocr_models`` and add ``job_results.description``."""
    # The operations below were originally auto-generated by Alembic; the
    # column order is kept exactly as generated.
    model_columns = [
        sa.Column('creation_date', sa.DateTime(), nullable=True),
        sa.Column('filename', sa.String(length=255), nullable=True),
        sa.Column('last_edited_date', sa.DateTime(), nullable=True),
        sa.Column('mimetype', sa.String(length=255), nullable=True),
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('user_id', sa.Integer(), nullable=True),
        sa.Column(
            'compatible_service_versions',
            sa.String(length=255),
            nullable=True
        ),
        sa.Column('description', sa.String(length=255), nullable=True),
        sa.Column('publisher', sa.String(length=128), nullable=True),
        sa.Column('publishing_year', sa.Integer(), nullable=True),
        sa.Column('title', sa.String(length=64), nullable=True),
        sa.Column('version', sa.String(length=16), nullable=True),
    ]
    model_constraints = [
        # Each model row may reference the user that owns it.
        sa.ForeignKeyConstraint(['user_id'], ['users.id']),
        sa.PrimaryKeyConstraint('id'),
    ]
    op.create_table(
        'tesseract_ocr_models',
        *model_columns,
        *model_constraints
    )

    description_column = sa.Column(
        'description',
        sa.String(length=255),
        nullable=True
    )
    op.add_column('job_results', description_column)
def downgrade():
    """Revert this revision: drop the added column, then the added table."""
    # Mirrors upgrade() in reverse order; originally auto-generated by Alembic.
    op.drop_column(table_name='job_results', column_name='description')
    op.drop_table(table_name='tesseract_ocr_models')

View File

@ -3,10 +3,9 @@
import eventlet import eventlet
eventlet.monkey_patch() eventlet.monkey_patch()
from app import db, cli, create_app # noqa
from app.models import (Corpus, CorpusFile, Job, JobInput, JobResult, from app.models import (Corpus, CorpusFile, Job, JobInput, JobResult,
Permission, QueryResult, Role, User) # noqa Permission, QueryResult, Role, TesseractOCRModel, User) # noqa
from app import db, cli, create_app # noqa
from flask import Flask # noqa from flask import Flask # noqa
from typing import Any, Dict # noqa from typing import Any, Dict # noqa
@ -34,5 +33,6 @@ def make_shell_context() -> Dict[str, Any]:
'Permission': Permission, 'Permission': Permission,
'QueryResult': QueryResult, 'QueryResult': QueryResult,
'Role': Role, 'Role': Role,
'TesseractOCRModel': TesseractOCRModel,
'User': User 'User': User
} }

View File

@ -19,5 +19,7 @@ hiredis
jsonschema jsonschema
psycopg2 psycopg2
python-dotenv python-dotenv
pyyaml
redis redis
tqdm
wtforms[email] wtforms[email]