Big update, corpus analysis reworked, versioned services, preliminary work for contributions

This commit is contained in:
Patrick Jentsch 2022-02-03 12:39:16 +01:00
parent 0647537192
commit fe938c0ca2
36 changed files with 1552 additions and 431 deletions

View File

@ -0,0 +1,816 @@
# - title: 'Afrikaans'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/afr.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Amharic'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Arabic'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Assamese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Azerbaijani'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Azerbaijani - Cyrillic'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Belarusian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Bengali'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Tibetan'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Bosnian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Bulgarian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Catalan; Valencian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Cebuano'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Czech'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Chinese - Simplified'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Chinese - Traditional'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Cherokee'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Welsh'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Danish'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'German'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Dzongkha'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Greek, Modern (1453-)'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'English'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'English, Middle (1100-1500)'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Esperanto'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Estonian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Basque'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Persian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Finnish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'French'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'German Fraktur'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'French, Middle (ca. 1400-1600)'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Irish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Galician'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Greek, Ancient (-1453)'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Gujarati'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Haitian; Haitian Creole'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Hebrew'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Hindi'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Croatian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Hungarian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Inuktitut'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Indonesian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Icelandic'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Italian'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'Italian - Old'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Javanese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Japanese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Kannada'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Georgian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Georgian - Old'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Kazakh'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Central Khmer'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Kirghiz; Kyrgyz'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Korean'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Kurdish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Lao'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Latin'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Latvian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Lithuanian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Malayalam'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Marathi'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Macedonian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Maltese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Malay'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Burmese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Nepali'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Dutch; Flemish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Norwegian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Oriya'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Panjabi; Punjabi'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Polish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Portuguese'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Pushto; Pashto'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Romanian; Moldavian; Moldovan'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Russian'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Sanskrit'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Sinhala; Sinhalese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Slovak'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Slovenian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
- title: 'Spanish; Castilian'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
- title: 'Spanish; Castilian - Old'
description: ''
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata'
publisher: 'tesseract-ocr'
publishing_year: 2021
version: '4.1.0'
compatible_service_versions:
- '0.1.0'
# - title: 'Albanian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Serbian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Serbian - Latin'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Swahili'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Swedish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Syriac'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Tamil'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Telugu'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Tajik'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Tagalog'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Thai'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Tigrinya'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Turkish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Uighur; Uyghur'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Ukrainian'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Urdu'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Uzbek'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Uzbek - Cyrillic'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Vietnamese'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'
# - title: 'Yiddish'
# description: ''
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata'
# publisher: 'tesseract-ocr'
# publishing_year: 2021
# version: '4.1.0'
# compatible_service_versions:
# - '0.1.0'

View File

@ -39,9 +39,6 @@ def create_app(config: Config = Config) -> Flask:
socketio.init_app(
app, message_queue=app.config['NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI'])
# from .utils import HashidConverter
# app.url_map.converters['hashid'] = HashidConverter
from .events import socketio as socketio_events
from .events import sqlalchemy as sqlalchemy_events
@ -54,6 +51,9 @@ def create_app(config: Config = Config) -> Flask:
from .auth import bp as auth_blueprint
app.register_blueprint(auth_blueprint, url_prefix='/auth')
from .contribute import bp as contribute_blueprint
app.register_blueprint(contribute_blueprint, url_prefix='/contribute')
from .corpora import bp as corpora_blueprint
app.register_blueprint(corpora_blueprint, url_prefix='/corpora')

View File

@ -1,7 +1,6 @@
from flask import Blueprint
from flask_restx import Api
from .jobs import ns as jobs_ns
from .tokens import ns as tokens_ns
bp = Blueprint('api', __name__)
@ -23,5 +22,4 @@ api = Api(
version='1.0'
)
api.add_namespace(jobs_ns)
api.add_namespace(tokens_ns)

View File

@ -9,8 +9,12 @@ token_auth = HTTPTokenAuth()
@basic_auth.verify_password
def verify_password(email_or_username, password):
    """Check HTTP basic auth credentials.

    The user is looked up either by username (compared as given) or by
    email address (compared against the lower-cased input).

    Args:
        email_or_username: Login name supplied by the client.
        password: Plain text password supplied by the client.

    Returns:
        The matching ``User`` if one exists and the password verifies.
    """
    # One lookup is enough; running the identical query twice only costs an
    # extra database round trip.
    user = User.query.filter(
        or_(
            User.username == email_or_username,
            User.email == email_or_username.lower()
        )
    ).first()
    if user and user.verify_password(password):
        return user

View File

@ -1,48 +0,0 @@
from flask_restx import Namespace, Resource
from .auth import token_auth
from ..jobs import tasks
from ..models import Job
ns = Namespace('jobs', description='Job operations')
@ns.route('')
class API_Jobs(Resource):
    '''Shows a list of all jobs and lets you POST to add new job'''

    @ns.doc(security='apiKey')
    @token_auth.login_required
    def get(self):
        '''List all jobs'''
        # TODO: Implement the correct get_jobs functionality
        # Every job in the table is serialized, without its relationships.
        return [
            job.to_dict(include_relationships=False)
            for job in Job.query.all()
        ]

    @ns.doc(security='apiKey')
    @token_auth.login_required
    def post(self):
        '''Create a new job'''
        # TODO: Implement this
        # Placeholder: nothing is created and no payload is returned yet.
        return None
@ns.route('/<hashid:id>')
class API_Job(Resource):
    '''Show a single job and lets you delete it'''

    @ns.doc(security='apiKey')
    @token_auth.login_required
    def get(self, id):
        '''Get a job by id'''
        # Serialize the requested job without its relationships.
        requested_job = Job.query.get_or_404(id)
        job_data = requested_job.to_dict(include_relationships=False)
        return job_data

    @ns.doc(security='apiKey')
    @token_auth.login_required
    def delete(self, id):
        '''Delete a job by id'''
        target_job = Job.query.get_or_404(id)
        # We use this imported task because it will run in the background
        tasks.delete_job(target_job.id)
        # Empty body with status "204 No Content"
        no_content = ('', 204)
        return no_content

View File

@ -60,28 +60,37 @@ def register():
return redirect(url_for('main.dashboard'))
form = RegistrationForm(prefix='registration-form')
if form.validate_on_submit():
user = User(email=form.email.data.lower(),
password=form.password.data,
username=form.username.data)
user = User(
email=form.email.data.lower(),
password=form.password.data,
username=form.username.data
)
db.session.add(user)
db.session.commit()
db.session.flush(objects=[user])
db.session.refresh(user)
try:
os.makedirs(user.path)
except OSError:
current_app.logger.error(
f'Make dir {user.path} led to an OSError!')
db.session.delete(user)
db.session.commit()
user.makedirs()
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
abort(500)
else:
token = user.generate_confirmation_token()
msg = create_message(user.email, 'Confirm Your Account',
'auth/email/confirm', token=token, user=user)
msg = create_message(
user.email,
'Confirm Your Account',
'auth/email/confirm',
token=token,
user=user
)
send(msg)
flash('A confirmation email has been sent to you by email.')
return redirect(url_for('.login'))
return render_template('auth/register.html.j2', form=form,
title='Register')
return render_template(
'auth/register.html.j2',
form=form,
title='Register'
)
@bp.route('/confirm/<token>')

View File

@ -1,16 +1,44 @@
from . import db
from .models import Corpus, Role
from flask import current_app
from flask_migrate import upgrade
from . import db
from .models import Corpus, Job, Role, User, TesseractOCRModel
import json
import os
import re
def _make_default_dirs():
    """Create the ``tmp`` and ``users`` directories below NOPAQUE_DATA_DIR.

    Directories that already exist are left alone.

    Raises:
        NotADirectoryError: If one of the paths exists but is not a
            directory.
    """
    data_root = current_app.config['NOPAQUE_DATA_DIR']
    required_paths = [
        os.path.join(data_root, subdir_name)
        for subdir_name in ('tmp', 'users')
    ]
    for path in required_paths:
        if not os.path.exists(path):
            # Only the leaf is created; the data root itself must exist.
            os.mkdir(path)
            continue
        if not os.path.isdir(path):
            raise NotADirectoryError(f'{path} is not a directory')
def register(app):
@app.cli.command()
def deploy():
    ''' Run deployment tasks. '''
    # Order: data directories first, then the database schema, then the
    # default rows that live in that schema.
    # Make default directories
    _make_default_dirs()
    # migrate database to latest revision
    upgrade()
    # create or update user roles
    # NOTE(review): roles are seeded here and again via
    # Role.insert_defaults() below - confirm whether both calls are needed.
    Role.insert_roles()
    # Insert/Update default database values
    current_app.logger.info('Insert/Update default roles')
    Role.insert_defaults()
    current_app.logger.info('Insert/Update default users')
    User.insert_defaults()
    current_app.logger.info('Insert/Update default tesseract ocr models')
    TesseractOCRModel.insert_defaults()
@app.cli.group()
def daemon():
@ -40,3 +68,55 @@ def register(app):
from unittest.suite import TestSuite
tests: TestSuite = TestLoader().discover('tests')
TextTestRunner(verbosity=2).run(tests)
@app.cli.group()
def convert():
    ''' Database convert commands. '''
@convert.command()
def nlp_jobs():
    # Rewrites every job of the legacy 'nlp' service to 'spacy-nlp' and
    # turns its JSON encoded argument list into a JSON encoded mapping.
    # Arguments that are neither '--check-encoding' nor a '-l xx' language
    # flag are dropped.
    language_flag = re.compile(r'-l ([a-z]{2})')
    legacy_jobs = Job.query.filter_by(service='nlp').all()
    for job in legacy_jobs:
        job.service = 'spacy-nlp'
        converted_args = {}
        for legacy_arg in json.loads(job.service_args):
            if legacy_arg == '--check-encoding':
                converted_args['encoding_detection'] = True
                continue
            language_match = language_flag.match(legacy_arg)
            if language_match is not None:
                converted_args['language'] = language_match.group(1)
        job.service_args = json.dumps(converted_args)
    # All converted jobs are persisted together.
    db.session.commit()
@convert.command()
def ocr_jobs():
    ''' Convert legacy "ocr" jobs to the "tesseract-ocr" service. '''
    # The JSON encoded argument list of each job becomes a JSON encoded
    # mapping: '--binarize' turns into binarization=True and '-l <code>'
    # into the id of the TesseractOCRModel with the matching title. Other
    # arguments are dropped.
    #
    # Raises KeyError for a language code missing from the lookup below and
    # LookupError if no model row exists for a known title. Nothing is
    # committed in either case.

    # Language code to TesseractOCRModel.title lookup
    language_code_lookup = {
        'ara': 'Arabic',
        'chi_tra': 'Chinese - Traditional',
        'dan': 'Danish',
        'eng': 'English',
        'enm': 'English, Middle (1100-1500)',
        'fra': 'French',
        'frm': 'French, Middle (ca. 1400-1600)',
        'deu': 'German',
        'frk': 'German Fraktur',
        'ell': 'Greek, Modern (1453-)',
        'ita': 'Italian',
        'por': 'Portuguese',
        'rus': 'Russian',
        'spa': 'Spanish; Castilian'
    }
    # Three letters plus an optional '_suffix', so that codes such as
    # 'chi_tra' are captured whole instead of being cut down to 'chi'.
    language_arg_pattern = re.compile(r'-l ([a-z]{3}(?:_[a-z]+)?)')
    for job in Job.query.filter_by(service='ocr').all():
        job.service = 'tesseract-ocr'
        service_args = json.loads(job.service_args)
        new_service_args = {}
        for service_arg in service_args:
            if service_arg == '--binarize':
                new_service_args['binarization'] = True
                continue
            language_match = language_arg_pattern.match(service_arg)
            if language_match is None:
                continue
            language_code = language_match.group(1)
            if language_code not in language_code_lookup:
                raise KeyError(
                    f'Job {job.id}: unknown OCR language code '
                    f'"{language_code}"'
                )
            model_title = language_code_lookup[language_code]
            tesseract_ocr_model = TesseractOCRModel.query.filter_by(
                title=model_title
            ).first()
            if tesseract_ocr_model is None:
                # Fail with context instead of an AttributeError on None
                raise LookupError(
                    f'Job {job.id}: no TesseractOCRModel with title '
                    f'"{model_title}" found'
                )
            new_service_args['model'] = tesseract_ocr_model.id
        job.service_args = json.dumps(new_service_args)
    db.session.commit()

View File

@ -0,0 +1,5 @@
from flask import Blueprint
bp = Blueprint('contribute', __name__)
from . import routes

19
app/contribute/routes.py Normal file
View File

@ -0,0 +1,19 @@
from flask import flash, redirect, render_template, url_for
from flask_login import login_required
from . import bp
from .. import db
from ..decorators import permission_required
from ..models import Permission, Role, User
from ..settings import tasks as settings_tasks
@bp.before_request
@login_required
@permission_required(Permission.CONTRIBUTE)
def before_request():
    """Guard hook registered for requests handled by this blueprint.

    The body is intentionally empty: the work is done by the decorators.
    ``login_required`` demands an authenticated user; ``permission_required``
    presumably rejects users lacking ``Permission.CONTRIBUTE`` (verify
    against ``..decorators``).
    """
    pass
@bp.route('/')
def index():
    """Landing page of the contribute blueprint (placeholder).

    Returns:
        An empty body with status 501 until the page is implemented.
    """
    # A Flask view must not return None - that raises a TypeError and ends
    # in a 500 response. Answer with an explicit "Not Implemented" instead.
    return '', 501

View File

@ -93,12 +93,12 @@ def connect(auth):
@socketio.on('disconnect', namespace=NAMESPACE)
def disconnect():
if 'd' not in session:
return
session['d']['cqi_client_lock'].acquire()
try:
session['d']['cqi_client'].disconnect()
except cqi.errors.CQiException:
pass
except BrokenPipeError:
except (BrokenPipeError, cqi.errors.CQiException):
pass
session['d']['cqi_client_lock'].release()
corpus = Corpus.query.get(session['d']['corpus_id'])

View File

@ -12,7 +12,10 @@ def cqi_over_socketio(f):
f_args = {}
# Check for missing args and if all provided args are of the right type
for param in signature(f).parameters.values():
if param.annotation == cqi.CQiClient:
if param.name == 'corpus_name':
f_args[param.name] = f'NOPAQUE_{session["d"]["corpus_id"]}'
continue
if param.name == 'cqi_client':
f_args[param.name] = session['d']['cqi_client']
continue
if param.default is param.empty:

View File

@ -1,6 +1,7 @@
from flask import (abort, current_app, flash, make_response, redirect,
render_template, url_for, send_from_directory)
from flask_login import current_user, login_required
from werkzeug.utils import secure_filename
from . import bp
from . import tasks
from .forms import (AddCorpusFileForm, AddCorpusForm, EditCorpusFileForm,
@ -29,18 +30,20 @@ def add_corpus():
db.session.flush()
db.session.refresh(corpus)
try:
os.makedirs(corpus.path)
corpus.makedirs()
except OSError as e:
current_app.logger.error(f'Could not add corpus: {e}')
current_app.logger.error(e)
db.session.rollback()
flash('Internal Server Error', 'error')
abort(500)
else:
db.session.commit()
flash(f'Corpus "{corpus.title}" added!', 'corpus')
return redirect(url_for('.corpus', corpus_id=corpus.id))
return render_template('corpora/add_corpus.html.j2', form=form,
title='Add corpus')
db.session.commit()
flash(f'Corpus "{corpus.title}" added', 'corpus')
return redirect(url_for('.corpus', corpus_id=corpus.id))
return render_template(
'corpora/add_corpus.html.j2',
form=form,
title='Add corpus'
)
@bp.route('/import', methods=['GET', 'POST'])
@ -174,7 +177,7 @@ def add_corpus_file(corpus_id):
if not form.validate():
return make_response(form.errors, 400)
# Save the file
form.file.data.save(os.path.join(corpus.path, form.file.data.filename))
filename = secure_filename(form.file.data.filename)
corpus_file = CorpusFile(
address=form.address.data,
author=form.author.data,
@ -182,9 +185,10 @@ def add_corpus_file(corpus_id):
chapter=form.chapter.data,
corpus=corpus,
editor=form.editor.data,
filename=form.file.data.filename,
filename=filename,
institution=form.institution.data,
journal=form.journal.data,
mimetype='application/vrt+xml',
pages=form.pages.data,
publisher=form.publisher.data,
publishing_year=form.publishing_year.data,
@ -192,12 +196,25 @@ def add_corpus_file(corpus_id):
title=form.title.data
)
db.session.add(corpus_file)
db.session.flush(objects=[corpus_file])
db.session.refresh(corpus_file)
try:
form.file.data.save(corpus_file.path)
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
flash('Internal Server Error', 'error')
return make_response({'redirect_url': url_for('.add_corpus_file', corpus_id=corpus.id)}, 500) # noqa
corpus.status = 'unprepared'
db.session.commit()
flash(f'Corpus file "{corpus_file.filename}" added!', 'corpus')
flash(f'Corpus file "{corpus_file.title}" added!', 'corpus')
return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201) # noqa
return render_template('corpora/add_corpus_file.html.j2', corpus=corpus,
form=form, title='Add corpus file')
return render_template(
'corpora/add_corpus_file.html.j2',
corpus=corpus,
form=form,
title='Add corpus file'
)
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/delete')

View File

@ -17,11 +17,7 @@ class Daemon(CheckCorporaMixin, CheckJobsMixin):
def run(self):
while True:
try:
self.check_corpora()
self.check_jobs()
db.session.commit()
except Exception as e:
current_app.logger.warning(e)
pass
self.check_corpora()
self.check_jobs()
db.session.commit()
sleep(1.5)

View File

@ -26,37 +26,55 @@ class CheckCorporaMixin:
def create_build_corpus_service(self, corpus):
''' # Docker service settings # '''
''' ## Command ## '''
command = 'docker-entrypoint.sh build-corpus'
command = ['bash', '-c']
command.append(
f'mkdir /corpora/data/nopaque_{corpus.id}'
' && '
'cwb-encode'
' -c utf8'
f' -d /corpora/data/nopaque_{corpus.id}'
' -f /root/files/corpus.vrt'
f' -R /usr/local/share/cwb/registry/nopaque_{corpus.id}'
' -P pos -P lemma -P simple_pos'
' -S ent:0+type -S s:0'
' -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title' # noqa
' -xsB -9'
' && '
f'cwb-make -V NOPAQUE_{corpus.id}'
)
''' ## Constraints ## '''
constraints = ['node.role==worker']
''' ## Image ## '''
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674' # noqa
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
''' ## Labels ## '''
labels = {
'origin': current_app.config['SERVER_NAME'],
'type': 'build-corpus',
'type': 'corpus.build',
'corpus_id': str(corpus.id)
}
''' ## Mounts ## '''
''' ### Corpus file mount ### '''
corpus_file_source = os.path.join(corpus.path, 'merged', 'corpus.vrt')
corpus_file_target = '/root/files/corpus.vrt'
corpus_file_mount = f'{corpus_file_source}:{corpus_file_target}:ro'
''' ### Corpus data mount ### '''
corpus_data_source = os.path.join(corpus.path, 'data')
corpus_data_target = '/corpora/data'
corpus_data_mount = f'{corpus_data_source}:{corpus_data_target}:rw'
# Make sure that there is no data in the corpus data directory
shutil.rmtree(corpus_data_source, ignore_errors=True)
os.mkdir(corpus_data_source)
''' ### Corpus registry mount ### '''
corpus_registry_source = os.path.join(corpus.path, 'registry')
corpus_registry_target = '/usr/local/share/cwb/registry'
corpus_registry_mount = f'{corpus_registry_source}:{corpus_registry_target}:rw' # noqa
# Make sure that there is no data in the corpus registry directory
shutil.rmtree(corpus_registry_source, ignore_errors=True)
os.mkdir(corpus_registry_source)
mounts = [corpus_file_mount, corpus_data_mount, corpus_registry_mount]
mounts = []
''' ### Data mount ### '''
data_mount_source = os.path.join(corpus.path, 'cwb', 'data')
data_mount_target = '/corpora/data'
data_mount = f'{data_mount_source}:{data_mount_target}:rw'
# Make sure that there is no data in the data directory
shutil.rmtree(data_mount_source, ignore_errors=True)
os.makedirs(data_mount_source)
mounts.append(data_mount)
''' ### File mount ### '''
file_mount_source = os.path.join(corpus.path, 'cwb', 'corpus.vrt')
file_mount_target = '/root/files/corpus.vrt'
file_mount = f'{file_mount_source}:{file_mount_target}:ro'
mounts.append(file_mount)
''' ### Registry mount ### '''
registry_mount_source = os.path.join(corpus.path, 'cwb', 'registry')
registry_mount_target = '/usr/local/share/cwb/registry'
registry_mount = f'{registry_mount_source}:{registry_mount_target}:rw'
# Make sure that there is no data in the registry directory
shutil.rmtree(registry_mount_source, ignore_errors=True)
os.makedirs(registry_mount_source)
mounts.append(registry_mount)
''' ## Name ## '''
name = f'build-corpus_{corpus.id}'
''' ## Restart policy ## '''
@ -74,7 +92,7 @@ class CheckCorporaMixin:
except docker.errors.APIError as e:
current_app.logger.error(
f'Create service "{name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)
return
corpus.status = 'queued'
@ -86,14 +104,14 @@ class CheckCorporaMixin:
except docker.errors.NotFound as e:
current_app.logger.error(
f'Get service "{service_name}" failed '
+ f'due to "docker.errors.NotFound": {e}'
f'due to "docker.errors.NotFound": {e}'
)
corpus.status = 'failed'
return
except docker.errors.APIError as e:
current_app.logger.error(
f'Get service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)
service_tasks = service.tasks()
if not service_tasks:
@ -108,36 +126,47 @@ class CheckCorporaMixin:
corpus.status = 'failed'
else:
return
try:
service.remove()
except docker.errors.APIError as e:
current_app.logger.error(
f'Remove service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}'
)
# try:
# service.remove()
# except docker.errors.APIError as e:
# current_app.logger.error(
# f'Remove service "{service_name}" failed '
# f'due to "docker.errors.APIError": {e}'
# )
def create_cqpserver_container(self, corpus):
''' # Docker container settings # '''
''' ## Command ## '''
command = 'cqpserver'
command = []
command.append(
'echo "host *;" > cqpserver.init'
' && '
'echo "user anonymous \\"\\";" >> cqpserver.init'
' && '
'cqpserver -I cqpserver.init'
)
''' ## Detach ## '''
detach = True
''' ## Entrypoint ## '''
entrypoint = ['bash', '-c']
''' ## Image ## '''
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674' # noqa
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
''' ## Name ## '''
name = f'cqpserver_{corpus.id}'
''' ## Network ## '''
network = 'nopaque_default'
''' ## Volumes ## '''
volumes = []
''' ### Corpus data volume ### '''
corpus_data_source = os.path.join(corpus.path, 'data')
corpus_data_target = '/corpora/data'
corpus_data_volume = f'{corpus_data_source}:{corpus_data_target}:rw'
data_volume_source = os.path.join(corpus.path, 'cwb', 'data')
data_volume_target = '/corpora/data'
data_volume = f'{data_volume_source}:{data_volume_target}:rw'
volumes.append(data_volume)
''' ### Corpus registry volume ### '''
corpus_registry_source = os.path.join(corpus.path, 'registry')
corpus_registry_target = '/usr/local/share/cwb/registry'
corpus_registry_volume = f'{corpus_registry_source}:{corpus_registry_target}:rw' # noqa
volumes = [corpus_data_volume, corpus_registry_volume]
registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry')
registry_volume_target = '/usr/local/share/cwb/registry'
registry_volume = f'{registry_volume_source}:{registry_volume_target}:rw' # noqa
volumes.append(registry_volume)
# Check if a cqpserver container already exists. If this is the case,
# remove it and create a new one
try:
@ -147,7 +176,7 @@ class CheckCorporaMixin:
except docker.errors.APIError as e:
current_app.logger.error(
f'Get container "{name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)
return
else:
@ -156,7 +185,7 @@ class CheckCorporaMixin:
except docker.errors.APIError as e:
current_app.logger.error(
f'Remove container "{name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)
return
try:
@ -164,6 +193,7 @@ class CheckCorporaMixin:
image,
command=command,
detach=detach,
entrypoint=entrypoint,
volumes=volumes,
name=name,
network=network
@ -171,14 +201,14 @@ class CheckCorporaMixin:
except docker.errors.ImageNotFound as e:
current_app.logger.error(
f'Run container "{name}" failed '
+ f'due to "docker.errors.ImageNotFound" error: {e}'
f'due to "docker.errors.ImageNotFound" error: {e}'
)
corpus.status = 'failed'
return
except docker.errors.APIError as e:
current_app.logger.error(
f'Run container "{name}" failed '
+ f'due to "docker.errors.APIError" error: {e}'
f'due to "docker.errors.APIError" error: {e}'
)
return
corpus.status = 'analysing'
@ -190,14 +220,14 @@ class CheckCorporaMixin:
except docker.errors.NotFound as e:
current_app.logger.error(
f'Get container "{container_name}" failed '
+ f'due to "docker.errors.NotFound": {e}'
f'due to "docker.errors.NotFound": {e}'
)
corpus.num_analysis_sessions = 0
corpus.status = 'prepared'
except docker.errors.APIError as e:
current_app.logger.error(
f'Get container "{container_name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)
def remove_cqpserver_container(self, corpus):
@ -210,7 +240,7 @@ class CheckCorporaMixin:
except docker.errors.APIError as e:
current_app.logger.error(
f'Get container "{container_name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)
return
try:
@ -218,5 +248,5 @@ class CheckCorporaMixin:
except docker.errors.APIError as e:
current_app.logger.error(
f'Remove container "{container_name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)

View File

@ -2,7 +2,7 @@ from datetime import datetime
from flask import current_app
from werkzeug.utils import secure_filename
from .. import db
from ..models import Job, JobResult
from ..models import Job, JobResult, TesseractOCRModel
import docker
import json
import os
@ -23,27 +23,34 @@ class CheckJobsMixin:
''' # Docker service settings # '''
''' ## Service specific settings ## '''
if job.service == 'file-setup':
mem_mb = 2048
mem_mb = 512
n_cores = 2
executable = 'file-setup'
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:{job.service_version}' # noqa
elif job.service == 'ocr':
mem_mb = 4096
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:v{job.service_version}' # noqa
elif job.service == 'tesseract-ocr':
mem_mb = 2048
n_cores = 4
executable = 'ocr'
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:{job.service_version}' # noqa
elif job.service == 'nlp':
mem_mb = 2048
n_cores = 2
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:v{job.service_version}' # noqa
elif job.service == 'spacy-nlp':
mem_mb = 1024
n_cores = 1
executable = 'nlp'
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:{job.service_version}' # noqa
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:v{job.service_version}' # noqa
''' ## Command ## '''
command = f'{executable} -i /input -o /output'
command += ' --log-dir /input'
command += ' --log-dir /logs'
command += f' --mem-mb {mem_mb}'
command += f' --n-cores {n_cores}'
command += f' --zip [{job.service}]_{secure_filename(job.title)}'
command += ' ' + ' '.join(json.loads(job.service_args))
service_args = json.loads(job.service_args)
if job.service == 'spacy-nlp':
command += f' -m {service_args["model"]}'
if 'encoding_detection' in service_args and service_args['encoding_detection']: # noqa
command += ' --check-encoding'
elif job.service == 'tesseract-ocr':
command += f' -m {service_args["model"]}'
if 'binarization' in service_args and service_args['binarization']:
command += ' --binarize'
''' ## Constraints ## '''
constraints = ['node.role==worker']
''' ## Labels ## '''
@ -53,20 +60,42 @@ class CheckJobsMixin:
'job_id': str(job.id)
}
''' ## Mounts ## '''
''' ### Input mount ### '''
input_mount_source = job.path
input_mount_target = '/input'
mounts = []
''' ### Input mount(s) ### '''
input_mount_target_base = '/input'
if job.service == 'file-setup':
input_mount_target += f'/{secure_filename(job.title)}'
input_mount = f'{input_mount_source}:{input_mount_target}:rw'
input_mount_target_base += f'/{secure_filename(job.title)}'
for job_input in job.inputs:
input_mount_source = job_input.path
input_mount_target = f'/{input_mount_target_base}/{job_input.filename}' # noqa
input_mount = f'{input_mount_source}:{input_mount_target}:ro'
mounts.append(input_mount)
if job.service == 'tesseract-ocr':
service_args = json.loads(job.service_args)
model = TesseractOCRModel.query.get(service_args['model'])
if model is None:
job.status = 'failed'
return
models_mount_source = model.path
models_mount_target = f'/usr/local/share/tessdata/{model.filename}'
models_mount = f'{models_mount_source}:{models_mount_target}:ro'
mounts.append(models_mount)
''' ### Output mount ### '''
output_mount_source = os.path.join(job.path, 'output')
output_mount_source = os.path.join(job.path, 'results')
output_mount_target = '/output'
output_mount = f'{output_mount_source}:{output_mount_target}:rw'
# Make sure that there is no data in the output directory
shutil.rmtree(output_mount_source, ignore_errors=True)
os.makedirs(output_mount_source)
mounts = [input_mount, output_mount]
mounts.append(output_mount)
''' ### Pipeline data mount ### '''
pyflow_data_mount_source = os.path.join(job.path, 'pipeline_data')
pyflow_data_mount_target = '/logs/pyflow.data'
pyflow_data_mount = f'{pyflow_data_mount_source}:{pyflow_data_mount_target}:rw' # noqa
# Make sure that there is no data in the pipeline data directory
shutil.rmtree(pyflow_data_mount_source, ignore_errors=True)
os.makedirs(pyflow_data_mount_source)
mounts.append(pyflow_data_mount)
''' ## Name ## '''
name = f'job_{job.id}'
''' ## Resources ## '''
@ -90,7 +119,7 @@ class CheckJobsMixin:
except docker.errors.APIError as e:
current_app.logger.error(
f'Create service "{name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)
return
job.status = 'queued'
@ -102,14 +131,14 @@ class CheckJobsMixin:
except docker.errors.NotFound as e:
current_app.logger.error(
f'Get service "{service_name}" failed '
+ f'due to "docker.errors.NotFound": {e}'
f'due to "docker.errors.NotFound": {e}'
)
job.status = 'failed'
return
except docker.errors.APIError as e:
current_app.logger.error(
f'Get service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)
return
service_tasks = service.tasks()
@ -121,13 +150,25 @@ class CheckJobsMixin:
return
elif job.status == 'running' and task_state == 'complete':
job.status = 'complete'
results_dir = os.path.join(job.path, 'output')
result_files = [x for x in os.listdir(results_dir) if x.endswith('.zip')] # noqa
for result_file in result_files:
job_result = JobResult(filename=result_file, job=job)
results_dir = os.path.join(job.path, 'results')
with open(os.path.join(results_dir, 'outputs.json')) as f:
outputs = json.load(f)
for output in outputs:
filename = os.path.basename(output['file'])
job_result = JobResult(
filename=filename,
job=job,
mimetype=output['mimetype']
)
if 'description' in output:
job_result.description = output['description']
db.session.add(job_result)
db.session.flush()
db.session.flush(objects=[job_result])
db.session.refresh(job_result)
os.rename(
os.path.join(results_dir, output['file']),
job_result.path
)
elif job.status == 'running' and task_state == 'failed':
job.status = 'failed'
else:
@ -138,7 +179,7 @@ class CheckJobsMixin:
except docker.errors.APIError as e:
current_app.logger.error(
f'Remove service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)
def remove_job_service(self, job):
@ -151,7 +192,7 @@ class CheckJobsMixin:
except docker.errors.APIError as e:
current_app.logger.error(
f'Get service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)
return
try:
@ -159,7 +200,7 @@ class CheckJobsMixin:
except docker.errors.APIError as e:
current_app.logger.error(
f'Update service "{service_name}" failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)
return
try:
@ -167,5 +208,5 @@ class CheckJobsMixin:
except docker.errors.APIError as e:
current_app.logger.error(
f'Remove "{service_name}" service failed '
+ f'due to "docker.errors.APIError": {e}'
f'due to "docker.errors.APIError": {e}'
)

View File

@ -34,12 +34,14 @@ def delete_job(job_id):
@login_required
def download_job_input(job_id, job_input_id):
job_input = JobInput.query.filter(JobInput.job_id == job_id, JobInput.id == job_input_id).first_or_404() # noqa
if not (job_input.job.user == current_user
or current_user.is_administrator()):
if not (job_input.job.user == current_user or current_user.is_administrator()): # noqa
abort(403)
return send_from_directory(as_attachment=True,
directory=os.path.dirname(job_input.path),
filename=job_input.filename)
return send_from_directory(
as_attachment=True,
attachment_filename=job_input.filename,
directory=os.path.dirname(job_input.path),
filename=os.path.basename(job_input.path)
)
@bp.route('/<hashid:job_id>/restart')
@ -59,9 +61,11 @@ def restart(job_id):
@login_required
def download_job_result(job_id, job_result_id):
job_result = JobResult.query.filter(JobResult.job_id == job_id, JobResult.id == job_result_id).first_or_404() # noqa
if not (job_result.job.user == current_user
or current_user.is_administrator()):
if not (job_result.job.user == current_user or current_user.is_administrator()): # noqa
abort(403)
return send_from_directory(as_attachment=True,
directory=os.path.dirname(job_result.path),
filename=job_result.filename)
return send_from_directory(
as_attachment=True,
attachment_filename=job_result.filename,
directory=os.path.dirname(job_result.path),
filename=os.path.basename(job_result.path)
)

View File

@ -4,13 +4,17 @@ from flask_hashids import HashidMixin
from flask_login import UserMixin
from itsdangerous import BadSignature, TimedJSONWebSignatureSerializer
from time import sleep
from tqdm import tqdm
from werkzeug.security import generate_password_hash, check_password_hash
import xml.etree.ElementTree as ET
from . import db, login
import base64
import enum
import json
import os
import requests
import shutil
import xml.etree.ElementTree as ET
import yaml
class Permission(enum.IntEnum):
@ -25,7 +29,7 @@ class Permission(enum.IntEnum):
class FileMixin:
creation_date = db.Column(db.DateTime, default=datetime.utcnow)
filename = db.Column(db.String(256))
filename = db.Column(db.String(255))
last_edited_date = db.Column(db.DateTime, default=datetime.utcnow)
mimetype = db.Column(db.String(255))
@ -86,7 +90,7 @@ class Role(HashidMixin, db.Model):
return dict_role
@staticmethod
def insert_roles():
def insert_defaults():
roles = {
'User': [],
'API user': [Permission.USE_API],
@ -132,6 +136,12 @@ class User(HashidMixin, UserMixin, db.Model):
db.String(16), default='all')
# Backrefs: role: Role
# Relationships
tesseract_ocr_models = db.relationship(
'TesseractOCRModel',
backref='user',
cascade='all, delete-orphan',
lazy='dynamic'
)
corpora = db.relationship(
'Corpus',
backref='user',
@ -221,6 +231,12 @@ class User(HashidMixin, UserMixin, db.Model):
def is_administrator(self):
    '''Return the result of the ADMINISTRATE permission check.'''
    has_admin_permission = self.can(Permission.ADMINISTRATE)
    return has_admin_permission
def makedirs(self):
    '''
    Create the user directory and its fixed set of subdirectories.

    Raises an OSError (from os.mkdir) if one of the directories can not be
    created, e.g. because it already exists.
    '''
    os.mkdir(self.path)
    for subdirectory in ('tesseract_ocr_models', 'corpora', 'jobs'):
        os.mkdir(os.path.join(self.path, subdirectory))
def revoke_token(self):
self.token_expiration = datetime.utcnow() - timedelta(seconds=1)
@ -269,6 +285,21 @@ class User(HashidMixin, UserMixin, db.Model):
return None
return user
@staticmethod
def insert_defaults():
    '''
    Create the default "nopaque" user together with its directories.

    Does nothing if a user with that username already exists. If the
    directories can not be created, the error is logged and the pending
    user is rolled back; the commit only happens on success.
    '''
    if User.query.filter_by(username='nopaque').first() is not None:
        return
    user = User(username='nopaque')
    db.session.add(user)
    # Flush and refresh before touching the file system - presumably
    # user.path depends on the database generated id (confirm in User.path)
    db.session.flush(objects=[user])
    db.session.refresh(user)
    try:
        user.makedirs()
    except OSError as e:
        current_app.logger.error(e)
        db.session.rollback()
    else:
        db.session.commit()
@staticmethod
def reset_password(token, new_password):
s = TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY'])
@ -284,6 +315,72 @@ class User(HashidMixin, UserMixin, db.Model):
return True
class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
    '''
    Database model describing a Tesseract OCR model file owned by a user.

    The row holds the metadata, the model file itself is stored at `path`.
    '''
    __tablename__ = 'tesseract_ocr_models'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    # JSON formatted list (insert_defaults stores it via json.dumps)
    compatible_service_versions = db.Column(db.String(255))
    description = db.Column(db.String(255))
    publisher = db.Column(db.String(128))
    publishing_year = db.Column(db.Integer)
    title = db.Column(db.String(64))
    version = db.Column(db.String(16))
    # Backrefs: user: User

    @property
    def path(self):
        '''File system location of the model below the owner's directory.'''
        return os.path.join(
            self.user.path,
            'tesseract_ocr_models',
            str(self.id)
        )

    @staticmethod
    def insert_defaults():
        '''
        Insert the models listed in TesseractOCRModel.defaults.yml.

        Every entry that does not exist yet (same title and version) is
        added for the "nopaque" user and its file is downloaded from the
        entry's url to the model's path. All new rows are committed at the
        end.

        Raises requests.HTTPError if a download answers with an error
        status, so that an error page is never stored as a model file.
        '''
        user = User.query.filter_by(username='nopaque').first()
        defaults_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'TesseractOCRModel.defaults.yml'
        )
        with open(defaults_file, 'r') as f:
            # safe_load returns None for a file without (uncommented) entries
            defaults = yaml.safe_load(f) or []
        for m in defaults:
            if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None:  # noqa
                continue
            tesseract_ocr_model = TesseractOCRModel(
                compatible_service_versions=json.dumps(m['compatible_service_versions']),  # noqa
                description=m['description'],
                publisher=m['publisher'],
                publishing_year=m['publishing_year'],
                title=m['title'],
                user=user,
                version=m['version']
            )
            db.session.add(tesseract_ocr_model)
            # The id is needed for the filename and the path
            db.session.flush(objects=[tesseract_ocr_model])
            db.session.refresh(tesseract_ocr_model)
            tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata'  # noqa
            # The context managers close the connection and the progress
            # bar even if the download fails halfway through
            with requests.get(m['url'], stream=True) as r:
                r.raise_for_status()
                # The header is missing for e.g. chunked responses, tqdm
                # then shows a progress bar without a total
                content_length = r.headers.get('Content-Length')
                with tqdm(
                    desc=f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})',  # noqa
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    total=None if content_length is None else int(content_length)  # noqa
                ) as pbar:
                    pbar.clear()
                    with open(tesseract_ocr_model.path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive new chunks
                                pbar.update(len(chunk))
                                f.write(chunk)
        db.session.commit()
class JobInput(FileMixin, HashidMixin, db.Model):
__tablename__ = 'job_inputs'
# Primary key
@ -309,7 +406,7 @@ class JobInput(FileMixin, HashidMixin, db.Model):
@property
def path(self):
return os.path.join(self.job.path, self.filename)
return os.path.join(self.job.path, 'inputs', str(self.id))
def to_dict(self, backrefs=False, relationships=False):
dict_job_input = {
@ -347,6 +444,8 @@ class JobResult(FileMixin, HashidMixin, db.Model):
id = db.Column(db.Integer, primary_key=True)
# Foreign keys
job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
# Fields
description = db.Column(db.String(255))
# Backrefs: job: Job
def __repr__(self):
@ -366,12 +465,13 @@ class JobResult(FileMixin, HashidMixin, db.Model):
@property
def path(self):
return os.path.join(self.job.path, 'output', self.filename)
return os.path.join(self.job.path, 'results', str(self.id))
def to_dict(self, backrefs=False, relationships=False):
dict_job_result = {
'id': self.hashid,
'job_id': self.job.hashid,
'description': self.description,
'download_url': self.download_url,
'url': self.url,
**self.file_mixin_to_dict(
@ -414,8 +514,8 @@ class Job(HashidMixin, db.Model):
end_date = db.Column(db.DateTime())
service = db.Column(db.String(64))
'''
' Service specific arguments as string list.
' Example: ["-l eng", "--binarize"]
' Dictionary as JSON formatted string.
' Example: {"binarization": true}
'''
service_args = db.Column(db.String(255))
service_version = db.Column(db.String(16))
@ -472,6 +572,12 @@ class Job(HashidMixin, db.Model):
shutil.rmtree(self.path, ignore_errors=True)
db.session.delete(self)
def makedirs(self):
    '''
    Create the job directory and its fixed set of subdirectories.

    Raises an OSError (from os.mkdir) if one of the directories can not be
    created, e.g. because it already exists.
    '''
    os.mkdir(self.path)
    for subdirectory in ('inputs', 'pipeline_data', 'results'):
        os.mkdir(os.path.join(self.path, subdirectory))
def restart(self):
'''
Restart a job - only if the status is complete or failed
@ -479,7 +585,7 @@ class Job(HashidMixin, db.Model):
if self.status not in ['complete', 'failed']:
raise Exception('Could not restart job: status is not "complete/failed"') # noqa
shutil.rmtree(os.path.join(self.path, 'output'), ignore_errors=True)
shutil.rmtree(os.path.join(self.path, 'results'), ignore_errors=True)
shutil.rmtree(os.path.join(self.path, 'pyflow.data'), ignore_errors=True) # noqa
for result in self.results:
db.session.delete(result)
@ -487,6 +593,10 @@ class Job(HashidMixin, db.Model):
self.status = 'submitted'
def to_dict(self, backrefs=False, relationships=False):
service_args = json.loads(self.service_args)
if self.service == 'tesseract-ocr' and 'model' in service_args:
tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model']) # noqa
service_args['model'] = tesseract_ocr_pipeline_model.title
dict_job = {
'id': self.hashid,
'user_id': self.user.hashid,
@ -494,7 +604,7 @@ class Job(HashidMixin, db.Model):
'description': self.description,
'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z', # noqa
'service': self.service,
'service_args': self.service_args,
'service_args': service_args,
'service_version': self.service_version,
'status': self.status,
'title': self.title,
@ -550,7 +660,7 @@ class CorpusFile(FileMixin, HashidMixin, db.Model):
@property
def path(self):
return os.path.join(self.corpus.path, self.filename)
return os.path.join(self.corpus.path, 'files', str(self.id))
@property
def url(self):
@ -659,28 +769,27 @@ class Corpus(HashidMixin, db.Model):
return self.user.hashid
def build(self):
output_dir = os.path.join(self.path, 'merged')
shutil.rmtree(output_dir, ignore_errors=True)
os.mkdir(output_dir)
output_file = os.path.join(output_dir, 'corpus.vrt')
corpus_element = ET.fromstring('<corpus>\n</corpus>')
for corpus_file in self.files:
element_tree = ET.parse(corpus_file.path)
text_node = element_tree.find('text')
text_node.set('address', corpus_file.address or 'NULL')
text_node.set('author', corpus_file.author)
text_node.set('booktitle', corpus_file.booktitle or 'NULL')
text_node.set('chapter', corpus_file.chapter or 'NULL')
text_node.set('editor', corpus_file.editor or 'NULL')
text_node.set('institution', corpus_file.institution or 'NULL')
text_node.set('journal', corpus_file.journal or 'NULL')
text_node.set('pages', corpus_file.pages or 'NULL')
text_node.set('publisher', corpus_file.publisher or 'NULL')
text_node.set('publishing_year', str(corpus_file.publishing_year))
text_node.set('school', corpus_file.school or 'NULL')
text_node.set('title', corpus_file.title)
corpus_element.insert(1, text_node)
ET.ElementTree(corpus_element).write(output_file, encoding='utf-8')
text_element = element_tree.getroot()
text_element.set('address', corpus_file.address or 'NULL')
text_element.set('author', corpus_file.author)
text_element.set('booktitle', corpus_file.booktitle or 'NULL')
text_element.set('chapter', corpus_file.chapter or 'NULL')
text_element.set('editor', corpus_file.editor or 'NULL')
text_element.set('institution', corpus_file.institution or 'NULL')
text_element.set('journal', corpus_file.journal or 'NULL')
text_element.set('pages', corpus_file.pages or 'NULL')
text_element.set('publisher', corpus_file.publisher or 'NULL')
text_element.set('publishing_year', str(corpus_file.publishing_year)) # noqa
text_element.set('school', corpus_file.school or 'NULL')
text_element.set('title', corpus_file.title)
corpus_element.insert(1, text_element)
ET.ElementTree(corpus_element).write(
os.path.join(self.path, 'cwb', 'corpus.vrt'),
encoding='utf-8'
)
self.last_edited_date = datetime.utcnow()
self.status = 'submitted'
@ -688,6 +797,13 @@ class Corpus(HashidMixin, db.Model):
shutil.rmtree(self.path, ignore_errors=True)
db.session.delete(self)
def makedirs(self):
    '''
    Create the corpus directory tree (files, cwb, cwb/data, cwb/registry).

    Raises an OSError (from os.mkdir) if one of the directories can not be
    created, e.g. because it already exists.
    '''
    os.mkdir(self.path)
    # Parents are listed before their children because os.mkdir does not
    # create intermediate directories
    subdirectories = (
        ('files',),
        ('cwb',),
        ('cwb', 'data'),
        ('cwb', 'registry')
    )
    for parts in subdirectories:
        os.mkdir(os.path.join(self.path, *parts))
def to_dict(self, backrefs=False, relationships=False):
dict_corpus = {
'id': self.hashid,

View File

@ -1,77 +1,13 @@
from flask import Blueprint
import os
import yaml
SERVICES = {
'file-setup': {
'name': 'File setup',
'versions': {
'latest': '1.0.0b',
'1.0.0b': {
'publishing_data': {
'date': None,
'title': 'nopaque File setup service',
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/tree/1.0.0b', # noqa
'version': '1.0.0'
}
}
}
},
'nlp': {
'name': 'Natural Language Processing',
'versions': {
'latest': '1.0.0b',
'1.0.0b': {
'check_encoding': True,
'models': {
'de': 'German',
'en': 'English',
'it': 'Italian',
'nl': 'Dutch',
'pl': 'Polish',
'zh': 'Chinese'
},
'publishing_data': {
'date': None,
'title': 'nopaque NLP service',
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/tree/1.0.0b', # noqa
'version': '1.0.0'
}
}
}
},
'ocr': {
'name': 'Optical Character Recognition',
'versions': {
'latest': '1.0.0b',
'1.0.0b': {
'binarization': True,
'models': {
'ara': 'Arabic',
'chi_tra': 'Chinese - Traditional',
'dan': 'Danish',
'eng': 'English',
'enm': 'English, Middle 1100-1500',
'fra': 'French',
'frm': 'French, Middle ca. 1400-1600',
'deu': 'German',
'frk': 'German Fraktur',
'ell': 'Greek, Modern (1453-)',
'ita': 'Italian',
'por': 'Portuguese',
'rus': 'Russian',
'spa': 'Spanish; Castilian',
},
'publishing_data': {
'date': None,
'title': 'nopaque OCR service',
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/tree/1.0.0b', # noqa
'version': '1.0.0'
}
}
}
}
}
services_file = os.path.join(
os.path.dirname(os.path.abspath(__file__)), 'services.yml')
with open(services_file, 'r') as f:
SERVICES = yaml.safe_load(f)
bp = Blueprint('services', __name__)
from . import routes
from . import routes # noqa

View File

@ -1,3 +1,4 @@
from app.models import TesseractOCRModel
from flask_wtf import FlaskForm
from wtforms import (BooleanField, MultipleFileField, SelectField, StringField,
SubmitField, ValidationError)
@ -6,85 +7,105 @@ from . import SERVICES
class AddJobForm(FlaskForm):
    """Base form with the fields shared by every "add job" form.

    Subclasses are expected to fill ``version.choices`` themselves; no
    choices are defined here.
    """

    description = StringField(
        'Description',
        validators=[DataRequired(), Length(min=1, max=255)]
    )
    submit = SubmitField()
    title = StringField(
        'Title',
        validators=[DataRequired(), Length(min=1, max=32)]
    )
    version = SelectField(
        'Version',
        validators=[DataRequired()]
    )
class AddSpacyNLPJobForm(AddJobForm):
    """Form for adding a job that uses the ``spacy-nlp`` service.

    The available models and optional methods are taken from the
    ``SERVICES['spacy-nlp']`` entry of the selected service version.
    """

    encoding_detection = BooleanField('Encoding detection')
    files = MultipleFileField('Files', validators=[DataRequired()])
    model = SelectField(
        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )

    def validate_encoding_detection(self, field):
        """Reject the switch if the chosen version lacks the method.

        Raises:
            ValidationError: encoding detection was requested but is not
                listed in the version's ``methods``.
        """
        service_info = SERVICES['spacy-nlp']['versions'][self.version.data]
        # Optional features are listed under 'methods'; looking the name
        # up in the version dict itself would never find it.
        if field.data and 'encoding_detection' not in service_info.get('methods', ()):  # noqa
            raise ValidationError('Encoding detection is not available')

    def validate_files(form, field):
        """Ensure every uploaded file name ends with an allowed extension.

        Raises:
            ValidationError: a file name has a different extension.
        """
        valid_extensions = ('.txt',)
        for file in field.data:
            if not file.filename.lower().endswith(valid_extensions):
                # The explicit '+' matters: adjacent string literals are
                # merged first, which would turn the message prefix into
                # the join separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Build the form for a service version.

        Args:
            version: optional keyword argument naming the service version
                to build the form for; defaults to the service's
                ``latest_version``. It is removed from ``kwargs`` before
                the parent constructor is called.
        """
        version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version'])  # noqa
        super().__init__(*args, **kwargs)
        service_info = SERVICES['spacy-nlp']['versions'][version]
        # Must use the same method name as validate_encoding_detection and
        # services.yml ('encoding_detection', not 'check_encoding').
        if 'encoding_detection' not in service_info.get('methods', ()):
            self.encoding_detection.render_kw = {'disabled': True}
        # Build a new list instead of extending in place, so a choices
        # list that may be shared between form instances is never grown.
        self.model.choices = (
            list(self.model.choices) + list(service_info['models'].items())
        )
        self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']]  # noqa
        self.version.default = version
class AddTesseractOCRJobForm(AddJobForm):
    """Form for adding a job that uses the ``tesseract-ocr`` service.

    Model choices are built from the ``TesseractOCRModel`` rows in the
    database; optional methods come from the ``SERVICES['tesseract-ocr']``
    entry of the selected service version.
    """

    binarization = BooleanField('Binarization')
    files = MultipleFileField('Files', validators=[DataRequired()])
    model = SelectField(
        'Model',
        choices=[('', 'Choose your option')],
        default='',
        validators=[DataRequired()]
    )

    def validate_binarization(self, field):
        """Reject the switch if the chosen version lacks the method.

        Raises:
            ValidationError: binarization was requested but is not listed
                in the version's ``methods``.
        """
        service_info = SERVICES['tesseract-ocr']['versions'][self.version.data]
        # Optional features are listed under 'methods'; looking the name
        # up in the version dict itself would never find it.
        if field.data and 'binarization' not in service_info.get('methods', ()):  # noqa
            raise ValidationError('Binarization is not available')

    def validate_files(self, field):
        """Ensure every uploaded file name ends with an allowed extension.

        Raises:
            ValidationError: a file name has a different extension.
        """
        valid_extensions = ('.pdf',)
        for file in field.data:
            if not file.filename.lower().endswith(valid_extensions):
                # The explicit '+' matters: adjacent string literals are
                # merged first, which would turn the message prefix into
                # the join separator.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Build the form for a service version.

        Args:
            version: optional keyword argument naming the service version
                to build the form for; defaults to the service's
                ``latest_version``. It is removed from ``kwargs`` before
                the parent constructor is called.
        """
        version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version'])  # noqa
        super().__init__(*args, **kwargs)
        service_info = SERVICES['tesseract-ocr']['versions'][version]
        if 'binarization' not in service_info.get('methods', ()):
            self.binarization.render_kw = {'disabled': True}
        # Build a new list instead of extending in place, so a choices
        # list that may be shared between form instances is never grown.
        self.model.choices = list(self.model.choices) + [
            (x.hashid, x.title) for x in TesseractOCRModel.query.all()
        ]
        self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']]  # noqa
        self.version.data = version
        self.version.default = SERVICES['tesseract-ocr']['latest_version']
class AddFileSetupJobForm(AddJobForm):
    """Form for adding a job that uses the ``file-setup`` service."""

    files = MultipleFileField('Files', validators=[DataRequired()])

    def validate_files(form, field):
        """Ensure every uploaded file name ends with an allowed extension.

        Raises:
            ValidationError: a file name has a different extension.
        """
        valid_extensions = ('.jpeg', '.jpg', '.png', '.tiff', '.tif')
        for file in field.data:
            if not file.filename.lower().endswith(valid_extensions):
                # The explicit '+' matters: adjacent string literals are
                # merged first, which would turn the message prefix into
                # the join separator and repeat it between extensions.
                raise ValidationError(
                    'File does not have an approved extension: '
                    + '/'.join(valid_extensions)
                )

    def __init__(self, *args, **kwargs):
        """Build the form for a service version.

        Args:
            version: optional keyword argument naming the service version
                to build the form for; defaults to the service's
                ``latest_version``. It is removed from ``kwargs`` before
                the parent constructor is called.
        """
        version = kwargs.pop('version', SERVICES['file-setup']['latest_version'])  # noqa
        super().__init__(*args, **kwargs)
        self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']]  # noqa
        self.version.data = version
        self.version.default = SERVICES['file-setup']['latest_version']
# Maps a service identifier (the keys used in SERVICES) to the form class
# used for adding a job for that service. The former 'ocr' and 'nlp'
# entries are gone because their form classes were renamed.
AddJobForms = {
    'file-setup': AddFileSetupJobForm,
    'tesseract-ocr': AddTesseractOCRJobForm,
    'spacy-nlp': AddSpacyNLPJobForm
}

View File

@ -1,3 +1,4 @@
from app import hashids
from flask import (abort, current_app, flash, make_response, render_template,
request, url_for)
from flask_login import current_user, login_required
@ -8,7 +9,6 @@ from .. import db
from .forms import AddJobForms
from ..models import Job, JobInput
import json
import os
@bp.route('/corpus-analysis')
@ -24,57 +24,65 @@ def service(service):
# Check if the requested service exist
if service not in SERVICES or service not in AddJobForms:
abort(404)
version = request.args.get(
'version', SERVICES[service]['versions']['latest'])
version = request.args.get('version', SERVICES[service]['latest_version'])
if version not in SERVICES[service]['versions']:
abort(404)
form = AddJobForms[service](prefix='add-job-form', version=version)
form.version.data = version
title = SERVICES[service]['name']
versions = SERVICES[service]['versions']
if form.is_submitted():
if not form.validate():
return make_response(form.errors, 400)
service_args = []
if service == 'nlp':
service_args.append(f'-l {form.language.data}')
if form.check_encoding.data:
service_args.append('--check-encoding')
if service == 'ocr':
service_args.append(f'-l {form.language.data}')
service_args = {}
if service == 'spacy-nlp':
service_args['model'] = form.model.data
if form.encoding_detection.data:
service_args['encoding_detection'] = True
if service == 'tesseract-ocr':
service_args['model'] = hashids.decode(form.model.data)
if form.binarization.data:
service_args.append('--binarize')
job = Job(user=current_user,
description=form.description.data,
service=service, service_args=json.dumps(service_args),
service_version=form.version.data,
status='preparing', title=form.title.data)
service_args['binarization'] = True
job = Job(
user=current_user,
description=form.description.data,
service=service,
service_args=json.dumps(service_args),
service_version=form.version.data,
status='preparing',
title=form.title.data
)
db.session.add(job)
db.session.flush()
db.session.flush(objects=[job])
db.session.refresh(job)
try:
os.makedirs(job.path)
except OSError:
current_app.logger.error(f'Make dir {job.path} led to an OSError!')
job.makedirs()
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
flash('Internal Server Error', 'error')
return make_response(
{'redirect_url': url_for('.service', service=service)}, 500)
else:
for file in form.files.data:
filename = secure_filename(file.filename)
job_input = JobInput(
filename=filename, job=job, mimetype=file.mimetype)
return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa
for file in form.files.data:
filename = secure_filename(file.filename)
job_input = JobInput(
filename=filename,
job=job,
mimetype=file.mimetype
)
db.session.add(job_input)
db.session.flush(objects=[job_input])
db.session.refresh(job_input)
try:
file.save(job_input.path)
db.session.add(job_input)
job.status = 'submitted'
db.session.commit()
flash(f'Job "{job.title}" added', 'job')
return make_response(
{'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
flash('Internal Server Error', 'error')
return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa
job.status = 'submitted'
db.session.commit()
flash(f'Job "{job.title}" added', 'job')
return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa
return render_template(
f'services/{service.replace("-", "_")}.html.j2',
form=form,
title=title,
versions=versions
title=title
)

38
app/services/services.yml Normal file
View File

@ -0,0 +1,38 @@
# TODO: This could also be done via GitLab/GitHub APIs
#file-setup-pipeline:
file-setup:
name: 'File setup pipeline'
latest_version: '0.1.0'
versions:
0.1.0:
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0'
#spacy-nlp-pipeline:
spacy-nlp:
name: 'spaCy NLP'
latest_version: '0.1.0'
versions:
0.1.0:
methods:
- 'encoding_detection'
models:
de: 'German'
en: 'English'
it: 'Italian'
pl: 'Polish'
zh: 'Chinese'
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0'
#tesseract-ocr-pipeline:
tesseract-ocr:
name: 'Tesseract OCR'
latest_version: '0.1.0'
versions:
0.1.0:
methods:
- 'binarization'
publisher: 'Bielefeld University - CRC 1288 - INF'
publishing_year: 2022
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0'

View File

@ -50,8 +50,8 @@ h1 .nopaque-icons, h2 .nopaque-icons, h3 .nopaque-icons, h4 .nopaque-icons,
}
.nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";}
.nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";}
.nopaque-icons.service-icon[data-service="nlp"]:empty:before {content: "G";}
.nopaque-icons.service-icon[data-service="ocr"]:empty:before {content: "F";}
.nopaque-icons.service-icon[data-service="spacy-nlp"]:empty:before {content: "G";}
.nopaque-icons.service-icon[data-service="tesseract-ocr"]:empty:before {content: "F";}
.status-text[data-status]:empty:before {content: attr(data-status);}

View File

@ -53,7 +53,7 @@ class CorpusAnalysisApp {
this.data.cQiClient = new CQiClient(this.settings.corpusId);
this.data.cQiClient.connect()
.then(cQiStatus => {
return this.data.cQiClient.corpora.get('CORPUS');
return this.data.cQiClient.corpora.get(`NOPAQUE_${this.settings.corpusId}`);
})
.then(
cQiCorpus => {

View File

@ -100,7 +100,10 @@ class JobDisplay extends RessourceDisplay {
}
setServiceArgs(serviceArgs) {
this.setElements(this.displayElement.querySelectorAll('.job-service-args'), serviceArgs);
this.setElements(
this.displayElement.querySelectorAll('.job-service-args'),
JSON.stringify(serviceArgs)
);
}
setServiceVersion(serviceVersion) {

View File

@ -10,25 +10,10 @@ class JobResultList extends RessourceList {
</tr>
`.trim(),
ressourceMapper: jobResult => {
let description;
if (jobResult.filename.endsWith('.pdf.zip')) {
description = 'PDF files with text layer';
} else if (jobResult.filename.endsWith('.txt.zip')) {
description = 'Raw text files';
} else if (jobResult.filename.endsWith('.vrt.zip')) {
description = 'VRT compliant files including the NLP data';
} else if (jobResult.filename.endsWith('.xml.zip')) {
description = 'TEI compliant files';
} else if (jobResult.filename.endsWith('.poco.zip')) {
description = 'HOCR and image files for post correction (PoCo)';
} else {
description = 'All result files created during this job';
}
return {
id: jobResult.id,
creationDate: jobResult.creation_date,
description: description,
description: jobResult.description,
filename: jobResult.filename
};
},

View File

@ -19,12 +19,12 @@
'darken': '#a1b300',
'lighten': '#f2f3e1'
},
'nlp': {
'spacy-nlp': {
'base': '#98acd2',
'darken': '#0064a3',
'lighten': '#e5e8f5'
},
'ocr': {
'tesseract-ocr': {
'base': '#a9d8c8',
'darken': '#00a58b',
'lighten': '#e7f4f1'

View File

@ -15,8 +15,8 @@
<li><div class="divider"></div></li>
<li><a class="subheader">Processes & Services</a></li>
<li class="service-color service-color-border border-darken" data-service="file-setup" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='file-setup') }}"><i class="nopaque-icons service-icon" data-service="file-setup"></i>File setup</a></li>
<li class="service-color service-color-border border-darken" data-service="ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='ocr') }}"><i class="nopaque-icons service-icon" data-service="ocr"></i>OCR</a></li>
<li class="service-color service-color-border border-darken" data-service="nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='nlp') }}"><i class="nopaque-icons service-icon" data-service="nlp"></i>NLP</a></li>
<li class="service-color service-color-border border-darken" data-service="tesseract-ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='tesseract-ocr') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr"></i>OCR</a></li>
<li class="service-color service-color-border border-darken" data-service="spacy-nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='spacy-nlp') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp"></i>NLP</a></li>
<li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='corpus-analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li>
<li><div class="divider"></div></li>
<li><a class="subheader">Account</a></li>
@ -28,6 +28,9 @@
{% if current_user.can(Permission.ADMINISTRATE) %}
<li><a href="{{ url_for('admin.index') }}"><i class="material-icons">admin_panel_settings</i>Administration</a></li>
{% endif %}
{% if current_user.can(Permission.CONTRIBUTE) %}
<li><a href="{{ url_for('contribute.index') }}"><i class="material-icons">new_label</i>Contribute</a></li>
{% endif %}
{% if current_user.can(Permission.USE_API) %}
<li><a href="{{ url_for('api.doc') }}"><i class="material-icons">api</i>API</a></li>
{% endif %}

View File

@ -120,32 +120,32 @@
</a>
<br><br>
<p class="service-color-text darken" data-service="file-setup"><b>File setup</b></p>
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p>
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.</p>
<a href="{{ url_for('services.service', service='file-setup') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup">Create Job</a>
</div>
</div>
<div class="col s12 m4">
<div class="card-panel center-align hoverable">
<br>
<a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
<i class="nopaque-icons service-color darken service-icon" data-service="ocr" style="font-size: 2.5rem;"></i>
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr" style="font-size: 2.5rem;"></i>
</a>
<br><br>
<p class="service-color-text darken" data-service="ocr"><b>Optical Character Recognition</b></p>
<p class="service-color-text darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
<p class="light">nopaque converts your image data like photos or scans into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.</p>
<a href="{{ url_for('services.service', service='ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="ocr">Create Job</a>
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr">Create Job</a>
</div>
</div>
<div class="col s12 m4">
<div class="card-panel center-align hoverable">
<br>
<a href="{{ url_for('services.service', service='nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
<i class="nopaque-icons service-color darken service-icon" data-service="nlp" style="font-size: 2.5rem;"></i>
<a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp" style="font-size: 2.5rem;"></i>
</a>
<br><br>
<p class="service-color-text darken" data-service="nlp"><b>Natural Language Processing</b></p>
<p class="service-color-text darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p>
<p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p>
<a href="{{ url_for('services.service', service='nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="nlp">Create Job</a>
<a href="{{ url_for('services.service', service='spacy-nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="spacy-nlp">Create Job</a>
</div>
</div>
</div>

View File

@ -84,11 +84,11 @@
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p>
</div>
<div class="col s12 m6 l3 center-align">
<a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i>
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
</a>
<br><br>
<p class="service-color-text text-darken" data-service="ocr"><b>Optical Character Recognition</b></p>
<p class="service-color-text text-darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
<p class="light">nopaque converts your image data like photos or scans into text data through OCR making it machine readable. This step enables you to proceed with further computational analysis of your documents.</p>
</div>
<div class="col s12 m6 l3 center-align">

View File

@ -2,7 +2,7 @@
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
{% import "materialize/wtf.html.j2" as wtf %}
{% block main_attribs %} class="service-scheme" data-service="nlp"{% endblock main_attribs %}
{% block main_attribs %} class="service-scheme" data-service="spacy-nlp"{% endblock main_attribs %}
{% block page_content %}
<div class="container">
@ -16,13 +16,13 @@
<p class="hide-on-small-only">&nbsp;</p>
<p class="hide-on-small-only">&nbsp;</p>
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="nlp"></i>
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i>
</a>
</div>
</div>
<div class="col s12 m9 pull-m3">
<div class="card service-color-border border-darken" data-service="nlp" style="border-top: 10px solid;">
<div class="card service-color-border border-darken" data-service="spacy-nlp" style="border-top: 10px solid;">
<div class="card-content">
<div class="row">
<div class="col s12 m6">
@ -71,7 +71,7 @@
{{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }}
</div>
<div class="col s12 l4">
{{ wtf.render_field(form.language, material_icon='language') }}
{{ wtf.render_field(form.model, material_icon='language') }}
</div>
<div class="col s12 l3">
{{ wtf.render_field(form.version, material_icon='apps') }}
@ -80,13 +80,13 @@
<span class="card-title">Preprocessing</span>
</div>
<div class="col s9">
<p>{{ form.check_encoding.label.text }}</p>
<p>{{ form.encoding_detection.label.text }}</p>
<p class="light">If the input files are not created with the nopaque OCR service or you do not know if your text files are UTF-8 encoded, check this switch. We will try to automatically determine the right encoding for your texts to process them.</p>
</div>
<div class="col s3 right-align">
<div class="switch">
<label>
{{ form.check_encoding() }}
{{ form.encoding_detection() }}
<span class="lever"></span>
</label>
</div>

View File

@ -2,7 +2,7 @@
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
{% import "materialize/wtf.html.j2" as wtf %}
{% block main_attribs %} class="service-scheme" data-service="ocr"{% endblock main_attribs %}
{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr"{% endblock main_attribs %}
{% block page_content %}
<div class="container">
@ -16,13 +16,13 @@
<p class="hide-on-small-only">&nbsp;</p>
<p class="hide-on-small-only">&nbsp;</p>
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
<i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i>
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
</a>
</div>
</div>
<div class="col s12 m9 pull-m3">
<div class="card service-color-border border-darken" data-service="ocr" style="border-top: 10px solid;">
<div class="card service-color-border border-darken" data-service="tesseract-ocr" style="border-top: 10px solid;">
<div class="card-content">
<div class="row">
<div class="col s12">
@ -50,10 +50,10 @@
{{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
</div>
<div class="col s12 l5">
{{ wtf.render_field(form.files, accept='application/pdf', color=ocr_color_darken, placeholder='Choose your .pdf files') }}
{{ wtf.render_field(form.files, accept='application/pdf', placeholder='Choose your .pdf files') }}
</div>
<div class="col s12 l4">
{{ wtf.render_field(form.language, material_icon='language') }}
{{ wtf.render_field(form.model, material_icon='language') }}
</div>
<div class="col s12 l3">
{{ wtf.render_field(form.version, material_icon='apps') }}
@ -127,7 +127,7 @@
</div>
</div>
<div class="card-action right-align">
{{ wtf.render_field(form.submit, color=ocr_color_darken, material_icon='send') }}
{{ wtf.render_field(form.submit, material_icon='send') }}
</div>
</form>
</div>

View File

@ -1,10 +0,0 @@
from app import hashids
from werkzeug.routing import BaseConverter
class HashidConverter(BaseConverter):
def to_python(self, value: str) -> int:
return hashids.decode(value)[0]
def to_url(self, value: int) -> str:
return hashids.encode(value)

View File

@ -5,14 +5,14 @@
version: "3.5"
networks:
reverse-proxy:
external:
name: reverse-proxy
traefik:
external: true
name: "traefik"
services:
nopaque:
labels:
- "traefik.docker.network=reverse-proxy"
- "traefik.docker.network=traefik"
- "traefik.enable=true"
### <http> ###
- "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http"

View File

@ -0,0 +1,45 @@
"""empty message
Revision ID: ad0d835fe5b1
Revises: 68ed092ffe5e
Create Date: 2022-01-18 16:23:45.673993
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = 'ad0d835fe5b1'
down_revision = '68ed092ffe5e'
branch_labels = None
depends_on = None
def upgrade():
    """Create ``tesseract_ocr_models`` and add ``job_results.description``."""
    model_columns = (
        sa.Column('creation_date', sa.DateTime(), nullable=True),
        sa.Column('filename', sa.String(length=255), nullable=True),
        sa.Column('last_edited_date', sa.DateTime(), nullable=True),
        sa.Column('mimetype', sa.String(length=255), nullable=True),
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('user_id', sa.Integer(), nullable=True),
        sa.Column('compatible_service_versions', sa.String(length=255), nullable=True),  # noqa
        sa.Column('description', sa.String(length=255), nullable=True),
        sa.Column('publisher', sa.String(length=128), nullable=True),
        sa.Column('publishing_year', sa.Integer(), nullable=True),
        sa.Column('title', sa.String(length=64), nullable=True),
        sa.Column('version', sa.String(length=16), nullable=True),
    )
    model_constraints = (
        sa.ForeignKeyConstraint(['user_id'], ['users.id']),
        sa.PrimaryKeyConstraint('id'),
    )
    # Columns first, constraints last: the same positional order in which
    # the schema items were passed before.
    op.create_table(
        'tesseract_ocr_models', *model_columns, *model_constraints)

    description_column = sa.Column(
        'description', sa.String(length=255), nullable=True)
    op.add_column('job_results', description_column)
def downgrade():
    """Undo ``upgrade``: drop the added column, then the created table."""
    results_table, added_column = 'job_results', 'description'
    op.drop_column(results_table, added_column)
    op.drop_table('tesseract_ocr_models')

View File

@ -3,10 +3,9 @@
import eventlet
eventlet.monkey_patch()
from app import db, cli, create_app # noqa
from app.models import (Corpus, CorpusFile, Job, JobInput, JobResult,
Permission, QueryResult, Role, User) # noqa
Permission, QueryResult, Role, TesseractOCRModel, User) # noqa
from app import db, cli, create_app # noqa
from flask import Flask # noqa
from typing import Any, Dict # noqa
@ -34,5 +33,6 @@ def make_shell_context() -> Dict[str, Any]:
'Permission': Permission,
'QueryResult': QueryResult,
'Role': Role,
'TesseractOCRModel': TesseractOCRModel,
'User': User
}

View File

@ -19,5 +19,7 @@ hiredis
jsonschema
psycopg2
python-dotenv
pyyaml
redis
tqdm
wtforms[email]