mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2024-12-24 18:34:18 +00:00
Big update, corpus analysis reworked, versioned services, preliminary work for contributions
This commit is contained in:
parent
0647537192
commit
fe938c0ca2
816
app/TesseractOCRModel.defaults.yml
Normal file
816
app/TesseractOCRModel.defaults.yml
Normal file
@ -0,0 +1,816 @@
|
||||
# - title: 'Afrikaans'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/afr.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Amharic'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
- title: 'Arabic'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
|
||||
# - title: 'Assamese'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Azerbaijani'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Azerbaijani - Cyrillic'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Belarusian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Bengali'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Tibetan'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Bosnian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Bulgarian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Catalan; Valencian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Cebuano'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Czech'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Chinese - Simplified'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
- title: 'Chinese - Traditional'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
|
||||
# - title: 'Cherokee'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Welsh'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
- title: 'Danish'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
- title: 'German'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
|
||||
# - title: 'Dzongkha'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
- title: 'Greek, Modern (1453-)'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
- title: 'English'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
- title: 'English, Middle (1100-1500)'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
|
||||
# - title: 'Esperanto'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Estonian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Basque'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Persian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Finnish'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
- title: 'French'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
- title: 'German Fraktur'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
- title: 'French, Middle (ca. 1400-1600)'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
|
||||
# - title: 'Irish'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Galician'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
- title: 'Greek, Ancient (-1453)'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
|
||||
# - title: 'Gujarati'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Haitian; Haitian Creole'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Hebrew'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Hindi'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Croatian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Hungarian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Inuktitut'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Indonesian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Icelandic'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
- title: 'Italian'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
- title: 'Italian - Old'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
|
||||
# - title: 'Javanese'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Japanese'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Kannada'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Georgian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Georgian - Old'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Kazakh'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Central Khmer'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Kirghiz; Kyrgyz'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Korean'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Kurdish'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Lao'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Latin'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Latvian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Lithuanian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Malayalam'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Marathi'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Macedonian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Maltese'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Malay'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Burmese'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Nepali'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Dutch; Flemish'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Norwegian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Oriya'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Panjabi; Punjabi'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Polish'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
- title: 'Portuguese'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
|
||||
# - title: 'Pushto; Pashto'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Romanian; Moldavian; Moldovan'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
- title: 'Russian'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
|
||||
# - title: 'Sanskrit'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Sinhala; Sinhalese'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Slovak'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Slovenian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
- title: 'Spanish; Castilian'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
- title: 'Spanish; Castilian - Old'
  description: ''
  url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata'
  publisher: 'tesseract-ocr'
  publishing_year: 2021
  version: '4.1.0'
  compatible_service_versions:
    - '0.1.0'
|
||||
# - title: 'Albanian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Serbian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Serbian - Latin'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Swahili'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Swedish'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Syriac'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Tamil'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Telugu'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Tajik'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Tagalog'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Thai'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Tigrinya'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Turkish'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Uighur; Uyghur'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Ukrainian'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Urdu'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Uzbek'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Uzbek - Cyrillic'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Vietnamese'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
||||
# - title: 'Yiddish'
|
||||
# description: ''
|
||||
# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata'
|
||||
# publisher: 'tesseract-ocr'
|
||||
# publishing_year: 2021
|
||||
# version: '4.1.0'
|
||||
# compatible_service_versions:
|
||||
# - '0.1.0'
|
@ -39,9 +39,6 @@ def create_app(config: Config = Config) -> Flask:
|
||||
socketio.init_app(
|
||||
app, message_queue=app.config['NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI'])
|
||||
|
||||
# from .utils import HashidConverter
|
||||
# app.url_map.converters['hashid'] = HashidConverter
|
||||
|
||||
from .events import socketio as socketio_events
|
||||
from .events import sqlalchemy as sqlalchemy_events
|
||||
|
||||
@ -54,6 +51,9 @@ def create_app(config: Config = Config) -> Flask:
|
||||
from .auth import bp as auth_blueprint
|
||||
app.register_blueprint(auth_blueprint, url_prefix='/auth')
|
||||
|
||||
from .contribute import bp as contribute_blueprint
|
||||
app.register_blueprint(contribute_blueprint, url_prefix='/contribute')
|
||||
|
||||
from .corpora import bp as corpora_blueprint
|
||||
app.register_blueprint(corpora_blueprint, url_prefix='/corpora')
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
from flask import Blueprint
|
||||
from flask_restx import Api
|
||||
|
||||
from .jobs import ns as jobs_ns
|
||||
from .tokens import ns as tokens_ns
|
||||
|
||||
bp = Blueprint('api', __name__)
|
||||
@ -23,5 +22,4 @@ api = Api(
|
||||
version='1.0'
|
||||
)
|
||||
|
||||
api.add_namespace(jobs_ns)
|
||||
api.add_namespace(tokens_ns)
|
||||
|
@ -9,8 +9,12 @@ token_auth = HTTPTokenAuth()
|
||||
|
||||
@basic_auth.verify_password
def verify_password(email_or_username, password):
    """Basic-auth callback: resolve a user by username or email and check
    the password.

    Returns the matching ``User`` on success; an implicit ``None`` return
    signals authentication failure to Flask-HTTPAuth.
    """
    # The diff residue left two consecutive assignments (the old one-line
    # query immediately overwritten by the reformatted one); keep only the
    # reformatted query so the database is hit once.
    # NOTE(review): the email comparison lowercases the input — assumes
    # emails are stored lowercase; confirm against the registration code.
    user = User.query.filter(
        or_(
            User.username == email_or_username,
            User.email == email_or_username.lower()
        )
    ).first()
    if user and user.verify_password(password):
        return user
|
||||
|
||||
|
@ -1,48 +0,0 @@
|
||||
from flask_restx import Namespace, Resource
|
||||
from .auth import token_auth
|
||||
from ..jobs import tasks
|
||||
from ..models import Job
|
||||
|
||||
|
||||
ns = Namespace('jobs', description='Job operations')
|
||||
|
||||
|
||||
@ns.route('')
class API_Jobs(Resource):
    '''Shows a list of all jobs and lets you POST to add new job'''

    @ns.doc(security='apiKey')
    @token_auth.login_required
    def get(self):
        '''List all jobs'''
        # TODO: Implement the correct get_jobs functionality
        jobs = Job.query.all()
        return [j.to_dict(include_relationships=False) for j in jobs]

    @ns.doc(security='apiKey')
    @token_auth.login_required
    def post(self):
        '''Create a new job'''
        # TODO: Implement this
        pass
|
||||
|
||||
|
||||
@ns.route('/<hashid:id>')
|
||||
class API_Job(Resource):
|
||||
'''Show a single job and lets you delete it'''
|
||||
|
||||
@ns.doc(security='apiKey')
|
||||
@token_auth.login_required
|
||||
def get(self, id):
|
||||
'''Get a job by id'''
|
||||
job = Job.query.get_or_404(id)
|
||||
return job.to_dict(include_relationships=False)
|
||||
|
||||
@ns.doc(security='apiKey')
|
||||
@token_auth.login_required
|
||||
def delete(self, id):
|
||||
'''Delete a job by id'''
|
||||
job = Job.query.get_or_404(id)
|
||||
# We use this imported task because it will run in the background
|
||||
tasks.delete_job(job.id)
|
||||
return '', 204
|
@ -60,28 +60,37 @@ def register():
|
||||
return redirect(url_for('main.dashboard'))
|
||||
form = RegistrationForm(prefix='registration-form')
|
||||
if form.validate_on_submit():
|
||||
user = User(email=form.email.data.lower(),
|
||||
password=form.password.data,
|
||||
username=form.username.data)
|
||||
user = User(
|
||||
email=form.email.data.lower(),
|
||||
password=form.password.data,
|
||||
username=form.username.data
|
||||
)
|
||||
db.session.add(user)
|
||||
db.session.commit()
|
||||
db.session.flush(objects=[user])
|
||||
db.session.refresh(user)
|
||||
try:
|
||||
os.makedirs(user.path)
|
||||
except OSError:
|
||||
current_app.logger.error(
|
||||
f'Make dir {user.path} led to an OSError!')
|
||||
db.session.delete(user)
|
||||
db.session.commit()
|
||||
user.makedirs()
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
abort(500)
|
||||
else:
|
||||
token = user.generate_confirmation_token()
|
||||
msg = create_message(user.email, 'Confirm Your Account',
|
||||
'auth/email/confirm', token=token, user=user)
|
||||
msg = create_message(
|
||||
user.email,
|
||||
'Confirm Your Account',
|
||||
'auth/email/confirm',
|
||||
token=token,
|
||||
user=user
|
||||
)
|
||||
send(msg)
|
||||
flash('A confirmation email has been sent to you by email.')
|
||||
return redirect(url_for('.login'))
|
||||
return render_template('auth/register.html.j2', form=form,
|
||||
title='Register')
|
||||
return render_template(
|
||||
'auth/register.html.j2',
|
||||
form=form,
|
||||
title='Register'
|
||||
)
|
||||
|
||||
|
||||
@bp.route('/confirm/<token>')
|
||||
|
88
app/cli.py
88
app/cli.py
@ -1,16 +1,44 @@
|
||||
from . import db
|
||||
from .models import Corpus, Role
|
||||
from flask import current_app
|
||||
from flask_migrate import upgrade
|
||||
from . import db
|
||||
from .models import Corpus, Job, Role, User, TesseractOCRModel
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
def _make_default_dirs():
|
||||
base_dir = current_app.config['NOPAQUE_DATA_DIR']
|
||||
|
||||
default_directories = [
|
||||
os.path.join(base_dir, 'tmp'),
|
||||
os.path.join(base_dir, 'users')
|
||||
]
|
||||
for directory in default_directories:
|
||||
if os.path.exists(directory):
|
||||
if not os.path.isdir(directory):
|
||||
raise NotADirectoryError(f'{directory} is not a directory')
|
||||
else:
|
||||
os.mkdir(directory)
|
||||
|
||||
|
||||
def register(app):
|
||||
@app.cli.command()
|
||||
def deploy():
|
||||
''' Run deployment tasks. '''
|
||||
# Make default directories
|
||||
_make_default_dirs()
|
||||
|
||||
# migrate database to latest revision
|
||||
upgrade()
|
||||
# create or update user roles
|
||||
Role.insert_roles()
|
||||
|
||||
# Insert/Update default database values
|
||||
current_app.logger.info('Insert/Update default roles')
|
||||
Role.insert_defaults()
|
||||
current_app.logger.info('Insert/Update default users')
|
||||
User.insert_defaults()
|
||||
current_app.logger.info('Insert/Update default tesseract ocr models')
|
||||
TesseractOCRModel.insert_defaults()
|
||||
|
||||
@app.cli.group()
|
||||
def daemon():
|
||||
@ -40,3 +68,55 @@ def register(app):
|
||||
from unittest.suite import TestSuite
|
||||
tests: TestSuite = TestLoader().discover('tests')
|
||||
TextTestRunner(verbosity=2).run(tests)
|
||||
|
||||
@app.cli.group()
|
||||
def convert():
|
||||
''' Datebase convert commands. '''
|
||||
|
||||
@convert.command()
|
||||
def nlp_jobs():
|
||||
for job in Job.query.filter_by(service='nlp').all():
|
||||
job.service = 'spacy-nlp'
|
||||
service_args = json.loads(job.service_args)
|
||||
new_service_args = {}
|
||||
for service_arg in service_args:
|
||||
if service_arg == '--check-encoding':
|
||||
new_service_args['encoding_detection'] = True
|
||||
elif re.match(r'-l ([a-z]{2})', service_arg):
|
||||
language_code = re.search(r'-l ([a-z]{2})', service_arg).group(1) # noqa
|
||||
new_service_args['language'] = language_code
|
||||
job.service_args = json.dumps(new_service_args)
|
||||
db.session.commit()
|
||||
|
||||
@convert.command()
|
||||
def ocr_jobs():
|
||||
# Language code to TesseractOCRModel.title lookup
|
||||
language_code_lookup = {
|
||||
'ara': 'Arabic',
|
||||
'chi_tra': 'Chinese - Traditional',
|
||||
'dan': 'Danish',
|
||||
'eng': 'English',
|
||||
'enm': 'English, Middle (1100-1500)',
|
||||
'fra': 'French',
|
||||
'frm': 'French, Middle (ca. 1400-1600)',
|
||||
'deu': 'German',
|
||||
'frk': 'German Fraktur',
|
||||
'ell': 'Greek, Modern (1453-)',
|
||||
'ita': 'Italian',
|
||||
'por': 'Portuguese',
|
||||
'rus': 'Russian',
|
||||
'spa': 'Spanish; Castilian'
|
||||
}
|
||||
for job in Job.query.filter_by(service='ocr').all():
|
||||
job.service = 'tesseract-ocr'
|
||||
service_args = json.loads(job.service_args)
|
||||
new_service_args = {}
|
||||
for service_arg in service_args:
|
||||
if service_arg == '--binarize':
|
||||
new_service_args['binarization'] = True
|
||||
elif re.match(r'-l ([a-z]{3})', service_arg):
|
||||
language_code = re.search(r'-l ([a-z]{3})', service_arg).group(1) # noqa
|
||||
tesseract_ocr_model = TesseractOCRModel.query.filter_by(title=language_code_lookup[language_code]).first() # noqa
|
||||
new_service_args['model'] = tesseract_ocr_model.id
|
||||
job.service_args = json.dumps(new_service_args)
|
||||
db.session.commit()
|
||||
|
5
app/contribute/__init__.py
Normal file
5
app/contribute/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
from flask import Blueprint
|
||||
|
||||
|
||||
bp = Blueprint('contribute', __name__)
|
||||
from . import routes
|
19
app/contribute/routes.py
Normal file
19
app/contribute/routes.py
Normal file
@ -0,0 +1,19 @@
|
||||
from flask import flash, redirect, render_template, url_for
|
||||
from flask_login import login_required
|
||||
from . import bp
|
||||
from .. import db
|
||||
from ..decorators import permission_required
|
||||
from ..models import Permission, Role, User
|
||||
from ..settings import tasks as settings_tasks
|
||||
|
||||
|
||||
@bp.before_request
|
||||
@login_required
|
||||
@permission_required(Permission.CONTRIBUTE)
|
||||
def before_request():
|
||||
pass
|
||||
|
||||
|
||||
@bp.route('/')
|
||||
def index():
|
||||
pass
|
@ -93,12 +93,12 @@ def connect(auth):
|
||||
|
||||
@socketio.on('disconnect', namespace=NAMESPACE)
|
||||
def disconnect():
|
||||
if 'd' not in session:
|
||||
return
|
||||
session['d']['cqi_client_lock'].acquire()
|
||||
try:
|
||||
session['d']['cqi_client'].disconnect()
|
||||
except cqi.errors.CQiException:
|
||||
pass
|
||||
except BrokenPipeError:
|
||||
except (BrokenPipeError, cqi.errors.CQiException):
|
||||
pass
|
||||
session['d']['cqi_client_lock'].release()
|
||||
corpus = Corpus.query.get(session['d']['corpus_id'])
|
||||
|
@ -12,7 +12,10 @@ def cqi_over_socketio(f):
|
||||
f_args = {}
|
||||
# Check for missing args and if all provided args are of the right type
|
||||
for param in signature(f).parameters.values():
|
||||
if param.annotation == cqi.CQiClient:
|
||||
if param.name == 'corpus_name':
|
||||
f_args[param.name] = f'NOPAQUE_{session["d"]["corpus_id"]}'
|
||||
continue
|
||||
if param.name == 'cqi_client':
|
||||
f_args[param.name] = session['d']['cqi_client']
|
||||
continue
|
||||
if param.default is param.empty:
|
||||
|
@ -1,6 +1,7 @@
|
||||
from flask import (abort, current_app, flash, make_response, redirect,
|
||||
render_template, url_for, send_from_directory)
|
||||
from flask_login import current_user, login_required
|
||||
from werkzeug.utils import secure_filename
|
||||
from . import bp
|
||||
from . import tasks
|
||||
from .forms import (AddCorpusFileForm, AddCorpusForm, EditCorpusFileForm,
|
||||
@ -29,18 +30,20 @@ def add_corpus():
|
||||
db.session.flush()
|
||||
db.session.refresh(corpus)
|
||||
try:
|
||||
os.makedirs(corpus.path)
|
||||
corpus.makedirs()
|
||||
except OSError as e:
|
||||
current_app.logger.error(f'Could not add corpus: {e}')
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
flash('Internal Server Error', 'error')
|
||||
abort(500)
|
||||
else:
|
||||
db.session.commit()
|
||||
flash(f'Corpus "{corpus.title}" added!', 'corpus')
|
||||
return redirect(url_for('.corpus', corpus_id=corpus.id))
|
||||
return render_template('corpora/add_corpus.html.j2', form=form,
|
||||
title='Add corpus')
|
||||
db.session.commit()
|
||||
flash(f'Corpus "{corpus.title}" added', 'corpus')
|
||||
return redirect(url_for('.corpus', corpus_id=corpus.id))
|
||||
return render_template(
|
||||
'corpora/add_corpus.html.j2',
|
||||
form=form,
|
||||
title='Add corpus'
|
||||
)
|
||||
|
||||
|
||||
@bp.route('/import', methods=['GET', 'POST'])
|
||||
@ -174,7 +177,7 @@ def add_corpus_file(corpus_id):
|
||||
if not form.validate():
|
||||
return make_response(form.errors, 400)
|
||||
# Save the file
|
||||
form.file.data.save(os.path.join(corpus.path, form.file.data.filename))
|
||||
filename = secure_filename(form.file.data.filename)
|
||||
corpus_file = CorpusFile(
|
||||
address=form.address.data,
|
||||
author=form.author.data,
|
||||
@ -182,9 +185,10 @@ def add_corpus_file(corpus_id):
|
||||
chapter=form.chapter.data,
|
||||
corpus=corpus,
|
||||
editor=form.editor.data,
|
||||
filename=form.file.data.filename,
|
||||
filename=filename,
|
||||
institution=form.institution.data,
|
||||
journal=form.journal.data,
|
||||
mimetype='application/vrt+xml',
|
||||
pages=form.pages.data,
|
||||
publisher=form.publisher.data,
|
||||
publishing_year=form.publishing_year.data,
|
||||
@ -192,12 +196,25 @@ def add_corpus_file(corpus_id):
|
||||
title=form.title.data
|
||||
)
|
||||
db.session.add(corpus_file)
|
||||
db.session.flush(objects=[corpus_file])
|
||||
db.session.refresh(corpus_file)
|
||||
try:
|
||||
form.file.data.save(corpus_file.path)
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
flash('Internal Server Error', 'error')
|
||||
return make_response({'redirect_url': url_for('.add_corpus_file', corpus_id=corpus.id)}, 500) # noqa
|
||||
corpus.status = 'unprepared'
|
||||
db.session.commit()
|
||||
flash(f'Corpus file "{corpus_file.filename}" added!', 'corpus')
|
||||
flash(f'Corpus file "{corpus_file.title}" added!', 'corpus')
|
||||
return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201) # noqa
|
||||
return render_template('corpora/add_corpus_file.html.j2', corpus=corpus,
|
||||
form=form, title='Add corpus file')
|
||||
return render_template(
|
||||
'corpora/add_corpus_file.html.j2',
|
||||
corpus=corpus,
|
||||
form=form,
|
||||
title='Add corpus file'
|
||||
)
|
||||
|
||||
|
||||
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/delete')
|
||||
|
@ -17,11 +17,7 @@ class Daemon(CheckCorporaMixin, CheckJobsMixin):
|
||||
|
||||
def run(self):
|
||||
while True:
|
||||
try:
|
||||
self.check_corpora()
|
||||
self.check_jobs()
|
||||
db.session.commit()
|
||||
except Exception as e:
|
||||
current_app.logger.warning(e)
|
||||
pass
|
||||
self.check_corpora()
|
||||
self.check_jobs()
|
||||
db.session.commit()
|
||||
sleep(1.5)
|
||||
|
@ -26,37 +26,55 @@ class CheckCorporaMixin:
|
||||
def create_build_corpus_service(self, corpus):
|
||||
''' # Docker service settings # '''
|
||||
''' ## Command ## '''
|
||||
command = 'docker-entrypoint.sh build-corpus'
|
||||
command = ['bash', '-c']
|
||||
command.append(
|
||||
f'mkdir /corpora/data/nopaque_{corpus.id}'
|
||||
' && '
|
||||
'cwb-encode'
|
||||
' -c utf8'
|
||||
f' -d /corpora/data/nopaque_{corpus.id}'
|
||||
' -f /root/files/corpus.vrt'
|
||||
f' -R /usr/local/share/cwb/registry/nopaque_{corpus.id}'
|
||||
' -P pos -P lemma -P simple_pos'
|
||||
' -S ent:0+type -S s:0'
|
||||
' -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title' # noqa
|
||||
' -xsB -9'
|
||||
' && '
|
||||
f'cwb-make -V NOPAQUE_{corpus.id}'
|
||||
)
|
||||
''' ## Constraints ## '''
|
||||
constraints = ['node.role==worker']
|
||||
''' ## Image ## '''
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674' # noqa
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
|
||||
''' ## Labels ## '''
|
||||
labels = {
|
||||
'origin': current_app.config['SERVER_NAME'],
|
||||
'type': 'build-corpus',
|
||||
'type': 'corpus.build',
|
||||
'corpus_id': str(corpus.id)
|
||||
}
|
||||
''' ## Mounts ## '''
|
||||
''' ### Corpus file mount ### '''
|
||||
corpus_file_source = os.path.join(corpus.path, 'merged', 'corpus.vrt')
|
||||
corpus_file_target = '/root/files/corpus.vrt'
|
||||
corpus_file_mount = f'{corpus_file_source}:{corpus_file_target}:ro'
|
||||
''' ### Corpus data mount ### '''
|
||||
corpus_data_source = os.path.join(corpus.path, 'data')
|
||||
corpus_data_target = '/corpora/data'
|
||||
corpus_data_mount = f'{corpus_data_source}:{corpus_data_target}:rw'
|
||||
# Make sure that their is no data in the corpus data directory
|
||||
shutil.rmtree(corpus_data_source, ignore_errors=True)
|
||||
os.mkdir(corpus_data_source)
|
||||
''' ### Corpus registry mount ### '''
|
||||
corpus_registry_source = os.path.join(corpus.path, 'registry')
|
||||
corpus_registry_target = '/usr/local/share/cwb/registry'
|
||||
corpus_registry_mount = f'{corpus_registry_source}:{corpus_registry_target}:rw' # noqa
|
||||
# Make sure that their is no data in the corpus registry directory
|
||||
shutil.rmtree(corpus_registry_source, ignore_errors=True)
|
||||
os.mkdir(corpus_registry_source)
|
||||
mounts = [corpus_file_mount, corpus_data_mount, corpus_registry_mount]
|
||||
mounts = []
|
||||
''' ### Data mount ### '''
|
||||
data_mount_source = os.path.join(corpus.path, 'cwb', 'data')
|
||||
data_mount_target = '/corpora/data'
|
||||
data_mount = f'{data_mount_source}:{data_mount_target}:rw'
|
||||
# Make sure that their is no data in the data directory
|
||||
shutil.rmtree(data_mount_source, ignore_errors=True)
|
||||
os.makedirs(data_mount_source)
|
||||
mounts.append(data_mount)
|
||||
''' ### File mount ### '''
|
||||
file_mount_source = os.path.join(corpus.path, 'cwb', 'corpus.vrt')
|
||||
file_mount_target = '/root/files/corpus.vrt'
|
||||
file_mount = f'{file_mount_source}:{file_mount_target}:ro'
|
||||
mounts.append(file_mount)
|
||||
''' ### Registry mount ### '''
|
||||
registry_mount_source = os.path.join(corpus.path, 'cwb', 'registry')
|
||||
registry_mount_target = '/usr/local/share/cwb/registry'
|
||||
registry_mount = f'{registry_mount_source}:{registry_mount_target}:rw'
|
||||
# Make sure that their is no data in the registry directory
|
||||
shutil.rmtree(registry_mount_source, ignore_errors=True)
|
||||
os.makedirs(registry_mount_source)
|
||||
mounts.append(registry_mount)
|
||||
''' ## Name ## '''
|
||||
name = f'build-corpus_{corpus.id}'
|
||||
''' ## Restart policy ## '''
|
||||
@ -74,7 +92,7 @@ class CheckCorporaMixin:
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Create service "{name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
return
|
||||
corpus.status = 'queued'
|
||||
@ -86,14 +104,14 @@ class CheckCorporaMixin:
|
||||
except docker.errors.NotFound as e:
|
||||
current_app.logger.error(
|
||||
f'Get service "{service_name}" failed '
|
||||
+ f'due to "docker.errors.NotFound": {e}'
|
||||
f'due to "docker.errors.NotFound": {e}'
|
||||
)
|
||||
corpus.status = 'failed'
|
||||
return
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Get service "{service_name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
service_tasks = service.tasks()
|
||||
if not service_tasks:
|
||||
@ -108,36 +126,47 @@ class CheckCorporaMixin:
|
||||
corpus.status = 'failed'
|
||||
else:
|
||||
return
|
||||
try:
|
||||
service.remove()
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Remove service "{service_name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
# try:
|
||||
# service.remove()
|
||||
# except docker.errors.APIError as e:
|
||||
# current_app.logger.error(
|
||||
# f'Remove service "{service_name}" failed '
|
||||
# f'due to "docker.errors.APIError": {e}'
|
||||
# )
|
||||
|
||||
def create_cqpserver_container(self, corpus):
|
||||
''' # Docker container settings # '''
|
||||
''' ## Command ## '''
|
||||
command = 'cqpserver'
|
||||
command = []
|
||||
command.append(
|
||||
'echo "host *;" > cqpserver.init'
|
||||
' && '
|
||||
'echo "user anonymous \\"\\";" >> cqpserver.init'
|
||||
' && '
|
||||
'cqpserver -I cqpserver.init'
|
||||
)
|
||||
''' ## Detach ## '''
|
||||
detach = True
|
||||
''' ## Entrypoint ## '''
|
||||
entrypoint = ['bash', '-c']
|
||||
''' ## Image ## '''
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674' # noqa
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702'
|
||||
''' ## Name ## '''
|
||||
name = f'cqpserver_{corpus.id}'
|
||||
''' ## Network ## '''
|
||||
network = 'nopaque_default'
|
||||
''' ## Volumes ## '''
|
||||
volumes = []
|
||||
''' ### Corpus data volume ### '''
|
||||
corpus_data_source = os.path.join(corpus.path, 'data')
|
||||
corpus_data_target = '/corpora/data'
|
||||
corpus_data_volume = f'{corpus_data_source}:{corpus_data_target}:rw'
|
||||
data_volume_source = os.path.join(corpus.path, 'cwb', 'data')
|
||||
data_volume_target = '/corpora/data'
|
||||
data_volume = f'{data_volume_source}:{data_volume_target}:rw'
|
||||
volumes.append(data_volume)
|
||||
''' ### Corpus registry volume ### '''
|
||||
corpus_registry_source = os.path.join(corpus.path, 'registry')
|
||||
corpus_registry_target = '/usr/local/share/cwb/registry'
|
||||
corpus_registry_volume = f'{corpus_registry_source}:{corpus_registry_target}:rw' # noqa
|
||||
volumes = [corpus_data_volume, corpus_registry_volume]
|
||||
registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry')
|
||||
registry_volume_target = '/usr/local/share/cwb/registry'
|
||||
registry_volume = f'{registry_volume_source}:{registry_volume_target}:rw' # noqa
|
||||
volumes.append(registry_volume)
|
||||
# Check if a cqpserver container already exists. If this is the case,
|
||||
# remove it and create a new one
|
||||
try:
|
||||
@ -147,7 +176,7 @@ class CheckCorporaMixin:
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Get container "{name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
return
|
||||
else:
|
||||
@ -156,7 +185,7 @@ class CheckCorporaMixin:
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Remove container "{name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
return
|
||||
try:
|
||||
@ -164,6 +193,7 @@ class CheckCorporaMixin:
|
||||
image,
|
||||
command=command,
|
||||
detach=detach,
|
||||
entrypoint=entrypoint,
|
||||
volumes=volumes,
|
||||
name=name,
|
||||
network=network
|
||||
@ -171,14 +201,14 @@ class CheckCorporaMixin:
|
||||
except docker.errors.ImageNotFound as e:
|
||||
current_app.logger.error(
|
||||
f'Run container "{name}" failed '
|
||||
+ f'due to "docker.errors.ImageNotFound" error: {e}'
|
||||
f'due to "docker.errors.ImageNotFound" error: {e}'
|
||||
)
|
||||
corpus.status = 'failed'
|
||||
return
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Run container "{name}" failed '
|
||||
+ f'due to "docker.errors.APIError" error: {e}'
|
||||
f'due to "docker.errors.APIError" error: {e}'
|
||||
)
|
||||
return
|
||||
corpus.status = 'analysing'
|
||||
@ -190,14 +220,14 @@ class CheckCorporaMixin:
|
||||
except docker.errors.NotFound as e:
|
||||
current_app.logger.error(
|
||||
f'Get container "{container_name}" failed '
|
||||
+ f'due to "docker.errors.NotFound": {e}'
|
||||
f'due to "docker.errors.NotFound": {e}'
|
||||
)
|
||||
corpus.num_analysis_sessions = 0
|
||||
corpus.status = 'prepared'
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Get container "{container_name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
|
||||
def remove_cqpserver_container(self, corpus):
|
||||
@ -210,7 +240,7 @@ class CheckCorporaMixin:
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Get container "{container_name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
return
|
||||
try:
|
||||
@ -218,5 +248,5 @@ class CheckCorporaMixin:
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Remove container "{container_name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
|
@ -2,7 +2,7 @@ from datetime import datetime
|
||||
from flask import current_app
|
||||
from werkzeug.utils import secure_filename
|
||||
from .. import db
|
||||
from ..models import Job, JobResult
|
||||
from ..models import Job, JobResult, TesseractOCRModel
|
||||
import docker
|
||||
import json
|
||||
import os
|
||||
@ -23,27 +23,34 @@ class CheckJobsMixin:
|
||||
''' # Docker service settings # '''
|
||||
''' ## Service specific settings ## '''
|
||||
if job.service == 'file-setup':
|
||||
mem_mb = 2048
|
||||
mem_mb = 512
|
||||
n_cores = 2
|
||||
executable = 'file-setup'
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:{job.service_version}' # noqa
|
||||
elif job.service == 'ocr':
|
||||
mem_mb = 4096
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:v{job.service_version}' # noqa
|
||||
elif job.service == 'tesseract-ocr':
|
||||
mem_mb = 2048
|
||||
n_cores = 4
|
||||
executable = 'ocr'
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:{job.service_version}' # noqa
|
||||
elif job.service == 'nlp':
|
||||
mem_mb = 2048
|
||||
n_cores = 2
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:v{job.service_version}' # noqa
|
||||
elif job.service == 'spacy-nlp':
|
||||
mem_mb = 1024
|
||||
n_cores = 1
|
||||
executable = 'nlp'
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:{job.service_version}' # noqa
|
||||
image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:v{job.service_version}' # noqa
|
||||
''' ## Command ## '''
|
||||
command = f'{executable} -i /input -o /output'
|
||||
command += ' --log-dir /input'
|
||||
command += ' --log-dir /logs'
|
||||
command += f' --mem-mb {mem_mb}'
|
||||
command += f' --n-cores {n_cores}'
|
||||
command += f' --zip [{job.service}]_{secure_filename(job.title)}'
|
||||
command += ' ' + ' '.join(json.loads(job.service_args))
|
||||
service_args = json.loads(job.service_args)
|
||||
if job.service == 'spacy-nlp':
|
||||
command += f' -m {service_args["model"]}'
|
||||
if 'encoding_detection' in service_args and service_args['encoding_detection']: # noqa
|
||||
command += ' --check-encoding'
|
||||
elif job.service == 'tesseract-ocr':
|
||||
command += f' -m {service_args["model"]}'
|
||||
if 'binarization' in service_args and service_args['binarization']:
|
||||
command += ' --binarize'
|
||||
''' ## Constraints ## '''
|
||||
constraints = ['node.role==worker']
|
||||
''' ## Labels ## '''
|
||||
@ -53,20 +60,42 @@ class CheckJobsMixin:
|
||||
'job_id': str(job.id)
|
||||
}
|
||||
''' ## Mounts ## '''
|
||||
''' ### Input mount ### '''
|
||||
input_mount_source = job.path
|
||||
input_mount_target = '/input'
|
||||
mounts = []
|
||||
''' ### Input mount(s) ### '''
|
||||
input_mount_target_base = '/input'
|
||||
if job.service == 'file-setup':
|
||||
input_mount_target += f'/{secure_filename(job.title)}'
|
||||
input_mount = f'{input_mount_source}:{input_mount_target}:rw'
|
||||
input_mount_target_base += f'/{secure_filename(job.title)}'
|
||||
for job_input in job.inputs:
|
||||
input_mount_source = job_input.path
|
||||
input_mount_target = f'/{input_mount_target_base}/{job_input.filename}' # noqa
|
||||
input_mount = f'{input_mount_source}:{input_mount_target}:ro'
|
||||
mounts.append(input_mount)
|
||||
if job.service == 'tesseract-ocr':
|
||||
service_args = json.loads(job.service_args)
|
||||
model = TesseractOCRModel.query.get(service_args['model'])
|
||||
if model is None:
|
||||
job.status = 'failed'
|
||||
return
|
||||
models_mount_source = model.path
|
||||
models_mount_target = f'/usr/local/share/tessdata/{model.filename}'
|
||||
models_mount = f'{models_mount_source}:{models_mount_target}:ro'
|
||||
mounts.append(models_mount)
|
||||
''' ### Output mount ### '''
|
||||
output_mount_source = os.path.join(job.path, 'output')
|
||||
output_mount_source = os.path.join(job.path, 'results')
|
||||
output_mount_target = '/output'
|
||||
output_mount = f'{output_mount_source}:{output_mount_target}:rw'
|
||||
# Make sure that their is no data in the output directory
|
||||
shutil.rmtree(output_mount_source, ignore_errors=True)
|
||||
os.makedirs(output_mount_source)
|
||||
mounts = [input_mount, output_mount]
|
||||
mounts.append(output_mount)
|
||||
''' ### Pipeline data mount ### '''
|
||||
pyflow_data_mount_source = os.path.join(job.path, 'pipeline_data')
|
||||
pyflow_data_mount_target = '/logs/pyflow.data'
|
||||
pyflow_data_mount = f'{pyflow_data_mount_source}:{pyflow_data_mount_target}:rw' # noqa
|
||||
# Make sure that their is no data in the output directory
|
||||
shutil.rmtree(pyflow_data_mount_source, ignore_errors=True)
|
||||
os.makedirs(pyflow_data_mount_source)
|
||||
mounts.append(pyflow_data_mount)
|
||||
''' ## Name ## '''
|
||||
name = f'job_{job.id}'
|
||||
''' ## Resources ## '''
|
||||
@ -90,7 +119,7 @@ class CheckJobsMixin:
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Create service "{name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
return
|
||||
job.status = 'queued'
|
||||
@ -102,14 +131,14 @@ class CheckJobsMixin:
|
||||
except docker.errors.NotFound as e:
|
||||
current_app.logger.error(
|
||||
f'Get service "{service_name}" failed '
|
||||
+ f'due to "docker.errors.NotFound": {e}'
|
||||
f'due to "docker.errors.NotFound": {e}'
|
||||
)
|
||||
job.status = 'failed'
|
||||
return
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Get service "{service_name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
return
|
||||
service_tasks = service.tasks()
|
||||
@ -121,13 +150,25 @@ class CheckJobsMixin:
|
||||
return
|
||||
elif job.status == 'running' and task_state == 'complete':
|
||||
job.status = 'complete'
|
||||
results_dir = os.path.join(job.path, 'output')
|
||||
result_files = [x for x in os.listdir(results_dir) if x.endswith('.zip')] # noqa
|
||||
for result_file in result_files:
|
||||
job_result = JobResult(filename=result_file, job=job)
|
||||
results_dir = os.path.join(job.path, 'results')
|
||||
with open(os.path.join(results_dir, 'outputs.json')) as f:
|
||||
outputs = json.load(f)
|
||||
for output in outputs:
|
||||
filename = os.path.basename(output['file'])
|
||||
job_result = JobResult(
|
||||
filename=filename,
|
||||
job=job,
|
||||
mimetype=output['mimetype']
|
||||
)
|
||||
if 'description' in output:
|
||||
job_result.description = output['description']
|
||||
db.session.add(job_result)
|
||||
db.session.flush()
|
||||
db.session.flush(objects=[job_result])
|
||||
db.session.refresh(job_result)
|
||||
os.rename(
|
||||
os.path.join(results_dir, output['file']),
|
||||
job_result.path
|
||||
)
|
||||
elif job.status == 'running' and task_state == 'failed':
|
||||
job.status = 'failed'
|
||||
else:
|
||||
@ -138,7 +179,7 @@ class CheckJobsMixin:
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Remove service "{service_name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
|
||||
def remove_job_service(self, job):
|
||||
@ -151,7 +192,7 @@ class CheckJobsMixin:
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Get service "{service_name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
return
|
||||
try:
|
||||
@ -159,7 +200,7 @@ class CheckJobsMixin:
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Update service "{service_name}" failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
return
|
||||
try:
|
||||
@ -167,5 +208,5 @@ class CheckJobsMixin:
|
||||
except docker.errors.APIError as e:
|
||||
current_app.logger.error(
|
||||
f'Remove "{service_name}" service failed '
|
||||
+ f'due to "docker.errors.APIError": {e}'
|
||||
f'due to "docker.errors.APIError": {e}'
|
||||
)
|
||||
|
@ -34,12 +34,14 @@ def delete_job(job_id):
|
||||
@login_required
|
||||
def download_job_input(job_id, job_input_id):
|
||||
job_input = JobInput.query.filter(JobInput.job_id == job_id, JobInput.id == job_input_id).first_or_404() # noqa
|
||||
if not (job_input.job.user == current_user
|
||||
or current_user.is_administrator()):
|
||||
if not (job_input.job.user == current_user or current_user.is_administrator()): # noqa
|
||||
abort(403)
|
||||
return send_from_directory(as_attachment=True,
|
||||
directory=os.path.dirname(job_input.path),
|
||||
filename=job_input.filename)
|
||||
return send_from_directory(
|
||||
as_attachment=True,
|
||||
attachment_filename=job_input.filename,
|
||||
directory=os.path.dirname(job_input.path),
|
||||
filename=os.path.basename(job_input.path)
|
||||
)
|
||||
|
||||
|
||||
@bp.route('/<hashid:job_id>/restart')
|
||||
@ -59,9 +61,11 @@ def restart(job_id):
|
||||
@login_required
|
||||
def download_job_result(job_id, job_result_id):
|
||||
job_result = JobResult.query.filter(JobResult.job_id == job_id, JobResult.id == job_result_id).first_or_404() # noqa
|
||||
if not (job_result.job.user == current_user
|
||||
or current_user.is_administrator()):
|
||||
if not (job_result.job.user == current_user or current_user.is_administrator()): # noqa
|
||||
abort(403)
|
||||
return send_from_directory(as_attachment=True,
|
||||
directory=os.path.dirname(job_result.path),
|
||||
filename=job_result.filename)
|
||||
return send_from_directory(
|
||||
as_attachment=True,
|
||||
attachment_filename=job_result.filename,
|
||||
directory=os.path.dirname(job_result.path),
|
||||
filename=os.path.basename(job_result.path)
|
||||
)
|
||||
|
174
app/models.py
174
app/models.py
@ -4,13 +4,17 @@ from flask_hashids import HashidMixin
|
||||
from flask_login import UserMixin
|
||||
from itsdangerous import BadSignature, TimedJSONWebSignatureSerializer
|
||||
from time import sleep
|
||||
from tqdm import tqdm
|
||||
from werkzeug.security import generate_password_hash, check_password_hash
|
||||
import xml.etree.ElementTree as ET
|
||||
from . import db, login
|
||||
import base64
|
||||
import enum
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
import shutil
|
||||
import xml.etree.ElementTree as ET
|
||||
import yaml
|
||||
|
||||
|
||||
class Permission(enum.IntEnum):
|
||||
@ -25,7 +29,7 @@ class Permission(enum.IntEnum):
|
||||
|
||||
class FileMixin:
|
||||
creation_date = db.Column(db.DateTime, default=datetime.utcnow)
|
||||
filename = db.Column(db.String(256))
|
||||
filename = db.Column(db.String(255))
|
||||
last_edited_date = db.Column(db.DateTime, default=datetime.utcnow)
|
||||
mimetype = db.Column(db.String(255))
|
||||
|
||||
@ -86,7 +90,7 @@ class Role(HashidMixin, db.Model):
|
||||
return dict_role
|
||||
|
||||
@staticmethod
|
||||
def insert_roles():
|
||||
def insert_defaults():
|
||||
roles = {
|
||||
'User': [],
|
||||
'API user': [Permission.USE_API],
|
||||
@ -132,6 +136,12 @@ class User(HashidMixin, UserMixin, db.Model):
|
||||
db.String(16), default='all')
|
||||
# Backrefs: role: Role
|
||||
# Relationships
|
||||
tesseract_ocr_models = db.relationship(
|
||||
'TesseractOCRModel',
|
||||
backref='user',
|
||||
cascade='all, delete-orphan',
|
||||
lazy='dynamic'
|
||||
)
|
||||
corpora = db.relationship(
|
||||
'Corpus',
|
||||
backref='user',
|
||||
@ -221,6 +231,12 @@ class User(HashidMixin, UserMixin, db.Model):
|
||||
def is_administrator(self):
|
||||
return self.can(Permission.ADMINISTRATE)
|
||||
|
||||
def makedirs(self):
|
||||
os.mkdir(self.path)
|
||||
os.mkdir(os.path.join(self.path, 'tesseract_ocr_models'))
|
||||
os.mkdir(os.path.join(self.path, 'corpora'))
|
||||
os.mkdir(os.path.join(self.path, 'jobs'))
|
||||
|
||||
def revoke_token(self):
|
||||
self.token_expiration = datetime.utcnow() - timedelta(seconds=1)
|
||||
|
||||
@ -269,6 +285,21 @@ class User(HashidMixin, UserMixin, db.Model):
|
||||
return None
|
||||
return user
|
||||
|
||||
@staticmethod
|
||||
def insert_defaults():
|
||||
if User.query.filter_by(username='nopaque').first() is not None:
|
||||
return
|
||||
user = User(username='nopaque')
|
||||
db.session.add(user)
|
||||
db.session.flush(objects=[user])
|
||||
db.session.refresh(user)
|
||||
try:
|
||||
user.makedirs()
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
db.session.commit()
|
||||
|
||||
@staticmethod
|
||||
def reset_password(token, new_password):
|
||||
s = TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY'])
|
||||
@ -284,6 +315,72 @@ class User(HashidMixin, UserMixin, db.Model):
|
||||
return True
|
||||
|
||||
|
||||
class TesseractOCRModel(FileMixin, HashidMixin, db.Model):
|
||||
__tablename__ = 'tesseract_ocr_models'
|
||||
# Primary key
|
||||
id = db.Column(db.Integer, primary_key=True)
|
||||
# Foreign keys
|
||||
user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
|
||||
# Fields
|
||||
compatible_service_versions = db.Column(db.String(255))
|
||||
description = db.Column(db.String(255))
|
||||
publisher = db.Column(db.String(128))
|
||||
publishing_year = db.Column(db.Integer)
|
||||
title = db.Column(db.String(64))
|
||||
version = db.Column(db.String(16))
|
||||
# Backrefs: user: User
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return os.path.join(
|
||||
self.user.path,
|
||||
'tesseract_ocr_models',
|
||||
str(self.id)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def insert_defaults():
|
||||
user = User.query.filter_by(username='nopaque').first()
|
||||
defaults_file = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)),
|
||||
'TesseractOCRModel.defaults.yml'
|
||||
)
|
||||
with open(defaults_file, 'r') as f:
|
||||
defaults = yaml.safe_load(f)
|
||||
for m in defaults:
|
||||
if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None: # noqa
|
||||
continue
|
||||
tesseract_ocr_model = TesseractOCRModel(
|
||||
compatible_service_versions=json.dumps(m['compatible_service_versions']), # noqa
|
||||
description=m['description'],
|
||||
publisher=m['publisher'],
|
||||
publishing_year=m['publishing_year'],
|
||||
title=m['title'],
|
||||
user=user,
|
||||
version=m['version']
|
||||
)
|
||||
db.session.add(tesseract_ocr_model)
|
||||
db.session.flush(objects=[tesseract_ocr_model])
|
||||
db.session.refresh(tesseract_ocr_model)
|
||||
tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata' # noqa
|
||||
r = requests.get(m['url'], stream=True)
|
||||
pbar = tqdm(
|
||||
desc=f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})', # noqa
|
||||
unit="B",
|
||||
unit_scale=True,
|
||||
unit_divisor=1024,
|
||||
total=int(r.headers['Content-Length'])
|
||||
)
|
||||
pbar.clear()
|
||||
with open(tesseract_ocr_model.path, 'wb') as f:
|
||||
for chunk in r.iter_content(chunk_size=1024):
|
||||
if chunk: # filter out keep-alive new chunks
|
||||
pbar.update(len(chunk))
|
||||
f.write(chunk)
|
||||
pbar.close()
|
||||
db.session.commit()
|
||||
|
||||
|
||||
class JobInput(FileMixin, HashidMixin, db.Model):
|
||||
__tablename__ = 'job_inputs'
|
||||
# Primary key
|
||||
@ -309,7 +406,7 @@ class JobInput(FileMixin, HashidMixin, db.Model):
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return os.path.join(self.job.path, self.filename)
|
||||
return os.path.join(self.job.path, 'inputs', str(self.id))
|
||||
|
||||
def to_dict(self, backrefs=False, relationships=False):
|
||||
dict_job_input = {
|
||||
@ -347,6 +444,8 @@ class JobResult(FileMixin, HashidMixin, db.Model):
|
||||
id = db.Column(db.Integer, primary_key=True)
|
||||
# Foreign keys
|
||||
job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
|
||||
# Fields
|
||||
description = db.Column(db.String(255))
|
||||
# Backrefs: job: Job
|
||||
|
||||
def __repr__(self):
|
||||
@ -366,12 +465,13 @@ class JobResult(FileMixin, HashidMixin, db.Model):
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return os.path.join(self.job.path, 'output', self.filename)
|
||||
return os.path.join(self.job.path, 'results', str(self.id))
|
||||
|
||||
def to_dict(self, backrefs=False, relationships=False):
|
||||
dict_job_result = {
|
||||
'id': self.hashid,
|
||||
'job_id': self.job.hashid,
|
||||
'description': self.description,
|
||||
'download_url': self.download_url,
|
||||
'url': self.url,
|
||||
**self.file_mixin_to_dict(
|
||||
@ -414,8 +514,8 @@ class Job(HashidMixin, db.Model):
|
||||
end_date = db.Column(db.DateTime())
|
||||
service = db.Column(db.String(64))
|
||||
'''
|
||||
' Service specific arguments as string list.
|
||||
' Example: ["-l eng", "--binarize"]
|
||||
' Dictionary as JSON formatted string.
|
||||
' Example: {"binarization": True}
|
||||
'''
|
||||
service_args = db.Column(db.String(255))
|
||||
service_version = db.Column(db.String(16))
|
||||
@ -472,6 +572,12 @@ class Job(HashidMixin, db.Model):
|
||||
shutil.rmtree(self.path, ignore_errors=True)
|
||||
db.session.delete(self)
|
||||
|
||||
def makedirs(self):
|
||||
os.mkdir(self.path)
|
||||
os.mkdir(os.path.join(self.path, 'inputs'))
|
||||
os.mkdir(os.path.join(self.path, 'pipeline_data'))
|
||||
os.mkdir(os.path.join(self.path, 'results'))
|
||||
|
||||
def restart(self):
|
||||
'''
|
||||
Restart a job - only if the status is complete or failed
|
||||
@ -479,7 +585,7 @@ class Job(HashidMixin, db.Model):
|
||||
|
||||
if self.status not in ['complete', 'failed']:
|
||||
raise Exception('Could not restart job: status is not "complete/failed"') # noqa
|
||||
shutil.rmtree(os.path.join(self.path, 'output'), ignore_errors=True)
|
||||
shutil.rmtree(os.path.join(self.path, 'results'), ignore_errors=True)
|
||||
shutil.rmtree(os.path.join(self.path, 'pyflow.data'), ignore_errors=True) # noqa
|
||||
for result in self.results:
|
||||
db.session.delete(result)
|
||||
@ -487,6 +593,10 @@ class Job(HashidMixin, db.Model):
|
||||
self.status = 'submitted'
|
||||
|
||||
def to_dict(self, backrefs=False, relationships=False):
|
||||
service_args = json.loads(self.service_args)
|
||||
if self.service == 'tesseract-ocr' and 'model' in service_args:
|
||||
tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model']) # noqa
|
||||
service_args['model'] = tesseract_ocr_pipeline_model.title
|
||||
dict_job = {
|
||||
'id': self.hashid,
|
||||
'user_id': self.user.hashid,
|
||||
@ -494,7 +604,7 @@ class Job(HashidMixin, db.Model):
|
||||
'description': self.description,
|
||||
'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z', # noqa
|
||||
'service': self.service,
|
||||
'service_args': self.service_args,
|
||||
'service_args': service_args,
|
||||
'service_version': self.service_version,
|
||||
'status': self.status,
|
||||
'title': self.title,
|
||||
@ -550,7 +660,7 @@ class CorpusFile(FileMixin, HashidMixin, db.Model):
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return os.path.join(self.corpus.path, self.filename)
|
||||
return os.path.join(self.corpus.path, 'files', str(self.id))
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
@ -659,28 +769,27 @@ class Corpus(HashidMixin, db.Model):
|
||||
return self.user.hashid
|
||||
|
||||
def build(self):
|
||||
output_dir = os.path.join(self.path, 'merged')
|
||||
shutil.rmtree(output_dir, ignore_errors=True)
|
||||
os.mkdir(output_dir)
|
||||
output_file = os.path.join(output_dir, 'corpus.vrt')
|
||||
corpus_element = ET.fromstring('<corpus>\n</corpus>')
|
||||
for corpus_file in self.files:
|
||||
element_tree = ET.parse(corpus_file.path)
|
||||
text_node = element_tree.find('text')
|
||||
text_node.set('address', corpus_file.address or 'NULL')
|
||||
text_node.set('author', corpus_file.author)
|
||||
text_node.set('booktitle', corpus_file.booktitle or 'NULL')
|
||||
text_node.set('chapter', corpus_file.chapter or 'NULL')
|
||||
text_node.set('editor', corpus_file.editor or 'NULL')
|
||||
text_node.set('institution', corpus_file.institution or 'NULL')
|
||||
text_node.set('journal', corpus_file.journal or 'NULL')
|
||||
text_node.set('pages', corpus_file.pages or 'NULL')
|
||||
text_node.set('publisher', corpus_file.publisher or 'NULL')
|
||||
text_node.set('publishing_year', str(corpus_file.publishing_year))
|
||||
text_node.set('school', corpus_file.school or 'NULL')
|
||||
text_node.set('title', corpus_file.title)
|
||||
corpus_element.insert(1, text_node)
|
||||
ET.ElementTree(corpus_element).write(output_file, encoding='utf-8')
|
||||
text_element = element_tree.getroot()
|
||||
text_element.set('address', corpus_file.address or 'NULL')
|
||||
text_element.set('author', corpus_file.author)
|
||||
text_element.set('booktitle', corpus_file.booktitle or 'NULL')
|
||||
text_element.set('chapter', corpus_file.chapter or 'NULL')
|
||||
text_element.set('editor', corpus_file.editor or 'NULL')
|
||||
text_element.set('institution', corpus_file.institution or 'NULL')
|
||||
text_element.set('journal', corpus_file.journal or 'NULL')
|
||||
text_element.set('pages', corpus_file.pages or 'NULL')
|
||||
text_element.set('publisher', corpus_file.publisher or 'NULL')
|
||||
text_element.set('publishing_year', str(corpus_file.publishing_year)) # noqa
|
||||
text_element.set('school', corpus_file.school or 'NULL')
|
||||
text_element.set('title', corpus_file.title)
|
||||
corpus_element.insert(1, text_element)
|
||||
ET.ElementTree(corpus_element).write(
|
||||
os.path.join(self.path, 'cwb', 'corpus.vrt'),
|
||||
encoding='utf-8'
|
||||
)
|
||||
self.last_edited_date = datetime.utcnow()
|
||||
self.status = 'submitted'
|
||||
|
||||
@ -688,6 +797,13 @@ class Corpus(HashidMixin, db.Model):
|
||||
shutil.rmtree(self.path, ignore_errors=True)
|
||||
db.session.delete(self)
|
||||
|
||||
def makedirs(self):
|
||||
os.mkdir(self.path)
|
||||
os.mkdir(os.path.join(self.path, 'files'))
|
||||
os.mkdir(os.path.join(self.path, 'cwb'))
|
||||
os.mkdir(os.path.join(self.path, 'cwb', 'data'))
|
||||
os.mkdir(os.path.join(self.path, 'cwb', 'registry'))
|
||||
|
||||
def to_dict(self, backrefs=False, relationships=False):
|
||||
dict_corpus = {
|
||||
'id': self.hashid,
|
||||
|
@ -1,77 +1,13 @@
|
||||
from flask import Blueprint
|
||||
import os
|
||||
import yaml
|
||||
|
||||
|
||||
SERVICES = {
|
||||
'file-setup': {
|
||||
'name': 'File setup',
|
||||
'versions': {
|
||||
'latest': '1.0.0b',
|
||||
'1.0.0b': {
|
||||
'publishing_data': {
|
||||
'date': None,
|
||||
'title': 'nopaque File setup service',
|
||||
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/tree/1.0.0b', # noqa
|
||||
'version': '1.0.0'
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
'nlp': {
|
||||
'name': 'Natural Language Processing',
|
||||
'versions': {
|
||||
'latest': '1.0.0b',
|
||||
'1.0.0b': {
|
||||
'check_encoding': True,
|
||||
'models': {
|
||||
'de': 'German',
|
||||
'en': 'English',
|
||||
'it': 'Italian',
|
||||
'nl': 'Dutch',
|
||||
'pl': 'Polish',
|
||||
'zh': 'Chinese'
|
||||
},
|
||||
'publishing_data': {
|
||||
'date': None,
|
||||
'title': 'nopaque NLP service',
|
||||
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/tree/1.0.0b', # noqa
|
||||
'version': '1.0.0'
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
'ocr': {
|
||||
'name': 'Optical Character Recognition',
|
||||
'versions': {
|
||||
'latest': '1.0.0b',
|
||||
'1.0.0b': {
|
||||
'binarization': True,
|
||||
'models': {
|
||||
'ara': 'Arabic',
|
||||
'chi_tra': 'Chinese - Traditional',
|
||||
'dan': 'Danish',
|
||||
'eng': 'English',
|
||||
'enm': 'English, Middle 1100-1500',
|
||||
'fra': 'French',
|
||||
'frm': 'French, Middle ca. 1400-1600',
|
||||
'deu': 'German',
|
||||
'frk': 'German Fraktur',
|
||||
'ell': 'Greek, Modern (1453-)',
|
||||
'ita': 'Italian',
|
||||
'por': 'Portuguese',
|
||||
'rus': 'Russian',
|
||||
'spa': 'Spanish; Castilian',
|
||||
},
|
||||
'publishing_data': {
|
||||
'date': None,
|
||||
'title': 'nopaque OCR service',
|
||||
'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/tree/1.0.0b', # noqa
|
||||
'version': '1.0.0'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
services_file = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), 'services.yml')
|
||||
with open(services_file, 'r') as f:
|
||||
SERVICES = yaml.safe_load(f)
|
||||
|
||||
|
||||
bp = Blueprint('services', __name__)
|
||||
from . import routes
|
||||
from . import routes # noqa
|
||||
|
@ -1,3 +1,4 @@
|
||||
from app.models import TesseractOCRModel
|
||||
from flask_wtf import FlaskForm
|
||||
from wtforms import (BooleanField, MultipleFileField, SelectField, StringField,
|
||||
SubmitField, ValidationError)
|
||||
@ -6,85 +7,105 @@ from . import SERVICES
|
||||
|
||||
|
||||
class AddJobForm(FlaskForm):
|
||||
description = StringField('Description',
|
||||
validators=[DataRequired(), Length(1, 255)])
|
||||
description = StringField('Description', validators=[DataRequired(), Length(1, 255)]) # noqa
|
||||
submit = SubmitField()
|
||||
title = StringField('Title', validators=[DataRequired(), Length(1, 32)])
|
||||
version = SelectField('Version', validators=[DataRequired()])
|
||||
|
||||
|
||||
class AddNLPJobForm(AddJobForm):
|
||||
check_encoding = BooleanField('Check encoding')
|
||||
class AddSpacyNLPJobForm(AddJobForm):
|
||||
encoding_detection = BooleanField('Encoding detection')
|
||||
files = MultipleFileField('Files', validators=[DataRequired()])
|
||||
language = SelectField('Language', choices=[('', 'Choose your option')],
|
||||
default='', validators=[DataRequired()])
|
||||
model = SelectField(
|
||||
'Model',
|
||||
choices=[('', 'Choose your option')],
|
||||
default='',
|
||||
validators=[DataRequired()]
|
||||
)
|
||||
|
||||
def validate_check_encoding(self, field):
|
||||
if field.data and 'check_encoding' not in SERVICES['nlp']['versions'][self.version.data]: # noqa
|
||||
raise ValidationError('Check encoding is not available in this version') # noqa
|
||||
def validate_encoding_detection(self, field):
|
||||
service_info = SERVICES['spacy-nlp']['versions'][self.version.data]
|
||||
if field.data and 'encoding_detection' not in service_info:
|
||||
raise ValidationError('Encoding detection is not available')
|
||||
|
||||
def validate_files(form, field):
|
||||
valid_extensions = ['.txt']
|
||||
for file in field.data:
|
||||
if not file.filename.lower().endswith('.txt'):
|
||||
raise ValidationError('File does not have an approved '
|
||||
'extension: .txt')
|
||||
if not file.filename.lower().endswith(tuple(valid_extensions)):
|
||||
raise ValidationError(
|
||||
'File does not have an approved extension: '
|
||||
'/'.join(valid_extensions)
|
||||
)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
version = kwargs.pop('version', SERVICES['nlp']['versions']['latest'])
|
||||
version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version']) # noqa
|
||||
super().__init__(*args, **kwargs)
|
||||
if 'check_encoding' not in SERVICES['nlp']['versions'][version]:
|
||||
self.check_encoding.render_kw = {'disabled': True}
|
||||
self.language.choices += [(x, y) for x, y in SERVICES['nlp']['versions'][version]['models'].items()] # noqa
|
||||
self.version.choices = [(x, x) for x in SERVICES['nlp']['versions'] if x != 'latest'] # noqa
|
||||
service_info = SERVICES['spacy-nlp']['versions'][version]
|
||||
if 'check_encoding' not in service_info['methods']:
|
||||
self.encoding_detection.render_kw = {'disabled': True}
|
||||
self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa
|
||||
self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']] # noqa
|
||||
self.version.default = version
|
||||
|
||||
|
||||
class AddOCRJobForm(AddJobForm):
|
||||
binarization = BooleanField('Binarazation')
|
||||
class AddTesseractOCRJobForm(AddJobForm):
|
||||
binarization = BooleanField('Binarization')
|
||||
files = MultipleFileField('Files', validators=[DataRequired()])
|
||||
language = SelectField('Language', choices=[('', 'Choose your option')],
|
||||
default='', validators=[DataRequired()])
|
||||
model = SelectField(
|
||||
'Model',
|
||||
choices=[('', 'Choose your option')],
|
||||
default='',
|
||||
validators=[DataRequired()]
|
||||
)
|
||||
|
||||
def validate_binarization(self, field):
|
||||
if field.data and 'binarization' not in SERVICES['ocr']['versions'][self.version.data]: # noqa
|
||||
raise ValidationError('Binarization is not available in this version') # noqa
|
||||
service_info = SERVICES['tesseract-ocr']['versions'][self.version.data]
|
||||
if field.data and 'binarization' not in service_info:
|
||||
raise ValidationError('Binarization is not available')
|
||||
|
||||
def validate_files(self, field):
|
||||
valid_extensions = ['.pdf']
|
||||
for file in field.data:
|
||||
if not file.filename.lower().endswith('.pdf'):
|
||||
raise ValidationError('File does not have an approved '
|
||||
'extension: .pdf')
|
||||
if not file.filename.lower().endswith(tuple(valid_extensions)):
|
||||
raise ValidationError(
|
||||
'File does not have an approved extension: '
|
||||
'/'.join(valid_extensions)
|
||||
)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
version = kwargs.pop('version', SERVICES['ocr']['versions']['latest'])
|
||||
version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version']) # noqa
|
||||
super().__init__(*args, **kwargs)
|
||||
if 'binarization' not in SERVICES['ocr']['versions'][version]:
|
||||
service_info = SERVICES['tesseract-ocr']['versions'][version]
|
||||
if 'binarization' not in service_info['methods']:
|
||||
self.binarization.render_kw = {'disabled': True}
|
||||
self.language.choices += [(x, y) for x, y in SERVICES['ocr']['versions'][version]['models'].items()] # noqa
|
||||
self.version.choices = [(x, x) for x in SERVICES['ocr']['versions'] if x != 'latest'] # noqa
|
||||
self.version.default = version
|
||||
self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()] # noqa
|
||||
self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']] # noqa
|
||||
self.version.data = version
|
||||
self.version.default = SERVICES['tesseract-ocr']['latest_version']
|
||||
|
||||
|
||||
class AddFileSetupJobForm(AddJobForm):
|
||||
files = MultipleFileField('Files', validators=[DataRequired()])
|
||||
|
||||
def validate_files(form, field):
|
||||
valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif']
|
||||
for file in field.data:
|
||||
if not file.filename.lower().endswith(('.jpeg', '.jpg', '.png',
|
||||
'.tiff', '.tif')):
|
||||
raise ValidationError('File does not have an approved '
|
||||
'extension: .jpeg | .jpg | .png | .tiff '
|
||||
'| .tif')
|
||||
if not file.filename.lower().endswith(tuple(valid_extensions)):
|
||||
raise ValidationError(
|
||||
'File does not have an approved extension: '
|
||||
'/'.join(valid_extensions)
|
||||
)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
version = kwargs.pop('version', SERVICES['file-setup']['versions']['latest'])
|
||||
version = kwargs.pop('version', SERVICES['file-setup']['latest_version']) # noqa
|
||||
super().__init__(*args, **kwargs)
|
||||
self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions'] if x != 'latest'] # noqa
|
||||
self.version.default = version
|
||||
self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']] # noqa
|
||||
self.version.data = version
|
||||
self.version.default = SERVICES['file-setup']['latest_version']
|
||||
|
||||
|
||||
AddJobForms = {
|
||||
'file-setup': AddFileSetupJobForm,
|
||||
'ocr': AddOCRJobForm,
|
||||
'nlp': AddNLPJobForm
|
||||
'tesseract-ocr': AddTesseractOCRJobForm,
|
||||
'spacy-nlp': AddSpacyNLPJobForm
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
from app import hashids
|
||||
from flask import (abort, current_app, flash, make_response, render_template,
|
||||
request, url_for)
|
||||
from flask_login import current_user, login_required
|
||||
@ -8,7 +9,6 @@ from .. import db
|
||||
from .forms import AddJobForms
|
||||
from ..models import Job, JobInput
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
@bp.route('/corpus-analysis')
|
||||
@ -24,57 +24,65 @@ def service(service):
|
||||
# Check if the requested service exist
|
||||
if service not in SERVICES or service not in AddJobForms:
|
||||
abort(404)
|
||||
version = request.args.get(
|
||||
'version', SERVICES[service]['versions']['latest'])
|
||||
version = request.args.get('version', SERVICES[service]['latest_version'])
|
||||
if version not in SERVICES[service]['versions']:
|
||||
abort(404)
|
||||
form = AddJobForms[service](prefix='add-job-form', version=version)
|
||||
form.version.data = version
|
||||
title = SERVICES[service]['name']
|
||||
versions = SERVICES[service]['versions']
|
||||
if form.is_submitted():
|
||||
if not form.validate():
|
||||
return make_response(form.errors, 400)
|
||||
service_args = []
|
||||
if service == 'nlp':
|
||||
service_args.append(f'-l {form.language.data}')
|
||||
if form.check_encoding.data:
|
||||
service_args.append('--check-encoding')
|
||||
if service == 'ocr':
|
||||
service_args.append(f'-l {form.language.data}')
|
||||
service_args = {}
|
||||
if service == 'spacy-nlp':
|
||||
service_args['model'] = form.model.data
|
||||
if form.encoding_detection.data:
|
||||
service_args['encoding_detection'] = True
|
||||
if service == 'tesseract-ocr':
|
||||
service_args['model'] = hashids.decode(form.model.data)
|
||||
if form.binarization.data:
|
||||
service_args.append('--binarize')
|
||||
job = Job(user=current_user,
|
||||
description=form.description.data,
|
||||
service=service, service_args=json.dumps(service_args),
|
||||
service_version=form.version.data,
|
||||
status='preparing', title=form.title.data)
|
||||
service_args['binarization'] = True
|
||||
job = Job(
|
||||
user=current_user,
|
||||
description=form.description.data,
|
||||
service=service,
|
||||
service_args=json.dumps(service_args),
|
||||
service_version=form.version.data,
|
||||
status='preparing',
|
||||
title=form.title.data
|
||||
)
|
||||
db.session.add(job)
|
||||
db.session.flush()
|
||||
db.session.flush(objects=[job])
|
||||
db.session.refresh(job)
|
||||
try:
|
||||
os.makedirs(job.path)
|
||||
except OSError:
|
||||
current_app.logger.error(f'Make dir {job.path} led to an OSError!')
|
||||
job.makedirs()
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
flash('Internal Server Error', 'error')
|
||||
return make_response(
|
||||
{'redirect_url': url_for('.service', service=service)}, 500)
|
||||
else:
|
||||
for file in form.files.data:
|
||||
filename = secure_filename(file.filename)
|
||||
job_input = JobInput(
|
||||
filename=filename, job=job, mimetype=file.mimetype)
|
||||
return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa
|
||||
for file in form.files.data:
|
||||
filename = secure_filename(file.filename)
|
||||
job_input = JobInput(
|
||||
filename=filename,
|
||||
job=job,
|
||||
mimetype=file.mimetype
|
||||
)
|
||||
db.session.add(job_input)
|
||||
db.session.flush(objects=[job_input])
|
||||
db.session.refresh(job_input)
|
||||
try:
|
||||
file.save(job_input.path)
|
||||
db.session.add(job_input)
|
||||
job.status = 'submitted'
|
||||
db.session.commit()
|
||||
flash(f'Job "{job.title}" added', 'job')
|
||||
return make_response(
|
||||
{'redirect_url': url_for('jobs.job', job_id=job.id)}, 201)
|
||||
except OSError as e:
|
||||
current_app.logger.error(e)
|
||||
db.session.rollback()
|
||||
flash('Internal Server Error', 'error')
|
||||
return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa
|
||||
job.status = 'submitted'
|
||||
db.session.commit()
|
||||
flash(f'Job "{job.title}" added', 'job')
|
||||
return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa
|
||||
return render_template(
|
||||
f'services/{service.replace("-", "_")}.html.j2',
|
||||
form=form,
|
||||
title=title,
|
||||
versions=versions
|
||||
title=title
|
||||
)
|
||||
|
38
app/services/services.yml
Normal file
38
app/services/services.yml
Normal file
@ -0,0 +1,38 @@
|
||||
# TODO: This could also be done via GitLab/GitHub APIs
|
||||
#file-setup-pipeline:
|
||||
file-setup:
|
||||
name: 'File setup pipeline'
|
||||
latest_version: '0.1.0'
|
||||
versions:
|
||||
0.1.0:
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0'
|
||||
#spacy-nlp-pipeline:
|
||||
spacy-nlp:
|
||||
name: 'spaCy NLP'
|
||||
latest_version: '0.1.0'
|
||||
versions:
|
||||
0.1.0:
|
||||
methods:
|
||||
- 'encoding_detection'
|
||||
models:
|
||||
de: 'German'
|
||||
en: 'English'
|
||||
it: 'Italian'
|
||||
pl: 'Polish'
|
||||
zh: 'Chinese'
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0'
|
||||
#tesseract-ocr-pipeline:
|
||||
tesseract-ocr:
|
||||
name: 'Tesseract OCR'
|
||||
latest_version: '0.1.0'
|
||||
versions:
|
||||
0.1.0:
|
||||
methods:
|
||||
- 'binarization'
|
||||
publisher: 'Bielefeld University - CRC 1288 - INF'
|
||||
publishing_year: 2022
|
||||
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0'
|
@ -50,8 +50,8 @@ h1 .nopaque-icons, h2 .nopaque-icons, h3 .nopaque-icons, h4 .nopaque-icons,
|
||||
}
|
||||
.nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";}
|
||||
.nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";}
|
||||
.nopaque-icons.service-icon[data-service="nlp"]:empty:before {content: "G";}
|
||||
.nopaque-icons.service-icon[data-service="ocr"]:empty:before {content: "F";}
|
||||
.nopaque-icons.service-icon[data-service="spacy-nlp"]:empty:before {content: "G";}
|
||||
.nopaque-icons.service-icon[data-service="tesseract-ocr"]:empty:before {content: "F";}
|
||||
|
||||
.status-text[data-status]:empty:before {content: attr(data-status);}
|
||||
|
||||
|
@ -53,7 +53,7 @@ class CorpusAnalysisApp {
|
||||
this.data.cQiClient = new CQiClient(this.settings.corpusId);
|
||||
this.data.cQiClient.connect()
|
||||
.then(cQiStatus => {
|
||||
return this.data.cQiClient.corpora.get('CORPUS');
|
||||
return this.data.cQiClient.corpora.get(`NOPAQUE_${this.settings.corpusId}`);
|
||||
})
|
||||
.then(
|
||||
cQiCorpus => {
|
||||
|
@ -100,7 +100,10 @@ class JobDisplay extends RessourceDisplay {
|
||||
}
|
||||
|
||||
setServiceArgs(serviceArgs) {
|
||||
this.setElements(this.displayElement.querySelectorAll('.job-service-args'), serviceArgs);
|
||||
this.setElements(
|
||||
this.displayElement.querySelectorAll('.job-service-args'),
|
||||
JSON.stringify(serviceArgs)
|
||||
);
|
||||
}
|
||||
|
||||
setServiceVersion(serviceVersion) {
|
||||
|
@ -10,25 +10,10 @@ class JobResultList extends RessourceList {
|
||||
</tr>
|
||||
`.trim(),
|
||||
ressourceMapper: jobResult => {
|
||||
let description;
|
||||
|
||||
if (jobResult.filename.endsWith('.pdf.zip')) {
|
||||
description = 'PDF files with text layer';
|
||||
} else if (jobResult.filename.endsWith('.txt.zip')) {
|
||||
description = 'Raw text files';
|
||||
} else if (jobResult.filename.endsWith('.vrt.zip')) {
|
||||
description = 'VRT compliant files including the NLP data';
|
||||
} else if (jobResult.filename.endsWith('.xml.zip')) {
|
||||
description = 'TEI compliant files';
|
||||
} else if (jobResult.filename.endsWith('.poco.zip')) {
|
||||
description = 'HOCR and image files for post correction (PoCo)';
|
||||
} else {
|
||||
description = 'All result files created during this job';
|
||||
}
|
||||
return {
|
||||
id: jobResult.id,
|
||||
creationDate: jobResult.creation_date,
|
||||
description: description,
|
||||
description: jobResult.description,
|
||||
filename: jobResult.filename
|
||||
};
|
||||
},
|
||||
|
@ -19,12 +19,12 @@
|
||||
'darken': '#a1b300',
|
||||
'lighten': '#f2f3e1'
|
||||
},
|
||||
'nlp': {
|
||||
'spacy-nlp': {
|
||||
'base': '#98acd2',
|
||||
'darken': '#0064a3',
|
||||
'lighten': '#e5e8f5'
|
||||
},
|
||||
'ocr': {
|
||||
'tesseract-ocr': {
|
||||
'base': '#a9d8c8',
|
||||
'darken': '#00a58b',
|
||||
'lighten': '#e7f4f1'
|
||||
|
@ -15,8 +15,8 @@
|
||||
<li><div class="divider"></div></li>
|
||||
<li><a class="subheader">Processes & Services</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="file-setup" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='file-setup') }}"><i class="nopaque-icons service-icon" data-service="file-setup"></i>File setup</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='ocr') }}"><i class="nopaque-icons service-icon" data-service="ocr"></i>OCR</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='nlp') }}"><i class="nopaque-icons service-icon" data-service="nlp"></i>NLP</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="tesseract-ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='tesseract-ocr') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr"></i>OCR</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="spacy-nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='spacy-nlp') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp"></i>NLP</a></li>
|
||||
<li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='corpus-analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li>
|
||||
<li><div class="divider"></div></li>
|
||||
<li><a class="subheader">Account</a></li>
|
||||
@ -28,6 +28,9 @@
|
||||
{% if current_user.can(Permission.ADMINISTRATE) %}
|
||||
<li><a href="{{ url_for('admin.index') }}"><i class="material-icons">admin_panel_settings</i>Administration</a></li>
|
||||
{% endif %}
|
||||
{% if current_user.can(Permission.CONTRIBUTE) %}
|
||||
<li><a href="{{ url_for('contribute.index') }}"><i class="material-icons">new_label</i>Contribute</a></li>
|
||||
{% endif %}
|
||||
{% if current_user.can(Permission.USE_API) %}
|
||||
<li><a href="{{ url_for('api.doc') }}"><i class="material-icons">api</i>API</a></li>
|
||||
{% endif %}
|
||||
|
@ -120,32 +120,32 @@
|
||||
</a>
|
||||
<br><br>
|
||||
<p class="service-color-text darken" data-service="file-setup"><b>File setup</b></p>
|
||||
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p>
|
||||
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.</p>
|
||||
<a href="{{ url_for('services.service', service='file-setup') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup">Create Job</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col s12 m4">
|
||||
<div class="card-panel center-align hoverable">
|
||||
<br>
|
||||
<a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="ocr" style="font-size: 2.5rem;"></i>
|
||||
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr" style="font-size: 2.5rem;"></i>
|
||||
</a>
|
||||
<br><br>
|
||||
<p class="service-color-text darken" data-service="ocr"><b>Optical Character Recognition</b></p>
|
||||
<p class="service-color-text darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
|
||||
<p class="light">nopaque converts your image data – like photos or scans – into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.</p>
|
||||
<a href="{{ url_for('services.service', service='ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="ocr">Create Job</a>
|
||||
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr">Create Job</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col s12 m4">
|
||||
<div class="card-panel center-align hoverable">
|
||||
<br>
|
||||
<a href="{{ url_for('services.service', service='nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="nlp" style="font-size: 2.5rem;"></i>
|
||||
<a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp" style="font-size: 2.5rem;"></i>
|
||||
</a>
|
||||
<br><br>
|
||||
<p class="service-color-text darken" data-service="nlp"><b>Natural Language Processing</b></p>
|
||||
<p class="service-color-text darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p>
|
||||
<p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p>
|
||||
<a href="{{ url_for('services.service', service='nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="nlp">Create Job</a>
|
||||
<a href="{{ url_for('services.service', service='spacy-nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="spacy-nlp">Create Job</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -84,11 +84,11 @@
|
||||
<p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p>
|
||||
</div>
|
||||
<div class="col s12 m6 l3 center-align">
|
||||
<a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i>
|
||||
<a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
|
||||
</a>
|
||||
<br><br>
|
||||
<p class="service-color-text text-darken" data-service="ocr"><b>Optical Character Recognition</b></p>
|
||||
<p class="service-color-text text-darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p>
|
||||
<p class="light">nopaque converts your image data – like photos or scans – into text data through OCR making it machine readable. This step enables you to proceed with further computational analysis of your documents.</p>
|
||||
</div>
|
||||
<div class="col s12 m6 l3 center-align">
|
||||
|
@ -2,7 +2,7 @@
|
||||
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
|
||||
{% import "materialize/wtf.html.j2" as wtf %}
|
||||
|
||||
{% block main_attribs %} class="service-scheme" data-service="nlp"{% endblock main_attribs %}
|
||||
{% block main_attribs %} class="service-scheme" data-service="spacy-nlp"{% endblock main_attribs %}
|
||||
|
||||
{% block page_content %}
|
||||
<div class="container">
|
||||
@ -16,13 +16,13 @@
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="nlp"></i>
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col s12 m9 pull-m3">
|
||||
<div class="card service-color-border border-darken" data-service="nlp" style="border-top: 10px solid;">
|
||||
<div class="card service-color-border border-darken" data-service="spacy-nlp" style="border-top: 10px solid;">
|
||||
<div class="card-content">
|
||||
<div class="row">
|
||||
<div class="col s12 m6">
|
||||
@ -71,7 +71,7 @@
|
||||
{{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }}
|
||||
</div>
|
||||
<div class="col s12 l4">
|
||||
{{ wtf.render_field(form.language, material_icon='language') }}
|
||||
{{ wtf.render_field(form.model, material_icon='language') }}
|
||||
</div>
|
||||
<div class="col s12 l3">
|
||||
{{ wtf.render_field(form.version, material_icon='apps') }}
|
||||
@ -80,13 +80,13 @@
|
||||
<span class="card-title">Preprocessing</span>
|
||||
</div>
|
||||
<div class="col s9">
|
||||
<p>{{ form.check_encoding.label.text }}</p>
|
||||
<p>{{ form.encoding_detection.label.text }}</p>
|
||||
<p class="light">If the input files are not created with the nopaque OCR service or you do not know if your text files are UTF-8 encoded, check this switch. We will try to automatically determine the right encoding for your texts to process them.</p>
|
||||
</div>
|
||||
<div class="col s3 right-align">
|
||||
<div class="switch">
|
||||
<label>
|
||||
{{ form.check_encoding() }}
|
||||
{{ form.encoding_detection() }}
|
||||
<span class="lever"></span>
|
||||
</label>
|
||||
</div>
|
@ -2,7 +2,7 @@
|
||||
{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %}
|
||||
{% import "materialize/wtf.html.j2" as wtf %}
|
||||
|
||||
{% block main_attribs %} class="service-scheme" data-service="ocr"{% endblock main_attribs %}
|
||||
{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr"{% endblock main_attribs %}
|
||||
|
||||
{% block page_content %}
|
||||
<div class="container">
|
||||
@ -16,13 +16,13 @@
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<p class="hide-on-small-only"> </p>
|
||||
<a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light">
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i>
|
||||
<i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col s12 m9 pull-m3">
|
||||
<div class="card service-color-border border-darken" data-service="ocr" style="border-top: 10px solid;">
|
||||
<div class="card service-color-border border-darken" data-service="tesseract-ocr" style="border-top: 10px solid;">
|
||||
<div class="card-content">
|
||||
<div class="row">
|
||||
<div class="col s12">
|
||||
@ -50,10 +50,10 @@
|
||||
{{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
|
||||
</div>
|
||||
<div class="col s12 l5">
|
||||
{{ wtf.render_field(form.files, accept='application/pdf', color=ocr_color_darken, placeholder='Choose your .pdf files') }}
|
||||
{{ wtf.render_field(form.files, accept='application/pdf', placeholder='Choose your .pdf files') }}
|
||||
</div>
|
||||
<div class="col s12 l4">
|
||||
{{ wtf.render_field(form.language, material_icon='language') }}
|
||||
{{ wtf.render_field(form.model, material_icon='language') }}
|
||||
</div>
|
||||
<div class="col s12 l3">
|
||||
{{ wtf.render_field(form.version, material_icon='apps') }}
|
||||
@ -127,7 +127,7 @@
|
||||
</div>
|
||||
</div>
|
||||
<div class="card-action right-align">
|
||||
{{ wtf.render_field(form.submit, color=ocr_color_darken, material_icon='send') }}
|
||||
{{ wtf.render_field(form.submit, material_icon='send') }}
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
10
app/utils.py
10
app/utils.py
@ -1,10 +0,0 @@
|
||||
from app import hashids
|
||||
from werkzeug.routing import BaseConverter
|
||||
|
||||
|
||||
class HashidConverter(BaseConverter):
|
||||
def to_python(self, value: str) -> int:
|
||||
return hashids.decode(value)[0]
|
||||
|
||||
def to_url(self, value: int) -> str:
|
||||
return hashids.encode(value)
|
@ -5,14 +5,14 @@
|
||||
version: "3.5"
|
||||
|
||||
networks:
|
||||
reverse-proxy:
|
||||
external:
|
||||
name: reverse-proxy
|
||||
traefik:
|
||||
external: true
|
||||
name: "traefik"
|
||||
|
||||
services:
|
||||
nopaque:
|
||||
labels:
|
||||
- "traefik.docker.network=reverse-proxy"
|
||||
- "traefik.docker.network=traefik"
|
||||
- "traefik.enable=true"
|
||||
### <http> ###
|
||||
- "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http"
|
||||
|
45
migrations/versions/ad0d835fe5b1_.py
Normal file
45
migrations/versions/ad0d835fe5b1_.py
Normal file
@ -0,0 +1,45 @@
|
||||
"""empty message
|
||||
|
||||
Revision ID: ad0d835fe5b1
|
||||
Revises: 68ed092ffe5e
|
||||
Create Date: 2022-01-18 16:23:45.673993
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = 'ad0d835fe5b1'
|
||||
down_revision = '68ed092ffe5e'
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade():
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.create_table('tesseract_ocr_models',
|
||||
sa.Column('creation_date', sa.DateTime(), nullable=True),
|
||||
sa.Column('filename', sa.String(length=255), nullable=True),
|
||||
sa.Column('last_edited_date', sa.DateTime(), nullable=True),
|
||||
sa.Column('mimetype', sa.String(length=255), nullable=True),
|
||||
sa.Column('id', sa.Integer(), nullable=False),
|
||||
sa.Column('user_id', sa.Integer(), nullable=True),
|
||||
sa.Column('compatible_service_versions', sa.String(length=255), nullable=True),
|
||||
sa.Column('description', sa.String(length=255), nullable=True),
|
||||
sa.Column('publisher', sa.String(length=128), nullable=True),
|
||||
sa.Column('publishing_year', sa.Integer(), nullable=True),
|
||||
sa.Column('title', sa.String(length=64), nullable=True),
|
||||
sa.Column('version', sa.String(length=16), nullable=True),
|
||||
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
|
||||
sa.PrimaryKeyConstraint('id')
|
||||
)
|
||||
op.add_column('job_results', sa.Column('description', sa.String(length=255), nullable=True))
|
||||
# ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade():
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.drop_column('job_results', 'description')
|
||||
op.drop_table('tesseract_ocr_models')
|
||||
# ### end Alembic commands ###
|
@ -3,10 +3,9 @@
|
||||
import eventlet
|
||||
eventlet.monkey_patch()
|
||||
|
||||
|
||||
from app import db, cli, create_app # noqa
|
||||
from app.models import (Corpus, CorpusFile, Job, JobInput, JobResult,
|
||||
Permission, QueryResult, Role, User) # noqa
|
||||
Permission, QueryResult, Role, TesseractOCRModel, User) # noqa
|
||||
from app import db, cli, create_app # noqa
|
||||
from flask import Flask # noqa
|
||||
from typing import Any, Dict # noqa
|
||||
|
||||
@ -34,5 +33,6 @@ def make_shell_context() -> Dict[str, Any]:
|
||||
'Permission': Permission,
|
||||
'QueryResult': QueryResult,
|
||||
'Role': Role,
|
||||
'TesseractOCRModel': TesseractOCRModel,
|
||||
'User': User
|
||||
}
|
||||
|
@ -19,5 +19,7 @@ hiredis
|
||||
jsonschema
|
||||
psycopg2
|
||||
python-dotenv
|
||||
pyyaml
|
||||
redis
|
||||
tqdm
|
||||
wtforms[email]
|
||||
|
Loading…
Reference in New Issue
Block a user