From fe938c0ca2ab77a76619f322845b11f0137c2a0f Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Thu, 3 Feb 2022 12:39:16 +0100 Subject: [PATCH] Big update, corpus analysis reworked, versioned services, preliminary work for contributions --- app/TesseractOCRModel.defaults.yml | 816 ++++++++++++++++++ app/__init__.py | 6 +- app/api/__init__.py | 2 - app/api/auth.py | 8 +- app/api/jobs.py | 48 -- app/auth/routes.py | 37 +- app/cli.py | 88 +- app/contribute/__init__.py | 5 + app/contribute/routes.py | 19 + app/corpora/cqi_over_socketio/__init__.py | 6 +- app/corpora/cqi_over_socketio/utils.py | 5 +- app/corpora/routes.py | 43 +- app/daemon/__init__.py | 10 +- app/daemon/corpus_utils.py | 128 +-- app/daemon/job_utils.py | 105 ++- app/jobs/routes.py | 24 +- app/models.py | 174 +++- app/services/__init__.py | 78 +- app/services/forms.py | 103 ++- app/services/routes.py | 82 +- app/services/services.yml | 38 + app/static/css/nopaque.css | 4 +- .../js/CorpusAnalysis/CorpusAnalysisApp.js | 2 +- app/static/js/RessourceDisplays/JobDisplay.js | 5 +- app/static/js/RessourceLists/JobResultList.js | 17 +- app/templates/_colors.html.j2 | 4 +- app/templates/_sidenav.html.j2 | 7 +- app/templates/main/dashboard.html.j2 | 18 +- app/templates/main/index.html.j2 | 6 +- .../{nlp.html.j2 => spacy_nlp.html.j2} | 12 +- .../{ocr.html.j2 => tesseract_ocr.html.j2} | 12 +- app/utils.py | 10 - docker-compose.traefik.yml | 8 +- migrations/versions/ad0d835fe5b1_.py | 45 + nopaque.py | 6 +- requirements.txt | 2 + 36 files changed, 1552 insertions(+), 431 deletions(-) create mode 100644 app/TesseractOCRModel.defaults.yml delete mode 100644 app/api/jobs.py create mode 100644 app/contribute/__init__.py create mode 100644 app/contribute/routes.py create mode 100644 app/services/services.yml rename app/templates/services/{nlp.html.j2 => spacy_nlp.html.j2} (93%) rename app/templates/services/{ocr.html.j2 => tesseract_ocr.html.j2} (93%) delete mode 100644 app/utils.py create mode 100644 
migrations/versions/ad0d835fe5b1_.py diff --git a/app/TesseractOCRModel.defaults.yml b/app/TesseractOCRModel.defaults.yml new file mode 100644 index 00000000..37929e89 --- /dev/null +++ b/app/TesseractOCRModel.defaults.yml @@ -0,0 +1,816 @@ +# - title: 'Afrikaans' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/afr.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Amharic' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Arabic' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Assamese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Azerbaijani' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Azerbaijani - Cyrillic' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Belarusian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Bengali' +# description: '' +# url: 
'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Tibetan' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Bosnian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Bulgarian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Catalan; Valencian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Cebuano' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Czech' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Chinese - Simplified' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Chinese - Traditional' + description: '' + url: 
'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Cherokee' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Welsh' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Danish' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'German' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Dzongkha' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Greek, Modern (1453-)' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'English' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'English, Middle (1100-1500)' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata' + publisher: 
'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Esperanto' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Estonian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Basque' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Persian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Finnish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'French' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'German Fraktur' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'French, Middle (ca. 
1400-1600)' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Irish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Galician' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Greek, Ancient (-1453)' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Gujarati' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Haitian; Haitian Creole' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Hebrew' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Hindi' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Croatian' +# description: '' +# url: 
'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Hungarian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Inuktitut' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Indonesian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Icelandic' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Italian' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'Italian - Old' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Javanese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Japanese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata' +# publisher: 
'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Kannada' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Georgian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Georgian - Old' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Kazakh' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Central Khmer' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Kirghiz; Kyrgyz' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Korean' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Kurdish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# 
compatible_service_versions: +# - '0.1.0' +# - title: 'Lao' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Latin' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Latvian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Lithuanian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Malayalam' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Marathi' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Macedonian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Maltese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Malay' +# description: '' +# 
url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Burmese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Nepali' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Dutch; Flemish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Norwegian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Oriya' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Panjabi; Punjabi' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Polish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Portuguese' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata' 
+ publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Pushto; Pashto' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Romanian; Moldavian; Moldovan' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Russian' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Sanskrit' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Sinhala; Sinhalese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Slovak' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Slovenian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Spanish; Castilian' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 
+ version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'Spanish; Castilian - Old' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Albanian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Serbian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Serbian - Latin' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Swahili' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Swedish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Syriac' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Tamil' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# 
- title: 'Telugu' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Tajik' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Tagalog' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Thai' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Tigrinya' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Turkish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Uighur; Uyghur' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Ukrainian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Urdu' +# description: '' +# url: 
'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Uzbek' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Uzbek - Cyrillic' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Vietnamese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Yiddish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' diff --git a/app/__init__.py b/app/__init__.py index 37b0961f..5c4052d2 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -39,9 +39,6 @@ def create_app(config: Config = Config) -> Flask: socketio.init_app( app, message_queue=app.config['NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI']) - # from .utils import HashidConverter - # app.url_map.converters['hashid'] = HashidConverter - from .events import socketio as socketio_events from .events import sqlalchemy as sqlalchemy_events @@ -54,6 +51,9 @@ def create_app(config: Config = Config) -> Flask: from .auth import bp as auth_blueprint app.register_blueprint(auth_blueprint, url_prefix='/auth') + from .contribute import bp as contribute_blueprint + app.register_blueprint(contribute_blueprint, url_prefix='/contribute') + from .corpora import bp as corpora_blueprint 
app.register_blueprint(corpora_blueprint, url_prefix='/corpora') diff --git a/app/api/__init__.py b/app/api/__init__.py index f47235ea..e7674c87 100644 --- a/app/api/__init__.py +++ b/app/api/__init__.py @@ -1,7 +1,6 @@ from flask import Blueprint from flask_restx import Api -from .jobs import ns as jobs_ns from .tokens import ns as tokens_ns bp = Blueprint('api', __name__) @@ -23,5 +22,4 @@ api = Api( version='1.0' ) -api.add_namespace(jobs_ns) api.add_namespace(tokens_ns) diff --git a/app/api/auth.py b/app/api/auth.py index 24e862ea..fea4123b 100644 --- a/app/api/auth.py +++ b/app/api/auth.py @@ -9,8 +9,12 @@ token_auth = HTTPTokenAuth() @basic_auth.verify_password def verify_password(email_or_username, password): - user = User.query.filter(or_(User.username == email_or_username, - User.email == email_or_username.lower())).first() + user = User.query.filter( + or_( + User.username == email_or_username, + User.email == email_or_username.lower() + ) + ).first() if user and user.verify_password(password): return user diff --git a/app/api/jobs.py b/app/api/jobs.py deleted file mode 100644 index 153d5060..00000000 --- a/app/api/jobs.py +++ /dev/null @@ -1,48 +0,0 @@ -from flask_restx import Namespace, Resource -from .auth import token_auth -from ..jobs import tasks -from ..models import Job - - -ns = Namespace('jobs', description='Job operations') - - -@ns.route('') -class API_Jobs(Resource): - '''Shows a list of all jobs and lets you POST to add new job''' - - @ns.doc(security='apiKey') - @token_auth.login_required - def get(self): - '''List all jobs''' - # TODO: Implement the correct get_jobs functionality - jobs = Job.query.all() - return [job.to_dict(include_relationships=False) for job in jobs] - - @ns.doc(security='apiKey') - @token_auth.login_required - def post(self): - '''Create a new job''' - # TODO: Implement this - pass - - -@ns.route('/') -class API_Job(Resource): - '''Show a single job and lets you delete it''' - - @ns.doc(security='apiKey') - 
@token_auth.login_required - def get(self, id): - '''Get a job by id''' - job = Job.query.get_or_404(id) - return job.to_dict(include_relationships=False) - - @ns.doc(security='apiKey') - @token_auth.login_required - def delete(self, id): - '''Delete a job by id''' - job = Job.query.get_or_404(id) - # We use this imported task because it will run in the background - tasks.delete_job(job.id) - return '', 204 diff --git a/app/auth/routes.py b/app/auth/routes.py index 2cda4bc2..35842251 100644 --- a/app/auth/routes.py +++ b/app/auth/routes.py @@ -60,28 +60,37 @@ def register(): return redirect(url_for('main.dashboard')) form = RegistrationForm(prefix='registration-form') if form.validate_on_submit(): - user = User(email=form.email.data.lower(), - password=form.password.data, - username=form.username.data) + user = User( + email=form.email.data.lower(), + password=form.password.data, + username=form.username.data + ) db.session.add(user) - db.session.commit() + db.session.flush(objects=[user]) + db.session.refresh(user) try: - os.makedirs(user.path) - except OSError: - current_app.logger.error( - f'Make dir {user.path} led to an OSError!') - db.session.delete(user) - db.session.commit() + user.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() abort(500) else: token = user.generate_confirmation_token() - msg = create_message(user.email, 'Confirm Your Account', - 'auth/email/confirm', token=token, user=user) + msg = create_message( + user.email, + 'Confirm Your Account', + 'auth/email/confirm', + token=token, + user=user + ) send(msg) flash('A confirmation email has been sent to you by email.') return redirect(url_for('.login')) - return render_template('auth/register.html.j2', form=form, - title='Register') + return render_template( + 'auth/register.html.j2', + form=form, + title='Register' + ) @bp.route('/confirm/') diff --git a/app/cli.py b/app/cli.py index d885ff12..e588eef9 100644 --- a/app/cli.py +++ b/app/cli.py @@ -1,16 +1,44 
@@ -from . import db -from .models import Corpus, Role +from flask import current_app from flask_migrate import upgrade +from . import db +from .models import Corpus, Job, Role, User, TesseractOCRModel +import json +import os +import re + + +def _make_default_dirs(): + base_dir = current_app.config['NOPAQUE_DATA_DIR'] + + default_directories = [ + os.path.join(base_dir, 'tmp'), + os.path.join(base_dir, 'users') + ] + for directory in default_directories: + if os.path.exists(directory): + if not os.path.isdir(directory): + raise NotADirectoryError(f'{directory} is not a directory') + else: + os.mkdir(directory) def register(app): @app.cli.command() def deploy(): ''' Run deployment tasks. ''' + # Make default directories + _make_default_dirs() + # migrate database to latest revision upgrade() - # create or update user roles - Role.insert_roles() + + # Insert/Update default database values + current_app.logger.info('Insert/Update default roles') + Role.insert_defaults() + current_app.logger.info('Insert/Update default users') + User.insert_defaults() + current_app.logger.info('Insert/Update default tesseract ocr models') + TesseractOCRModel.insert_defaults() @app.cli.group() def daemon(): @@ -40,3 +68,55 @@ def register(app): from unittest.suite import TestSuite tests: TestSuite = TestLoader().discover('tests') TextTestRunner(verbosity=2).run(tests) + + @app.cli.group() + def convert(): + ''' Database convert commands. 
''' + + @convert.command() + def nlp_jobs(): + for job in Job.query.filter_by(service='nlp').all(): + job.service = 'spacy-nlp' + service_args = json.loads(job.service_args) + new_service_args = {} + for service_arg in service_args: + if service_arg == '--check-encoding': + new_service_args['encoding_detection'] = True + elif re.match(r'-l ([a-z]{2})', service_arg): + language_code = re.search(r'-l ([a-z]{2})', service_arg).group(1) # noqa + new_service_args['language'] = language_code + job.service_args = json.dumps(new_service_args) + db.session.commit() + + @convert.command() + def ocr_jobs(): + # Language code to TesseractOCRModel.title lookup + language_code_lookup = { + 'ara': 'Arabic', + 'chi_tra': 'Chinese - Traditional', + 'dan': 'Danish', + 'eng': 'English', + 'enm': 'English, Middle (1100-1500)', + 'fra': 'French', + 'frm': 'French, Middle (ca. 1400-1600)', + 'deu': 'German', + 'frk': 'German Fraktur', + 'ell': 'Greek, Modern (1453-)', + 'ita': 'Italian', + 'por': 'Portuguese', + 'rus': 'Russian', + 'spa': 'Spanish; Castilian' + } + for job in Job.query.filter_by(service='ocr').all(): + job.service = 'tesseract-ocr' + service_args = json.loads(job.service_args) + new_service_args = {} + for service_arg in service_args: + if service_arg == '--binarize': + new_service_args['binarization'] = True + elif re.match(r'-l ([a-z]{3})', service_arg): + language_code = re.search(r'-l ([a-z]{3})', service_arg).group(1) # noqa + tesseract_ocr_model = TesseractOCRModel.query.filter_by(title=language_code_lookup[language_code]).first() # noqa + new_service_args['model'] = tesseract_ocr_model.id + job.service_args = json.dumps(new_service_args) + db.session.commit() diff --git a/app/contribute/__init__.py b/app/contribute/__init__.py new file mode 100644 index 00000000..15d172ec --- /dev/null +++ b/app/contribute/__init__.py @@ -0,0 +1,5 @@ +from flask import Blueprint + + +bp = Blueprint('contribute', __name__) +from . 
import routes diff --git a/app/contribute/routes.py b/app/contribute/routes.py new file mode 100644 index 00000000..e0b43231 --- /dev/null +++ b/app/contribute/routes.py @@ -0,0 +1,19 @@ +from flask import flash, redirect, render_template, url_for +from flask_login import login_required +from . import bp +from .. import db +from ..decorators import permission_required +from ..models import Permission, Role, User +from ..settings import tasks as settings_tasks + + +@bp.before_request +@login_required +@permission_required(Permission.CONTRIBUTE) +def before_request(): + pass + + +@bp.route('/') +def index(): + pass diff --git a/app/corpora/cqi_over_socketio/__init__.py b/app/corpora/cqi_over_socketio/__init__.py index 14031c4a..3a358758 100644 --- a/app/corpora/cqi_over_socketio/__init__.py +++ b/app/corpora/cqi_over_socketio/__init__.py @@ -93,12 +93,12 @@ def connect(auth): @socketio.on('disconnect', namespace=NAMESPACE) def disconnect(): + if 'd' not in session: + return session['d']['cqi_client_lock'].acquire() try: session['d']['cqi_client'].disconnect() - except cqi.errors.CQiException: - pass - except BrokenPipeError: + except (BrokenPipeError, cqi.errors.CQiException): pass session['d']['cqi_client_lock'].release() corpus = Corpus.query.get(session['d']['corpus_id']) diff --git a/app/corpora/cqi_over_socketio/utils.py b/app/corpora/cqi_over_socketio/utils.py index 7cbe07b9..9763548a 100644 --- a/app/corpora/cqi_over_socketio/utils.py +++ b/app/corpora/cqi_over_socketio/utils.py @@ -12,7 +12,10 @@ def cqi_over_socketio(f): f_args = {} # Check for missing args and if all provided args are of the right type for param in signature(f).parameters.values(): - if param.annotation == cqi.CQiClient: + if param.name == 'corpus_name': + f_args[param.name] = f'NOPAQUE_{session["d"]["corpus_id"]}' + continue + if param.name == 'cqi_client': f_args[param.name] = session['d']['cqi_client'] continue if param.default is param.empty: diff --git a/app/corpora/routes.py 
b/app/corpora/routes.py index 1086c298..f6d95b54 100644 --- a/app/corpora/routes.py +++ b/app/corpora/routes.py @@ -1,6 +1,7 @@ from flask import (abort, current_app, flash, make_response, redirect, render_template, url_for, send_from_directory) from flask_login import current_user, login_required +from werkzeug.utils import secure_filename from . import bp from . import tasks from .forms import (AddCorpusFileForm, AddCorpusForm, EditCorpusFileForm, @@ -29,18 +30,20 @@ def add_corpus(): db.session.flush() db.session.refresh(corpus) try: - os.makedirs(corpus.path) + corpus.makedirs() except OSError as e: - current_app.logger.error(f'Could not add corpus: {e}') + current_app.logger.error(e) db.session.rollback() flash('Internal Server Error', 'error') abort(500) - else: - db.session.commit() - flash(f'Corpus "{corpus.title}" added!', 'corpus') - return redirect(url_for('.corpus', corpus_id=corpus.id)) - return render_template('corpora/add_corpus.html.j2', form=form, - title='Add corpus') + db.session.commit() + flash(f'Corpus "{corpus.title}" added', 'corpus') + return redirect(url_for('.corpus', corpus_id=corpus.id)) + return render_template( + 'corpora/add_corpus.html.j2', + form=form, + title='Add corpus' + ) @bp.route('/import', methods=['GET', 'POST']) @@ -174,7 +177,7 @@ def add_corpus_file(corpus_id): if not form.validate(): return make_response(form.errors, 400) # Save the file - form.file.data.save(os.path.join(corpus.path, form.file.data.filename)) + filename = secure_filename(form.file.data.filename) corpus_file = CorpusFile( address=form.address.data, author=form.author.data, @@ -182,9 +185,10 @@ def add_corpus_file(corpus_id): chapter=form.chapter.data, corpus=corpus, editor=form.editor.data, - filename=form.file.data.filename, + filename=filename, institution=form.institution.data, journal=form.journal.data, + mimetype='application/vrt+xml', pages=form.pages.data, publisher=form.publisher.data, publishing_year=form.publishing_year.data, @@ -192,12 
+196,25 @@ def add_corpus_file(corpus_id): title=form.title.data ) db.session.add(corpus_file) + db.session.flush(objects=[corpus_file]) + db.session.refresh(corpus_file) + try: + form.file.data.save(corpus_file.path) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.add_corpus_file', corpus_id=corpus.id)}, 500) # noqa corpus.status = 'unprepared' db.session.commit() - flash(f'Corpus file "{corpus_file.filename}" added!', 'corpus') + flash(f'Corpus file "{corpus_file.title}" added!', 'corpus') return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201) # noqa - return render_template('corpora/add_corpus_file.html.j2', corpus=corpus, - form=form, title='Add corpus file') + return render_template( + 'corpora/add_corpus_file.html.j2', + corpus=corpus, + form=form, + title='Add corpus file' + ) @bp.route('//files//delete') diff --git a/app/daemon/__init__.py b/app/daemon/__init__.py index 00977456..84ed0efe 100644 --- a/app/daemon/__init__.py +++ b/app/daemon/__init__.py @@ -17,11 +17,7 @@ class Daemon(CheckCorporaMixin, CheckJobsMixin): def run(self): while True: - try: - self.check_corpora() - self.check_jobs() - db.session.commit() - except Exception as e: - current_app.logger.warning(e) - pass + self.check_corpora() + self.check_jobs() + db.session.commit() sleep(1.5) diff --git a/app/daemon/corpus_utils.py b/app/daemon/corpus_utils.py index 31cad929..3962582e 100644 --- a/app/daemon/corpus_utils.py +++ b/app/daemon/corpus_utils.py @@ -26,37 +26,55 @@ class CheckCorporaMixin: def create_build_corpus_service(self, corpus): ''' # Docker service settings # ''' ''' ## Command ## ''' - command = 'docker-entrypoint.sh build-corpus' + command = ['bash', '-c'] + command.append( + f'mkdir /corpora/data/nopaque_{corpus.id}' + ' && ' + 'cwb-encode' + ' -c utf8' + f' -d /corpora/data/nopaque_{corpus.id}' + ' -f /root/files/corpus.vrt' 
+ f' -R /usr/local/share/cwb/registry/nopaque_{corpus.id}' + ' -P pos -P lemma -P simple_pos' + ' -S ent:0+type -S s:0' + ' -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title' # noqa + ' -xsB -9' + ' && ' + f'cwb-make -V NOPAQUE_{corpus.id}' + ) ''' ## Constraints ## ''' constraints = ['node.role==worker'] ''' ## Image ## ''' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674' # noqa + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702' ''' ## Labels ## ''' labels = { 'origin': current_app.config['SERVER_NAME'], - 'type': 'build-corpus', + 'type': 'corpus.build', 'corpus_id': str(corpus.id) } ''' ## Mounts ## ''' - ''' ### Corpus file mount ### ''' - corpus_file_source = os.path.join(corpus.path, 'merged', 'corpus.vrt') - corpus_file_target = '/root/files/corpus.vrt' - corpus_file_mount = f'{corpus_file_source}:{corpus_file_target}:ro' - ''' ### Corpus data mount ### ''' - corpus_data_source = os.path.join(corpus.path, 'data') - corpus_data_target = '/corpora/data' - corpus_data_mount = f'{corpus_data_source}:{corpus_data_target}:rw' - # Make sure that their is no data in the corpus data directory - shutil.rmtree(corpus_data_source, ignore_errors=True) - os.mkdir(corpus_data_source) - ''' ### Corpus registry mount ### ''' - corpus_registry_source = os.path.join(corpus.path, 'registry') - corpus_registry_target = '/usr/local/share/cwb/registry' - corpus_registry_mount = f'{corpus_registry_source}:{corpus_registry_target}:rw' # noqa - # Make sure that their is no data in the corpus registry directory - shutil.rmtree(corpus_registry_source, ignore_errors=True) - os.mkdir(corpus_registry_source) - mounts = [corpus_file_mount, corpus_data_mount, corpus_registry_mount] + mounts = [] + ''' ### Data mount ### ''' + data_mount_source = os.path.join(corpus.path, 'cwb', 'data') + data_mount_target = '/corpora/data' + data_mount = 
f'{data_mount_source}:{data_mount_target}:rw' + # Make sure that there is no data in the data directory + shutil.rmtree(data_mount_source, ignore_errors=True) + os.makedirs(data_mount_source) + mounts.append(data_mount) + ''' ### File mount ### ''' + file_mount_source = os.path.join(corpus.path, 'cwb', 'corpus.vrt') + file_mount_target = '/root/files/corpus.vrt' + file_mount = f'{file_mount_source}:{file_mount_target}:ro' + mounts.append(file_mount) + ''' ### Registry mount ### ''' + registry_mount_source = os.path.join(corpus.path, 'cwb', 'registry') + registry_mount_target = '/usr/local/share/cwb/registry' + registry_mount = f'{registry_mount_source}:{registry_mount_target}:rw' + # Make sure that there is no data in the registry directory + shutil.rmtree(registry_mount_source, ignore_errors=True) + os.makedirs(registry_mount_source) + mounts.append(registry_mount) ''' ## Name ## ''' name = f'build-corpus_{corpus.id}' ''' ## Restart policy ## ''' @@ -74,7 +92,7 @@ class CheckCorporaMixin: except docker.errors.APIError as e: current_app.logger.error( f'Create service "{name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return corpus.status = 'queued' @@ -86,14 +104,14 @@ class CheckCorporaMixin: except docker.errors.NotFound as e: current_app.logger.error( f'Get service "{service_name}" failed ' - + f'due to "docker.errors.NotFound": {e}' + f'due to "docker.errors.NotFound": {e}' ) corpus.status = 'failed' return except docker.errors.APIError as e: current_app.logger.error( f'Get service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) service_tasks = service.tasks() if not service_tasks: @@ -108,36 +126,47 @@ class CheckCorporaMixin: corpus.status = 'failed' else: return - try: - service.remove() - except docker.errors.APIError as e: - current_app.logger.error( - f'Remove service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' 
- ) + # try: + # service.remove() + # except docker.errors.APIError as e: + # current_app.logger.error( + # f'Remove service "{service_name}" failed ' + # f'due to "docker.errors.APIError": {e}' + # ) def create_cqpserver_container(self, corpus): ''' # Docker container settings # ''' ''' ## Command ## ''' - command = 'cqpserver' + command = [] + command.append( + 'echo "host *;" > cqpserver.init' + ' && ' + 'echo "user anonymous \\"\\";" >> cqpserver.init' + ' && ' + 'cqpserver -I cqpserver.init' + ) ''' ## Detach ## ''' detach = True + ''' ## Entrypoint ## ''' + entrypoint = ['bash', '-c'] ''' ## Image ## ''' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674' # noqa + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702' ''' ## Name ## ''' name = f'cqpserver_{corpus.id}' ''' ## Network ## ''' network = 'nopaque_default' ''' ## Volumes ## ''' + volumes = [] ''' ### Corpus data volume ### ''' - corpus_data_source = os.path.join(corpus.path, 'data') - corpus_data_target = '/corpora/data' - corpus_data_volume = f'{corpus_data_source}:{corpus_data_target}:rw' + data_volume_source = os.path.join(corpus.path, 'cwb', 'data') + data_volume_target = '/corpora/data' + data_volume = f'{data_volume_source}:{data_volume_target}:rw' + volumes.append(data_volume) ''' ### Corpus registry volume ### ''' - corpus_registry_source = os.path.join(corpus.path, 'registry') - corpus_registry_target = '/usr/local/share/cwb/registry' - corpus_registry_volume = f'{corpus_registry_source}:{corpus_registry_target}:rw' # noqa - volumes = [corpus_data_volume, corpus_registry_volume] + registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry') + registry_volume_target = '/usr/local/share/cwb/registry' + registry_volume = f'{registry_volume_source}:{registry_volume_target}:rw' # noqa + volumes.append(registry_volume) # Check if a cqpserver container already exists. 
If this is the case, # remove it and create a new one try: @@ -147,7 +176,7 @@ class CheckCorporaMixin: except docker.errors.APIError as e: current_app.logger.error( f'Get container "{name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return else: @@ -156,7 +185,7 @@ class CheckCorporaMixin: except docker.errors.APIError as e: current_app.logger.error( f'Remove container "{name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return try: @@ -164,6 +193,7 @@ class CheckCorporaMixin: image, command=command, detach=detach, + entrypoint=entrypoint, volumes=volumes, name=name, network=network @@ -171,14 +201,14 @@ class CheckCorporaMixin: except docker.errors.ImageNotFound as e: current_app.logger.error( f'Run container "{name}" failed ' - + f'due to "docker.errors.ImageNotFound" error: {e}' + f'due to "docker.errors.ImageNotFound" error: {e}' ) corpus.status = 'failed' return except docker.errors.APIError as e: current_app.logger.error( f'Run container "{name}" failed ' - + f'due to "docker.errors.APIError" error: {e}' + f'due to "docker.errors.APIError" error: {e}' ) return corpus.status = 'analysing' @@ -190,14 +220,14 @@ class CheckCorporaMixin: except docker.errors.NotFound as e: current_app.logger.error( f'Get container "{container_name}" failed ' - + f'due to "docker.errors.NotFound": {e}' + f'due to "docker.errors.NotFound": {e}' ) corpus.num_analysis_sessions = 0 corpus.status = 'prepared' except docker.errors.APIError as e: current_app.logger.error( f'Get container "{container_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) def remove_cqpserver_container(self, corpus): @@ -210,7 +240,7 @@ class CheckCorporaMixin: except docker.errors.APIError as e: current_app.logger.error( f'Get container "{container_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) 
return try: @@ -218,5 +248,5 @@ class CheckCorporaMixin: except docker.errors.APIError as e: current_app.logger.error( f'Remove container "{container_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) diff --git a/app/daemon/job_utils.py b/app/daemon/job_utils.py index 78bae839..c640f35c 100644 --- a/app/daemon/job_utils.py +++ b/app/daemon/job_utils.py @@ -2,7 +2,7 @@ from datetime import datetime from flask import current_app from werkzeug.utils import secure_filename from .. import db -from ..models import Job, JobResult +from ..models import Job, JobResult, TesseractOCRModel import docker import json import os @@ -23,27 +23,34 @@ class CheckJobsMixin: ''' # Docker service settings # ''' ''' ## Service specific settings ## ''' if job.service == 'file-setup': - mem_mb = 2048 + mem_mb = 512 n_cores = 2 executable = 'file-setup' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:{job.service_version}' # noqa - elif job.service == 'ocr': - mem_mb = 4096 + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:v{job.service_version}' # noqa + elif job.service == 'tesseract-ocr': + mem_mb = 2048 n_cores = 4 executable = 'ocr' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:{job.service_version}' # noqa - elif job.service == 'nlp': - mem_mb = 2048 - n_cores = 2 + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:v{job.service_version}' # noqa + elif job.service == 'spacy-nlp': + mem_mb = 1024 + n_cores = 1 executable = 'nlp' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:{job.service_version}' # noqa + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:v{job.service_version}' # noqa ''' ## Command ## ''' command = f'{executable} -i /input -o /output' - command += ' --log-dir /input' + command += ' --log-dir /logs' command += f' --mem-mb {mem_mb}' command += f' --n-cores {n_cores}' - command += f' --zip 
[{job.service}]_{secure_filename(job.title)}' - command += ' ' + ' '.join(json.loads(job.service_args)) + service_args = json.loads(job.service_args) + if job.service == 'spacy-nlp': + command += f' -m {service_args["model"]}' + if 'encoding_detection' in service_args and service_args['encoding_detection']: # noqa + command += ' --check-encoding' + elif job.service == 'tesseract-ocr': + command += f' -m {service_args["model"]}' + if 'binarization' in service_args and service_args['binarization']: + command += ' --binarize' ''' ## Constraints ## ''' constraints = ['node.role==worker'] ''' ## Labels ## ''' @@ -53,20 +60,42 @@ class CheckJobsMixin: 'job_id': str(job.id) } ''' ## Mounts ## ''' - ''' ### Input mount ### ''' - input_mount_source = job.path - input_mount_target = '/input' + mounts = [] + ''' ### Input mount(s) ### ''' + input_mount_target_base = '/input' if job.service == 'file-setup': - input_mount_target += f'/{secure_filename(job.title)}' - input_mount = f'{input_mount_source}:{input_mount_target}:rw' + input_mount_target_base += f'/{secure_filename(job.title)}' + for job_input in job.inputs: + input_mount_source = job_input.path + input_mount_target = f'/{input_mount_target_base}/{job_input.filename}' # noqa + input_mount = f'{input_mount_source}:{input_mount_target}:ro' + mounts.append(input_mount) + if job.service == 'tesseract-ocr': + service_args = json.loads(job.service_args) + model = TesseractOCRModel.query.get(service_args['model']) + if model is None: + job.status = 'failed' + return + models_mount_source = model.path + models_mount_target = f'/usr/local/share/tessdata/{model.filename}' + models_mount = f'{models_mount_source}:{models_mount_target}:ro' + mounts.append(models_mount) ''' ### Output mount ### ''' - output_mount_source = os.path.join(job.path, 'output') + output_mount_source = os.path.join(job.path, 'results') output_mount_target = '/output' output_mount = f'{output_mount_source}:{output_mount_target}:rw' # Make sure that their 
is no data in the output directory shutil.rmtree(output_mount_source, ignore_errors=True) os.makedirs(output_mount_source) - mounts = [input_mount, output_mount] + mounts.append(output_mount) + ''' ### Pipeline data mount ### ''' + pyflow_data_mount_source = os.path.join(job.path, 'pipeline_data') + pyflow_data_mount_target = '/logs/pyflow.data' + pyflow_data_mount = f'{pyflow_data_mount_source}:{pyflow_data_mount_target}:rw' # noqa + # Make sure that there is no data in the pipeline data directory + shutil.rmtree(pyflow_data_mount_source, ignore_errors=True) + os.makedirs(pyflow_data_mount_source) + mounts.append(pyflow_data_mount) ''' ## Name ## ''' name = f'job_{job.id}' ''' ## Resources ## ''' @@ -90,7 +119,7 @@ class CheckJobsMixin: except docker.errors.APIError as e: current_app.logger.error( f'Create service "{name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return job.status = 'queued' @@ -102,14 +131,14 @@ class CheckJobsMixin: except docker.errors.NotFound as e: current_app.logger.error( f'Get service "{service_name}" failed ' - + f'due to "docker.errors.NotFound": {e}' + f'due to "docker.errors.NotFound": {e}' ) job.status = 'failed' return except docker.errors.APIError as e: current_app.logger.error( f'Get service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return service_tasks = service.tasks() @@ -121,13 +150,25 @@ class CheckJobsMixin: return elif job.status == 'running' and task_state == 'complete': job.status = 'complete' - results_dir = os.path.join(job.path, 'output') - result_files = [x for x in os.listdir(results_dir) if x.endswith('.zip')] # noqa - for result_file in result_files: - job_result = JobResult(filename=result_file, job=job) + results_dir = os.path.join(job.path, 'results') + with open(os.path.join(results_dir, 'outputs.json')) as f: + outputs = json.load(f) + for output in outputs: + filename = 
os.path.basename(output['file']) + job_result = JobResult( + filename=filename, + job=job, + mimetype=output['mimetype'] + ) + if 'description' in output: + job_result.description = output['description'] db.session.add(job_result) - db.session.flush() + db.session.flush(objects=[job_result]) db.session.refresh(job_result) + os.rename( + os.path.join(results_dir, output['file']), + job_result.path + ) elif job.status == 'running' and task_state == 'failed': job.status = 'failed' else: @@ -138,7 +179,7 @@ class CheckJobsMixin: except docker.errors.APIError as e: current_app.logger.error( f'Remove service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) def remove_job_service(self, job): @@ -151,7 +192,7 @@ class CheckJobsMixin: except docker.errors.APIError as e: current_app.logger.error( f'Get service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return try: @@ -159,7 +200,7 @@ class CheckJobsMixin: except docker.errors.APIError as e: current_app.logger.error( f'Update service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return try: @@ -167,5 +208,5 @@ class CheckJobsMixin: except docker.errors.APIError as e: current_app.logger.error( f'Remove "{service_name}" service failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) diff --git a/app/jobs/routes.py b/app/jobs/routes.py index db8c686c..4acd7c47 100644 --- a/app/jobs/routes.py +++ b/app/jobs/routes.py @@ -34,12 +34,14 @@ def delete_job(job_id): @login_required def download_job_input(job_id, job_input_id): job_input = JobInput.query.filter(JobInput.job_id == job_id, JobInput.id == job_input_id).first_or_404() # noqa - if not (job_input.job.user == current_user - or current_user.is_administrator()): + if not (job_input.job.user == current_user or 
current_user.is_administrator()): # noqa abort(403) - return send_from_directory(as_attachment=True, - directory=os.path.dirname(job_input.path), - filename=job_input.filename) + return send_from_directory( + as_attachment=True, + attachment_filename=job_input.filename, + directory=os.path.dirname(job_input.path), + filename=os.path.basename(job_input.path) + ) @bp.route('//restart') @@ -59,9 +61,11 @@ def restart(job_id): @login_required def download_job_result(job_id, job_result_id): job_result = JobResult.query.filter(JobResult.job_id == job_id, JobResult.id == job_result_id).first_or_404() # noqa - if not (job_result.job.user == current_user - or current_user.is_administrator()): + if not (job_result.job.user == current_user or current_user.is_administrator()): # noqa abort(403) - return send_from_directory(as_attachment=True, - directory=os.path.dirname(job_result.path), - filename=job_result.filename) + return send_from_directory( + as_attachment=True, + attachment_filename=job_result.filename, + directory=os.path.dirname(job_result.path), + filename=os.path.basename(job_result.path) + ) diff --git a/app/models.py b/app/models.py index 55013e92..d02b511c 100644 --- a/app/models.py +++ b/app/models.py @@ -4,13 +4,17 @@ from flask_hashids import HashidMixin from flask_login import UserMixin from itsdangerous import BadSignature, TimedJSONWebSignatureSerializer from time import sleep +from tqdm import tqdm from werkzeug.security import generate_password_hash, check_password_hash -import xml.etree.ElementTree as ET from . 
import db, login import base64 import enum +import json import os +import requests import shutil +import xml.etree.ElementTree as ET +import yaml class Permission(enum.IntEnum): @@ -25,7 +29,7 @@ class Permission(enum.IntEnum): class FileMixin: creation_date = db.Column(db.DateTime, default=datetime.utcnow) - filename = db.Column(db.String(256)) + filename = db.Column(db.String(255)) last_edited_date = db.Column(db.DateTime, default=datetime.utcnow) mimetype = db.Column(db.String(255)) @@ -86,7 +90,7 @@ class Role(HashidMixin, db.Model): return dict_role @staticmethod - def insert_roles(): + def insert_defaults(): roles = { 'User': [], 'API user': [Permission.USE_API], @@ -132,6 +136,12 @@ class User(HashidMixin, UserMixin, db.Model): db.String(16), default='all') # Backrefs: role: Role # Relationships + tesseract_ocr_models = db.relationship( + 'TesseractOCRModel', + backref='user', + cascade='all, delete-orphan', + lazy='dynamic' + ) corpora = db.relationship( 'Corpus', backref='user', @@ -221,6 +231,12 @@ class User(HashidMixin, UserMixin, db.Model): def is_administrator(self): return self.can(Permission.ADMINISTRATE) + def makedirs(self): + os.mkdir(self.path) + os.mkdir(os.path.join(self.path, 'tesseract_ocr_models')) + os.mkdir(os.path.join(self.path, 'corpora')) + os.mkdir(os.path.join(self.path, 'jobs')) + def revoke_token(self): self.token_expiration = datetime.utcnow() - timedelta(seconds=1) @@ -269,6 +285,21 @@ class User(HashidMixin, UserMixin, db.Model): return None return user + @staticmethod + def insert_defaults(): + if User.query.filter_by(username='nopaque').first() is not None: + return + user = User(username='nopaque') + db.session.add(user) + db.session.flush(objects=[user]) + db.session.refresh(user) + try: + user.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + db.session.commit() + @staticmethod def reset_password(token, new_password): s = 
TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY']) @@ -284,6 +315,72 @@ class User(HashidMixin, UserMixin, db.Model): return True +class TesseractOCRModel(FileMixin, HashidMixin, db.Model): + __tablename__ = 'tesseract_ocr_models' + # Primary key + id = db.Column(db.Integer, primary_key=True) + # Foreign keys + user_id = db.Column(db.Integer, db.ForeignKey('users.id')) + # Fields + compatible_service_versions = db.Column(db.String(255)) + description = db.Column(db.String(255)) + publisher = db.Column(db.String(128)) + publishing_year = db.Column(db.Integer) + title = db.Column(db.String(64)) + version = db.Column(db.String(16)) + # Backrefs: user: User + + @property + def path(self): + return os.path.join( + self.user.path, + 'tesseract_ocr_models', + str(self.id) + ) + + @staticmethod + def insert_defaults(): + user = User.query.filter_by(username='nopaque').first() + defaults_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'TesseractOCRModel.defaults.yml' + ) + with open(defaults_file, 'r') as f: + defaults = yaml.safe_load(f) + for m in defaults: + if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None: # noqa + continue + tesseract_ocr_model = TesseractOCRModel( + compatible_service_versions=json.dumps(m['compatible_service_versions']), # noqa + description=m['description'], + publisher=m['publisher'], + publishing_year=m['publishing_year'], + title=m['title'], + user=user, + version=m['version'] + ) + db.session.add(tesseract_ocr_model) + db.session.flush(objects=[tesseract_ocr_model]) + db.session.refresh(tesseract_ocr_model) + tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata' # noqa + r = requests.get(m['url'], stream=True) + pbar = tqdm( + desc=f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})', # noqa + unit="B", + unit_scale=True, + unit_divisor=1024, + total=int(r.headers['Content-Length']) + ) + pbar.clear() + with 
open(tesseract_ocr_model.path, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + pbar.update(len(chunk)) + f.write(chunk) + pbar.close() + db.session.commit() + + class JobInput(FileMixin, HashidMixin, db.Model): __tablename__ = 'job_inputs' # Primary key @@ -309,7 +406,7 @@ class JobInput(FileMixin, HashidMixin, db.Model): @property def path(self): - return os.path.join(self.job.path, self.filename) + return os.path.join(self.job.path, 'inputs', str(self.id)) def to_dict(self, backrefs=False, relationships=False): dict_job_input = { @@ -347,6 +444,8 @@ class JobResult(FileMixin, HashidMixin, db.Model): id = db.Column(db.Integer, primary_key=True) # Foreign keys job_id = db.Column(db.Integer, db.ForeignKey('jobs.id')) + # Fields + description = db.Column(db.String(255)) # Backrefs: job: Job def __repr__(self): @@ -366,12 +465,13 @@ class JobResult(FileMixin, HashidMixin, db.Model): @property def path(self): - return os.path.join(self.job.path, 'output', self.filename) + return os.path.join(self.job.path, 'results', str(self.id)) def to_dict(self, backrefs=False, relationships=False): dict_job_result = { 'id': self.hashid, 'job_id': self.job.hashid, + 'description': self.description, 'download_url': self.download_url, 'url': self.url, **self.file_mixin_to_dict( @@ -414,8 +514,8 @@ class Job(HashidMixin, db.Model): end_date = db.Column(db.DateTime()) service = db.Column(db.String(64)) ''' - ' Service specific arguments as string list. - ' Example: ["-l eng", "--binarize"] + ' Dictionary as JSON formatted string. 
+ ' Example: {"binarization": True} ''' service_args = db.Column(db.String(255)) service_version = db.Column(db.String(16)) @@ -472,6 +572,12 @@ class Job(HashidMixin, db.Model): shutil.rmtree(self.path, ignore_errors=True) db.session.delete(self) + def makedirs(self): + os.mkdir(self.path) + os.mkdir(os.path.join(self.path, 'inputs')) + os.mkdir(os.path.join(self.path, 'pipeline_data')) + os.mkdir(os.path.join(self.path, 'results')) + def restart(self): ''' Restart a job - only if the status is complete or failed @@ -479,7 +585,7 @@ class Job(HashidMixin, db.Model): if self.status not in ['complete', 'failed']: raise Exception('Could not restart job: status is not "complete/failed"') # noqa - shutil.rmtree(os.path.join(self.path, 'output'), ignore_errors=True) + shutil.rmtree(os.path.join(self.path, 'results'), ignore_errors=True) shutil.rmtree(os.path.join(self.path, 'pyflow.data'), ignore_errors=True) # noqa for result in self.results: db.session.delete(result) @@ -487,6 +593,10 @@ class Job(HashidMixin, db.Model): self.status = 'submitted' def to_dict(self, backrefs=False, relationships=False): + service_args = json.loads(self.service_args) + if self.service == 'tesseract-ocr' and 'model' in service_args: + tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model']) # noqa + service_args['model'] = tesseract_ocr_pipeline_model.title dict_job = { 'id': self.hashid, 'user_id': self.user.hashid, @@ -494,7 +604,7 @@ class Job(HashidMixin, db.Model): 'description': self.description, 'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z', # noqa 'service': self.service, - 'service_args': self.service_args, + 'service_args': service_args, 'service_version': self.service_version, 'status': self.status, 'title': self.title, @@ -550,7 +660,7 @@ class CorpusFile(FileMixin, HashidMixin, db.Model): @property def path(self): - return os.path.join(self.corpus.path, self.filename) + return os.path.join(self.corpus.path, 'files', 
str(self.id)) @property def url(self): @@ -659,28 +769,27 @@ class Corpus(HashidMixin, db.Model): return self.user.hashid def build(self): - output_dir = os.path.join(self.path, 'merged') - shutil.rmtree(output_dir, ignore_errors=True) - os.mkdir(output_dir) - output_file = os.path.join(output_dir, 'corpus.vrt') corpus_element = ET.fromstring('\n') for corpus_file in self.files: element_tree = ET.parse(corpus_file.path) - text_node = element_tree.find('text') - text_node.set('address', corpus_file.address or 'NULL') - text_node.set('author', corpus_file.author) - text_node.set('booktitle', corpus_file.booktitle or 'NULL') - text_node.set('chapter', corpus_file.chapter or 'NULL') - text_node.set('editor', corpus_file.editor or 'NULL') - text_node.set('institution', corpus_file.institution or 'NULL') - text_node.set('journal', corpus_file.journal or 'NULL') - text_node.set('pages', corpus_file.pages or 'NULL') - text_node.set('publisher', corpus_file.publisher or 'NULL') - text_node.set('publishing_year', str(corpus_file.publishing_year)) - text_node.set('school', corpus_file.school or 'NULL') - text_node.set('title', corpus_file.title) - corpus_element.insert(1, text_node) - ET.ElementTree(corpus_element).write(output_file, encoding='utf-8') + text_element = element_tree.getroot() + text_element.set('address', corpus_file.address or 'NULL') + text_element.set('author', corpus_file.author) + text_element.set('booktitle', corpus_file.booktitle or 'NULL') + text_element.set('chapter', corpus_file.chapter or 'NULL') + text_element.set('editor', corpus_file.editor or 'NULL') + text_element.set('institution', corpus_file.institution or 'NULL') + text_element.set('journal', corpus_file.journal or 'NULL') + text_element.set('pages', corpus_file.pages or 'NULL') + text_element.set('publisher', corpus_file.publisher or 'NULL') + text_element.set('publishing_year', str(corpus_file.publishing_year)) # noqa + text_element.set('school', corpus_file.school or 'NULL') + 
text_element.set('title', corpus_file.title) + corpus_element.insert(1, text_element) + ET.ElementTree(corpus_element).write( + os.path.join(self.path, 'cwb', 'corpus.vrt'), + encoding='utf-8' + ) self.last_edited_date = datetime.utcnow() self.status = 'submitted' @@ -688,6 +797,13 @@ class Corpus(HashidMixin, db.Model): shutil.rmtree(self.path, ignore_errors=True) db.session.delete(self) + def makedirs(self): + os.mkdir(self.path) + os.mkdir(os.path.join(self.path, 'files')) + os.mkdir(os.path.join(self.path, 'cwb')) + os.mkdir(os.path.join(self.path, 'cwb', 'data')) + os.mkdir(os.path.join(self.path, 'cwb', 'registry')) + def to_dict(self, backrefs=False, relationships=False): dict_corpus = { 'id': self.hashid, diff --git a/app/services/__init__.py b/app/services/__init__.py index 5c553e89..e41a895d 100644 --- a/app/services/__init__.py +++ b/app/services/__init__.py @@ -1,77 +1,13 @@ from flask import Blueprint +import os +import yaml -SERVICES = { - 'file-setup': { - 'name': 'File setup', - 'versions': { - 'latest': '1.0.0b', - '1.0.0b': { - 'publishing_data': { - 'date': None, - 'title': 'nopaque File setup service', - 'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/tree/1.0.0b', # noqa - 'version': '1.0.0' - } - } - } - }, - 'nlp': { - 'name': 'Natural Language Processing', - 'versions': { - 'latest': '1.0.0b', - '1.0.0b': { - 'check_encoding': True, - 'models': { - 'de': 'German', - 'en': 'English', - 'it': 'Italian', - 'nl': 'Dutch', - 'pl': 'Polish', - 'zh': 'Chinese' - }, - 'publishing_data': { - 'date': None, - 'title': 'nopaque NLP service', - 'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/tree/1.0.0b', # noqa - 'version': '1.0.0' - } - } - } - }, - 'ocr': { - 'name': 'Optical Character Recognition', - 'versions': { - 'latest': '1.0.0b', - '1.0.0b': { - 'binarization': True, - 'models': { - 'ara': 'Arabic', - 'chi_tra': 'Chinese - Traditional', - 'dan': 'Danish', - 'eng': 'English', - 'enm': 'English, Middle 1100-1500', - 
'fra': 'French', - 'frm': 'French, Middle ca. 1400-1600', - 'deu': 'German', - 'frk': 'German Fraktur', - 'ell': 'Greek, Modern (1453-)', - 'ita': 'Italian', - 'por': 'Portuguese', - 'rus': 'Russian', - 'spa': 'Spanish; Castilian', - }, - 'publishing_data': { - 'date': None, - 'title': 'nopaque OCR service', - 'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/tree/1.0.0b', # noqa - 'version': '1.0.0' - } - } - } - } -} +services_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), 'services.yml') +with open(services_file, 'r') as f: + SERVICES = yaml.safe_load(f) bp = Blueprint('services', __name__) -from . import routes +from . import routes # noqa diff --git a/app/services/forms.py b/app/services/forms.py index e77f1db3..0bebfb02 100644 --- a/app/services/forms.py +++ b/app/services/forms.py @@ -1,3 +1,4 @@ +from app.models import TesseractOCRModel from flask_wtf import FlaskForm from wtforms import (BooleanField, MultipleFileField, SelectField, StringField, SubmitField, ValidationError) @@ -6,85 +7,105 @@ from . 
import SERVICES class AddJobForm(FlaskForm): - description = StringField('Description', - validators=[DataRequired(), Length(1, 255)]) + description = StringField('Description', validators=[DataRequired(), Length(1, 255)]) # noqa submit = SubmitField() title = StringField('Title', validators=[DataRequired(), Length(1, 32)]) version = SelectField('Version', validators=[DataRequired()]) -class AddNLPJobForm(AddJobForm): - check_encoding = BooleanField('Check encoding') +class AddSpacyNLPJobForm(AddJobForm): + encoding_detection = BooleanField('Encoding detection') files = MultipleFileField('Files', validators=[DataRequired()]) - language = SelectField('Language', choices=[('', 'Choose your option')], - default='', validators=[DataRequired()]) + model = SelectField( + 'Model', + choices=[('', 'Choose your option')], + default='', + validators=[DataRequired()] + ) - def validate_check_encoding(self, field): - if field.data and 'check_encoding' not in SERVICES['nlp']['versions'][self.version.data]: # noqa - raise ValidationError('Check encoding is not available in this version') # noqa + def validate_encoding_detection(self, field): + service_info = SERVICES['spacy-nlp']['versions'][self.version.data] + if field.data and 'encoding_detection' not in service_info: + raise ValidationError('Encoding detection is not available') def validate_files(form, field): + valid_extensions = ['.txt'] for file in field.data: - if not file.filename.lower().endswith('.txt'): - raise ValidationError('File does not have an approved ' - 'extension: .txt') + if not file.filename.lower().endswith(tuple(valid_extensions)): + raise ValidationError( + 'File does not have an approved extension: ' + '/'.join(valid_extensions) + ) def __init__(self, *args, **kwargs): - version = kwargs.pop('version', SERVICES['nlp']['versions']['latest']) + version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version']) # noqa super().__init__(*args, **kwargs) - if 'check_encoding' not in 
SERVICES['nlp']['versions'][version]: - self.check_encoding.render_kw = {'disabled': True} - self.language.choices += [(x, y) for x, y in SERVICES['nlp']['versions'][version]['models'].items()] # noqa - self.version.choices = [(x, x) for x in SERVICES['nlp']['versions'] if x != 'latest'] # noqa + service_info = SERVICES['spacy-nlp']['versions'][version] + if 'check_encoding' not in service_info['methods']: + self.encoding_detection.render_kw = {'disabled': True} + self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa + self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']] # noqa self.version.default = version -class AddOCRJobForm(AddJobForm): - binarization = BooleanField('Binarazation') +class AddTesseractOCRJobForm(AddJobForm): + binarization = BooleanField('Binarization') files = MultipleFileField('Files', validators=[DataRequired()]) - language = SelectField('Language', choices=[('', 'Choose your option')], - default='', validators=[DataRequired()]) + model = SelectField( + 'Model', + choices=[('', 'Choose your option')], + default='', + validators=[DataRequired()] + ) def validate_binarization(self, field): - if field.data and 'binarization' not in SERVICES['ocr']['versions'][self.version.data]: # noqa - raise ValidationError('Binarization is not available in this version') # noqa + service_info = SERVICES['tesseract-ocr']['versions'][self.version.data] + if field.data and 'binarization' not in service_info: + raise ValidationError('Binarization is not available') def validate_files(self, field): + valid_extensions = ['.pdf'] for file in field.data: - if not file.filename.lower().endswith('.pdf'): - raise ValidationError('File does not have an approved ' - 'extension: .pdf') + if not file.filename.lower().endswith(tuple(valid_extensions)): + raise ValidationError( + 'File does not have an approved extension: ' + '/'.join(valid_extensions) + ) def __init__(self, *args, **kwargs): - version = kwargs.pop('version', 
SERVICES['ocr']['versions']['latest']) + version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version']) # noqa super().__init__(*args, **kwargs) - if 'binarization' not in SERVICES['ocr']['versions'][version]: + service_info = SERVICES['tesseract-ocr']['versions'][version] + if 'binarization' not in service_info['methods']: self.binarization.render_kw = {'disabled': True} - self.language.choices += [(x, y) for x, y in SERVICES['ocr']['versions'][version]['models'].items()] # noqa - self.version.choices = [(x, x) for x in SERVICES['ocr']['versions'] if x != 'latest'] # noqa - self.version.default = version + self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()] # noqa + self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']] # noqa + self.version.data = version + self.version.default = SERVICES['tesseract-ocr']['latest_version'] class AddFileSetupJobForm(AddJobForm): files = MultipleFileField('Files', validators=[DataRequired()]) def validate_files(form, field): + valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif'] for file in field.data: - if not file.filename.lower().endswith(('.jpeg', '.jpg', '.png', - '.tiff', '.tif')): - raise ValidationError('File does not have an approved ' - 'extension: .jpeg | .jpg | .png | .tiff ' - '| .tif') + if not file.filename.lower().endswith(tuple(valid_extensions)): + raise ValidationError( + 'File does not have an approved extension: ' + '/'.join(valid_extensions) + ) def __init__(self, *args, **kwargs): - version = kwargs.pop('version', SERVICES['file-setup']['versions']['latest']) + version = kwargs.pop('version', SERVICES['file-setup']['latest_version']) # noqa super().__init__(*args, **kwargs) - self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions'] if x != 'latest'] # noqa - self.version.default = version + self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']] # noqa + self.version.data = version + 
self.version.default = SERVICES['file-setup']['latest_version'] AddJobForms = { 'file-setup': AddFileSetupJobForm, - 'ocr': AddOCRJobForm, - 'nlp': AddNLPJobForm + 'tesseract-ocr': AddTesseractOCRJobForm, + 'spacy-nlp': AddSpacyNLPJobForm } diff --git a/app/services/routes.py b/app/services/routes.py index 805ab692..d430e61e 100644 --- a/app/services/routes.py +++ b/app/services/routes.py @@ -1,3 +1,4 @@ +from app import hashids from flask import (abort, current_app, flash, make_response, render_template, request, url_for) from flask_login import current_user, login_required @@ -8,7 +9,6 @@ from .. import db from .forms import AddJobForms from ..models import Job, JobInput import json -import os @bp.route('/corpus-analysis') @@ -24,57 +24,65 @@ def service(service): # Check if the requested service exist if service not in SERVICES or service not in AddJobForms: abort(404) - version = request.args.get( - 'version', SERVICES[service]['versions']['latest']) + version = request.args.get('version', SERVICES[service]['latest_version']) if version not in SERVICES[service]['versions']: abort(404) form = AddJobForms[service](prefix='add-job-form', version=version) - form.version.data = version title = SERVICES[service]['name'] - versions = SERVICES[service]['versions'] if form.is_submitted(): if not form.validate(): return make_response(form.errors, 400) - service_args = [] - if service == 'nlp': - service_args.append(f'-l {form.language.data}') - if form.check_encoding.data: - service_args.append('--check-encoding') - if service == 'ocr': - service_args.append(f'-l {form.language.data}') + service_args = {} + if service == 'spacy-nlp': + service_args['model'] = form.model.data + if form.encoding_detection.data: + service_args['encoding_detection'] = True + if service == 'tesseract-ocr': + service_args['model'] = hashids.decode(form.model.data) if form.binarization.data: - service_args.append('--binarize') - job = Job(user=current_user, - description=form.description.data, 
- service=service, service_args=json.dumps(service_args), - service_version=form.version.data, - status='preparing', title=form.title.data) + service_args['binarization'] = True + job = Job( + user=current_user, + description=form.description.data, + service=service, + service_args=json.dumps(service_args), + service_version=form.version.data, + status='preparing', + title=form.title.data + ) db.session.add(job) - db.session.flush() + db.session.flush(objects=[job]) db.session.refresh(job) try: - os.makedirs(job.path) - except OSError: - current_app.logger.error(f'Make dir {job.path} led to an OSError!') + job.makedirs() + except OSError as e: + current_app.logger.error(e) db.session.rollback() flash('Internal Server Error', 'error') - return make_response( - {'redirect_url': url_for('.service', service=service)}, 500) - else: - for file in form.files.data: - filename = secure_filename(file.filename) - job_input = JobInput( - filename=filename, job=job, mimetype=file.mimetype) + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + for file in form.files.data: + filename = secure_filename(file.filename) + job_input = JobInput( + filename=filename, + job=job, + mimetype=file.mimetype + ) + db.session.add(job_input) + db.session.flush(objects=[job_input]) + db.session.refresh(job_input) + try: file.save(job_input.path) - db.session.add(job_input) - job.status = 'submitted' - db.session.commit() - flash(f'Job "{job.title}" added', 'job') - return make_response( - {'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job.status = 'submitted' + db.session.commit() + flash(f'Job "{job.title}" added', 'job') + return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa return render_template( 
f'services/{service.replace("-", "_")}.html.j2', form=form, - title=title, - versions=versions + title=title ) diff --git a/app/services/services.yml b/app/services/services.yml new file mode 100644 index 00000000..0c82c3d9 --- /dev/null +++ b/app/services/services.yml @@ -0,0 +1,38 @@ +# TODO: This could also be done via GitLab/GitHub APIs +#file-setup-pipeline: +file-setup: + name: 'File setup pipeline' + latest_version: '0.1.0' + versions: + 0.1.0: + publisher: 'Bielefeld University - CRC 1288 - INF' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0' +#spacy-nlp-pipeline: +spacy-nlp: + name: 'spaCy NLP' + latest_version: '0.1.0' + versions: + 0.1.0: + methods: + - 'encoding_detection' + models: + de: 'German' + en: 'English' + it: 'Italian' + pl: 'Polish' + zh: 'Chinese' + publisher: 'Bielefeld University - CRC 1288 - INF' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0' +#tesseract-ocr-pipeline: +tesseract-ocr: + name: 'Tesseract OCR' + latest_version: '0.1.0' + versions: + 0.1.0: + methods: + - 'binarization' + publisher: 'Bielefeld University - CRC 1288 - INF' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0' diff --git a/app/static/css/nopaque.css b/app/static/css/nopaque.css index 90f4df68..ee5377e1 100644 --- a/app/static/css/nopaque.css +++ b/app/static/css/nopaque.css @@ -50,8 +50,8 @@ h1 .nopaque-icons, h2 .nopaque-icons, h3 .nopaque-icons, h4 .nopaque-icons, } .nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";} .nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";} -.nopaque-icons.service-icon[data-service="nlp"]:empty:before {content: "G";} -.nopaque-icons.service-icon[data-service="ocr"]:empty:before {content: "F";} +.nopaque-icons.service-icon[data-service="spacy-nlp"]:empty:before {content: "G";} 
+.nopaque-icons.service-icon[data-service="tesseract-ocr"]:empty:before {content: "F";} .status-text[data-status]:empty:before {content: attr(data-status);} diff --git a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js index ad324e34..c07ff35d 100644 --- a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js +++ b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js @@ -53,7 +53,7 @@ class CorpusAnalysisApp { this.data.cQiClient = new CQiClient(this.settings.corpusId); this.data.cQiClient.connect() .then(cQiStatus => { - return this.data.cQiClient.corpora.get('CORPUS'); + return this.data.cQiClient.corpora.get(`NOPAQUE_${this.settings.corpusId}`); }) .then( cQiCorpus => { diff --git a/app/static/js/RessourceDisplays/JobDisplay.js b/app/static/js/RessourceDisplays/JobDisplay.js index 61222693..92102908 100644 --- a/app/static/js/RessourceDisplays/JobDisplay.js +++ b/app/static/js/RessourceDisplays/JobDisplay.js @@ -100,7 +100,10 @@ class JobDisplay extends RessourceDisplay { } setServiceArgs(serviceArgs) { - this.setElements(this.displayElement.querySelectorAll('.job-service-args'), serviceArgs); + this.setElements( + this.displayElement.querySelectorAll('.job-service-args'), + JSON.stringify(serviceArgs) + ); } setServiceVersion(serviceVersion) { diff --git a/app/static/js/RessourceLists/JobResultList.js b/app/static/js/RessourceLists/JobResultList.js index 56399bcb..708b25f2 100644 --- a/app/static/js/RessourceLists/JobResultList.js +++ b/app/static/js/RessourceLists/JobResultList.js @@ -10,25 +10,10 @@ class JobResultList extends RessourceList { `.trim(), ressourceMapper: jobResult => { - let description; - - if (jobResult.filename.endsWith('.pdf.zip')) { - description = 'PDF files with text layer'; - } else if (jobResult.filename.endsWith('.txt.zip')) { - description = 'Raw text files'; - } else if (jobResult.filename.endsWith('.vrt.zip')) { - description = 'VRT compliant files including the NLP data'; - } else if 
(jobResult.filename.endsWith('.xml.zip')) { - description = 'TEI compliant files'; - } else if (jobResult.filename.endsWith('.poco.zip')) { - description = 'HOCR and image files for post correction (PoCo)'; - } else { - description = 'All result files created during this job'; - } return { id: jobResult.id, creationDate: jobResult.creation_date, - description: description, + description: jobResult.description, filename: jobResult.filename }; }, diff --git a/app/templates/_colors.html.j2 b/app/templates/_colors.html.j2 index 84715cbe..a6ac0ed8 100644 --- a/app/templates/_colors.html.j2 +++ b/app/templates/_colors.html.j2 @@ -19,12 +19,12 @@ 'darken': '#a1b300', 'lighten': '#f2f3e1' }, - 'nlp': { + 'spacy-nlp': { 'base': '#98acd2', 'darken': '#0064a3', 'lighten': '#e5e8f5' }, - 'ocr': { + 'tesseract-ocr': { 'base': '#a9d8c8', 'darken': '#00a58b', 'lighten': '#e7f4f1' diff --git a/app/templates/_sidenav.html.j2 b/app/templates/_sidenav.html.j2 index c3ac9ab8..8729f4f8 100644 --- a/app/templates/_sidenav.html.j2 +++ b/app/templates/_sidenav.html.j2 @@ -15,8 +15,8 @@
  • Processes & Services
  • File setup
  • -
  • OCR
  • -
  • NLP
  • +
  • OCR
  • +
  • NLP
  • Corpus analysis
  • Account
  • @@ -28,6 +28,9 @@ {% if current_user.can(Permission.ADMINISTRATE) %}
  • admin_panel_settingsAdministration
  • {% endif %} + {% if current_user.can(Permission.CONTRIBUTE) %} +
  • new_labelContribute
  • + {% endif %} {% if current_user.can(Permission.USE_API) %}
  • apiAPI
  • {% endif %} diff --git a/app/templates/main/dashboard.html.j2 b/app/templates/main/dashboard.html.j2 index 1e763c3e..05f5b804 100644 --- a/app/templates/main/dashboard.html.j2 +++ b/app/templates/main/dashboard.html.j2 @@ -120,32 +120,32 @@

    File setup

    -

    Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.

    +

    Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.

    Create Job

    - - + +

    -

    Optical Character Recognition

    +

    Optical Character Recognition

    nopaque converts your image data – like photos or scans – into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.

    - Create Job + Create Job

    - - + +

    -

    Natural Language Processing

    +

    Natural Language Processing

    By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.

    - Create Job + Create Job
    diff --git a/app/templates/main/index.html.j2 b/app/templates/main/index.html.j2 index bbc44283..0bd343c3 100644 --- a/app/templates/main/index.html.j2 +++ b/app/templates/main/index.html.j2 @@ -84,11 +84,11 @@

    Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.

    - - + +

    -

    Optical Character Recognition

    +

    Optical Character Recognition

    nopaque converts your image data – like photos or scans – into text data through OCR making it machine readable. This step enables you to proceed with further computational analysis of your documents.

    diff --git a/app/templates/services/nlp.html.j2 b/app/templates/services/spacy_nlp.html.j2 similarity index 93% rename from app/templates/services/nlp.html.j2 rename to app/templates/services/spacy_nlp.html.j2 index d07470e1..30fab84c 100644 --- a/app/templates/services/nlp.html.j2 +++ b/app/templates/services/spacy_nlp.html.j2 @@ -2,7 +2,7 @@ {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% import "materialize/wtf.html.j2" as wtf %} -{% block main_attribs %} class="service-scheme" data-service="nlp"{% endblock main_attribs %} +{% block main_attribs %} class="service-scheme" data-service="spacy-nlp"{% endblock main_attribs %} {% block page_content %}
    @@ -16,13 +16,13 @@

     

     

    - +
    -
    +
    @@ -71,7 +71,7 @@ {{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }}
    - {{ wtf.render_field(form.language, material_icon='language') }} + {{ wtf.render_field(form.model, material_icon='language') }}
    {{ wtf.render_field(form.version, material_icon='apps') }} @@ -80,13 +80,13 @@ Preprocessing
    -

    {{ form.check_encoding.label.text }}

    +

    {{ form.encoding_detection.label.text }}

    If the input files are not created with the nopaque OCR service or you do not know if your text files are UTF-8 encoded, check this switch. We will try to automatically determine the right encoding for your texts to process them.

    diff --git a/app/templates/services/ocr.html.j2 b/app/templates/services/tesseract_ocr.html.j2 similarity index 93% rename from app/templates/services/ocr.html.j2 rename to app/templates/services/tesseract_ocr.html.j2 index 9af593b4..66121281 100644 --- a/app/templates/services/ocr.html.j2 +++ b/app/templates/services/tesseract_ocr.html.j2 @@ -2,7 +2,7 @@ {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% import "materialize/wtf.html.j2" as wtf %} -{% block main_attribs %} class="service-scheme" data-service="ocr"{% endblock main_attribs %} +{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr"{% endblock main_attribs %} {% block page_content %}
    @@ -16,13 +16,13 @@

     

     

    - +
    -
    +
    @@ -50,10 +50,10 @@ {{ wtf.render_field(form.description, data_length='255', material_icon='description') }}
    - {{ wtf.render_field(form.files, accept='application/pdf', color=ocr_color_darken, placeholder='Choose your .pdf files') }} + {{ wtf.render_field(form.files, accept='application/pdf', placeholder='Choose your .pdf files') }}
    - {{ wtf.render_field(form.language, material_icon='language') }} + {{ wtf.render_field(form.model, material_icon='language') }}
    {{ wtf.render_field(form.version, material_icon='apps') }} @@ -127,7 +127,7 @@
    - {{ wtf.render_field(form.submit, color=ocr_color_darken, material_icon='send') }} + {{ wtf.render_field(form.submit, material_icon='send') }}
    diff --git a/app/utils.py b/app/utils.py deleted file mode 100644 index 75d38b7c..00000000 --- a/app/utils.py +++ /dev/null @@ -1,10 +0,0 @@ -from app import hashids -from werkzeug.routing import BaseConverter - - -class HashidConverter(BaseConverter): - def to_python(self, value: str) -> int: - return hashids.decode(value)[0] - - def to_url(self, value: int) -> str: - return hashids.encode(value) diff --git a/docker-compose.traefik.yml b/docker-compose.traefik.yml index c261b2d2..c7d01575 100644 --- a/docker-compose.traefik.yml +++ b/docker-compose.traefik.yml @@ -5,14 +5,14 @@ version: "3.5" networks: - reverse-proxy: - external: - name: reverse-proxy + traefik: + external: true + name: "traefik" services: nopaque: labels: - - "traefik.docker.network=reverse-proxy" + - "traefik.docker.network=traefik" - "traefik.enable=true" ### ### - "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http" diff --git a/migrations/versions/ad0d835fe5b1_.py b/migrations/versions/ad0d835fe5b1_.py new file mode 100644 index 00000000..0248e316 --- /dev/null +++ b/migrations/versions/ad0d835fe5b1_.py @@ -0,0 +1,45 @@ +"""empty message + +Revision ID: ad0d835fe5b1 +Revises: 68ed092ffe5e +Create Date: 2022-01-18 16:23:45.673993 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = 'ad0d835fe5b1' +down_revision = '68ed092ffe5e' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('tesseract_ocr_models', + sa.Column('creation_date', sa.DateTime(), nullable=True), + sa.Column('filename', sa.String(length=255), nullable=True), + sa.Column('last_edited_date', sa.DateTime(), nullable=True), + sa.Column('mimetype', sa.String(length=255), nullable=True), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('user_id', sa.Integer(), nullable=True), + sa.Column('compatible_service_versions', sa.String(length=255), nullable=True), + sa.Column('description', sa.String(length=255), nullable=True), + sa.Column('publisher', sa.String(length=128), nullable=True), + sa.Column('publishing_year', sa.Integer(), nullable=True), + sa.Column('title', sa.String(length=64), nullable=True), + sa.Column('version', sa.String(length=16), nullable=True), + sa.ForeignKeyConstraint(['user_id'], ['users.id'], ), + sa.PrimaryKeyConstraint('id') + ) + op.add_column('job_results', sa.Column('description', sa.String(length=255), nullable=True)) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('job_results', 'description') + op.drop_table('tesseract_ocr_models') + # ### end Alembic commands ### diff --git a/nopaque.py b/nopaque.py index 0045d02b..ab8db5a6 100644 --- a/nopaque.py +++ b/nopaque.py @@ -3,10 +3,9 @@ import eventlet eventlet.monkey_patch() - -from app import db, cli, create_app # noqa from app.models import (Corpus, CorpusFile, Job, JobInput, JobResult, - Permission, QueryResult, Role, User) # noqa + Permission, QueryResult, Role, TesseractOCRModel, User) # noqa +from app import db, cli, create_app # noqa from flask import Flask # noqa from typing import Any, Dict # noqa @@ -34,5 +33,6 @@ def make_shell_context() -> Dict[str, Any]: 'Permission': Permission, 'QueryResult': QueryResult, 'Role': Role, + 'TesseractOCRModel': TesseractOCRModel, 'User': User } diff --git a/requirements.txt b/requirements.txt index 202121fd..52770c57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,5 +19,7 @@ hiredis jsonschema psycopg2 python-dotenv +pyyaml redis +tqdm wtforms[email]