From bdcc80a66ff660f653756b0baae439cd499c07d8 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 28 Nov 2023 10:34:30 +0100 Subject: [PATCH 1/2] Add new tesseract-ocr-pipeline version. Remove redundant spacy-nlp-pipeline version. --- app/SpaCyNLPPipelineModel.defaults.yml | 14 +-- app/TesseractOCRPipelineModel.defaults.yml | 102 +++++++++++++++++++++ app/services/services.yml | 13 +-- 3 files changed, 111 insertions(+), 18 deletions(-) diff --git a/app/SpaCyNLPPipelineModel.defaults.yml b/app/SpaCyNLPPipelineModel.defaults.yml index 62dc5e65..cabbb7a3 100644 --- a/app/SpaCyNLPPipelineModel.defaults.yml +++ b/app/SpaCyNLPPipelineModel.defaults.yml @@ -8,7 +8,7 @@ pipeline_name: 'ca_core_news_md' version: '3.2.0' compatible_service_versions: - - '0.1.0' + - '0.1.0' - title: 'German' description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.2.0/de_core_news_md-3.2.0.tar.gz' @@ -19,7 +19,7 @@ pipeline_name: 'de_core_news_md' version: '3.2.0' compatible_service_versions: - - '0.1.0' + - '0.1.0' - title: 'Greek' description: 'Greek pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' url: 'https://github.com/explosion/spacy-models/releases/download/el_core_news_md-3.2.0/el_core_news_md-3.2.0.tar.gz' @@ -120,7 +120,6 @@ version: '3.4.0' compatible_service_versions: - '0.1.1' - - '0.1.2' - title: 'German' description: 'German pipeline optimized for CPU. Components: tok2vec, tagger, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner.' url: 'https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.4.0/de_core_news_md-3.4.0.tar.gz' @@ -132,7 +131,6 @@ version: '3.4.0' compatible_service_versions: - '0.1.1' - - '0.1.2' - title: 'Greek' description: 'Greek pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, lemmatizer (trainable_lemmatizer), senter, ner, attribute_ruler.' url: 'https://github.com/explosion/spacy-models/releases/download/el_core_news_md-3.4.0/el_core_news_md-3.4.0.tar.gz' @@ -144,7 +142,6 @@ version: '3.4.0' compatible_service_versions: - '0.1.1' - - '0.1.2' - title: 'English' description: 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.' url: 'https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1.tar.gz' @@ -156,7 +153,6 @@ version: '3.4.1' compatible_service_versions: - '0.1.1' - - '0.1.2' - title: 'Spanish' description: 'Spanish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' url: 'https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.4.0/es_core_news_md-3.4.0.tar.gz' @@ -168,7 +164,6 @@ version: '3.4.0' compatible_service_versions: - '0.1.1' - - '0.1.2' - title: 'French' description: 'French pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' url: 'https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.4.0/fr_core_news_md-3.4.0.tar.gz' @@ -180,7 +175,6 @@ version: '3.4.0' compatible_service_versions: - '0.1.1' - - '0.1.2' - title: 'Italian' description: 'Italian pipeline optimized for CPU. Components: tok2vec, morphologizer, tagger, parser, lemmatizer (trainable_lemmatizer), senter, ner' url: 'https://github.com/explosion/spacy-models/releases/download/it_core_news_md-3.4.0/it_core_news_md-3.4.0.tar.gz' @@ -192,7 +186,6 @@ version: '3.4.0' compatible_service_versions: - '0.1.1' - - '0.1.2' - title: 'Polish' description: 'Polish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, lemmatizer (trainable_lemmatizer), tagger, senter, ner.' url: 'https://github.com/explosion/spacy-models/releases/download/pl_core_news_md-3.4.0/pl_core_news_md-3.4.0.tar.gz' @@ -204,7 +197,6 @@ version: '3.4.0' compatible_service_versions: - '0.1.1' - - '0.1.2' - title: 'Russian' description: 'Russian pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer.' url: 'https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.4.0/ru_core_news_md-3.4.0.tar.gz' @@ -216,7 +208,6 @@ version: '3.4.0' compatible_service_versions: - '0.1.1' - - '0.1.2' - title: 'Chinese' description: 'Chinese pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler.' url: 'https://github.com/explosion/spacy-models/releases/download/zh_core_web_md-3.4.0/zh_core_web_md-3.4.0.tar.gz' @@ -228,4 +219,3 @@ version: '3.4.0' compatible_service_versions: - '0.1.1' - - '0.1.2' diff --git a/app/TesseractOCRPipelineModel.defaults.yml b/app/TesseractOCRPipelineModel.defaults.yml index 834b0ea5..e83bb503 100644 --- a/app/TesseractOCRPipelineModel.defaults.yml +++ b/app/TesseractOCRPipelineModel.defaults.yml @@ -9,6 +9,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Amharic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata' @@ -20,6 +21,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' - title: 'Arabic' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata' @@ -31,6 +33,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' # - title: 'Assamese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata' @@ -42,6 +45,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Azerbaijani' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata' @@ -53,6 +57,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Azerbaijani - Cyrillic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata' @@ -64,6 +69,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Belarusian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata' @@ -75,6 +81,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Bengali' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata' @@ -86,6 +93,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Tibetan' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata' @@ -97,6 +105,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Bosnian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata' @@ -108,6 +117,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Bulgarian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata' @@ -119,6 +129,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Catalan; Valencian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata' @@ -130,6 +141,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Cebuano' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata' @@ -141,6 +153,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Czech' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata' @@ -152,6 +165,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Chinese - Simplified' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata' @@ -163,6 +177,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' - title: 'Chinese - Traditional' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata' @@ -174,6 +189,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' # - title: 'Cherokee' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata' @@ -185,6 +201,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Welsh' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata' @@ -196,6 +213,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' - title: 'Danish' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata' @@ -207,6 +225,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' - title: 'German' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata' @@ -218,6 +237,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' # - title: 'Dzongkha' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata' @@ -229,6 +249,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' - title: 'Greek, Modern (1453-)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata' @@ -240,6 +261,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' - title: 'English' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata' @@ -251,6 +273,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' - title: 'English, Middle (1100-1500)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata' @@ -262,6 +285,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' # - title: 'Esperanto' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata' @@ -273,6 +297,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Estonian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata' @@ -284,6 +309,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Basque' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata' @@ -295,6 +321,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Persian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata' @@ -306,6 +333,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Finnish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata' @@ -317,6 +345,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' - title: 'French' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata' @@ -328,6 +357,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' - title: 'German Fraktur' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata' @@ -339,6 +369,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' - title: 'French, Middle (ca. 1400-1600)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata' @@ -350,6 +381,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' # - title: 'Irish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata' @@ -361,6 +393,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Galician' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata' @@ -372,6 +405,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' - title: 'Greek, Ancient (-1453)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata' @@ -383,6 +417,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' # - title: 'Gujarati' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata' @@ -394,6 +429,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Haitian; Haitian Creole' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata' @@ -405,6 +441,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Hebrew' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata' @@ -416,6 +453,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Hindi' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata' @@ -427,6 +465,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Croatian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata' @@ -438,6 +477,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Hungarian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata' @@ -449,6 +489,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Inuktitut' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata' @@ -460,6 +501,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Indonesian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata' @@ -471,6 +513,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Icelandic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata' @@ -482,6 +525,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' - title: 'Italian' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata' @@ -493,6 +537,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' - title: 'Italian - Old' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata' @@ -504,6 +549,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' # - title: 'Javanese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata' @@ -515,6 +561,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Japanese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata' @@ -526,6 +573,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Kannada' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata' @@ -537,6 +585,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Georgian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata' @@ -548,6 +597,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Georgian - Old' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata' @@ -559,6 +609,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Kazakh' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata' @@ -570,6 +621,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Central Khmer' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata' @@ -581,6 +633,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Kirghiz; Kyrgyz' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata' @@ -592,6 +645,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Korean' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata' @@ -603,6 +657,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Kurdish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata' @@ -614,6 +669,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Lao' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata' @@ -625,6 +681,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Latin' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata' @@ -636,6 +693,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Latvian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata' @@ -647,6 +705,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Lithuanian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata' @@ -658,6 +717,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Malayalam' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata' @@ -669,6 +729,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Marathi' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata' @@ -680,6 +741,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Macedonian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata' @@ -691,6 +753,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Maltese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata' @@ -702,6 +765,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Malay' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata' @@ -713,6 +777,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Burmese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata' @@ -724,6 +789,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Nepali' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata' @@ -735,6 +801,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Dutch; Flemish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata' @@ -746,6 +813,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Norwegian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata' @@ -757,6 +825,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Oriya' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata' @@ -768,6 +837,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Panjabi; Punjabi' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata' @@ -779,6 +849,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Polish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata' @@ -790,6 +861,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' - title: 'Portuguese' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata' @@ -801,6 +873,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' # - title: 'Pushto; Pashto' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata' @@ -812,6 +885,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Romanian; Moldavian; Moldovan' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata' @@ -823,6 +897,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' - title: 'Russian' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata' @@ -834,6 +909,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' # - title: 'Sanskrit' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata' @@ -845,6 +921,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Sinhala; Sinhalese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata' @@ -856,6 +933,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Slovak' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata' @@ -867,6 +945,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Slovenian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata' @@ -878,6 +957,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' - title: 'Spanish; Castilian' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata' @@ -889,6 +969,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' - title: 'Spanish; Castilian - Old' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata' @@ -900,6 +981,7 @@ compatible_service_versions: - '0.1.0' - '0.1.1' + - '0.1.2' # - title: 'Albanian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata' @@ -911,6 +993,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Serbian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata' @@ -922,6 +1005,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Serbian - Latin' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata' @@ -933,6 +1017,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Swahili' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata' @@ -944,6 +1029,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Swedish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata' @@ -955,6 +1041,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Syriac' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata' @@ -966,6 +1053,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Tamil' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata' @@ -977,6 +1065,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Telugu' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata' @@ -988,6 +1077,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Tajik' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata' @@ -999,6 +1089,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Tagalog' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata' @@ -1010,6 +1101,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Thai' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata' @@ -1021,6 +1113,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Tigrinya' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata' @@ -1032,6 +1125,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Turkish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata' @@ -1043,6 +1137,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Uighur; Uyghur' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata' @@ -1054,6 +1149,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Ukrainian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata' @@ -1065,6 +1161,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Urdu' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata' @@ -1076,6 +1173,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Uzbek' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata' @@ -1087,6 +1185,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Uzbek - Cyrillic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata' @@ -1098,6 +1197,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Vietnamese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata' @@ -1109,6 +1209,7 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' # - title: 'Yiddish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata' @@ -1120,3 +1221,4 @@ # compatible_service_versions: # - '0.1.0' # - '0.1.1' +# - '0.1.2' diff --git a/app/services/services.yml b/app/services/services.yml index a686f683..7597a5c6 100644 --- a/app/services/services.yml +++ b/app/services/services.yml @@ -10,7 +10,7 @@ file-setup-pipeline: tesseract-ocr-pipeline: name: 'Tesseract OCR Pipeline' publisher: 'Bielefeld University - CRC 1288 - INF' - latest_version: '0.1.1' + latest_version: '0.1.2' versions: 0.1.0: methods: @@ -23,6 +23,12 @@ tesseract-ocr-pipeline: - 'ocropus_nlbin_threshold' publishing_year: 2022 url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.1' + 0.1.2: + methods: + - 'binarization' + - 'ocropus_nlbin_threshold' + publishing_year: 2023 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.2' transkribus-htr-pipeline: name: 'Transkribus HTR Pipeline' publisher: 'Bielefeld University - CRC 1288 - INF' @@ -53,8 +59,3 @@ spacy-nlp-pipeline: - 'encoding_detection' publishing_year: 2022 url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.1' - 0.1.2: - methods: - - 'encoding_detection' - publishing_year: 2022 - url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.2' From 9c22370eeaf951055c29988183abe40599522c5d Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 28 Nov 2023 12:10:55 +0100 Subject: [PATCH 2/2] Implement force download parameter in model insert_defaults methods --- app/models.py | 138 ++++++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 67 deletions(-) diff --git a/app/models.py b/app/models.py index 91477f92..ba90ca08 100644 --- a/app/models.py +++ b/app/models.py @@ -953,7 +953,7 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model): return self.user.hashid @staticmethod - def insert_defaults(): + def insert_defaults(force_download=False): nopaque_user = User.query.filter_by(username='nopaque').first() defaults_file = os.path.join( os.path.dirname(os.path.abspath(__file__)), @@ -966,6 +966,7 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model): if model is not None: model.compatible_service_versions = m['compatible_service_versions'] model.description = m['description'] + model.filename = f'{model.id}.traineddata' model.publisher = m['publisher'] model.publisher_url = m['publisher_url'] model.publishing_url = m['publishing_url'] @@ -973,38 +974,39 @@ class TesseractOCRPipelineModel(FileMixin, HashidMixin, db.Model): model.is_public = True model.title = m['title'] model.version = m['version'] - continue - model = TesseractOCRPipelineModel( - compatible_service_versions=m['compatible_service_versions'], - description=m['description'], - publisher=m['publisher'], - publisher_url=m['publisher_url'], - publishing_url=m['publishing_url'], - publishing_year=m['publishing_year'], - is_public=True, - title=m['title'], - user=nopaque_user, - version=m['version'] - ) - db.session.add(model) - db.session.flush(objects=[model]) - db.session.refresh(model) - model.filename = f'{model.id}.traineddata' - r = requests.get(m['url'], stream=True) - pbar = tqdm( - desc=f'{model.title} ({model.filename})', - unit="B", - unit_scale=True, - unit_divisor=1024, - total=int(r.headers['Content-Length']) - ) - pbar.clear() - with open(model.path, 'wb') as f: - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - pbar.update(len(chunk)) - f.write(chunk) - pbar.close() + else: + model = TesseractOCRPipelineModel( + compatible_service_versions=m['compatible_service_versions'], + description=m['description'], + publisher=m['publisher'], + publisher_url=m['publisher_url'], + publishing_url=m['publishing_url'], + publishing_year=m['publishing_year'], + is_public=True, + title=m['title'], + user=nopaque_user, + version=m['version'] + ) + db.session.add(model) + db.session.flush(objects=[model]) + db.session.refresh(model) + model.filename = f'{model.id}.traineddata' + if not os.path.exists(model.path) or force_download: + r = requests.get(m['url'], stream=True) + pbar = tqdm( + desc=f'{model.title} ({model.filename})', + unit="B", + unit_scale=True, + unit_divisor=1024, + total=int(r.headers['Content-Length']) + ) + pbar.clear() + with open(model.path, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + pbar.update(len(chunk)) + f.write(chunk) + pbar.close() db.session.commit() def delete(self): @@ -1080,7 +1082,7 @@ class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model): return self.user.hashid @staticmethod - def insert_defaults(): + def insert_defaults(force_download=False): nopaque_user = User.query.filter_by(username='nopaque').first() defaults_file = os.path.join( os.path.dirname(os.path.abspath(__file__)), @@ -1093,6 +1095,7 @@ class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model): if model is not None: model.compatible_service_versions = m['compatible_service_versions'] model.description = m['description'] + model.filename = m['url'].split('/')[-1] model.publisher = m['publisher'] model.publisher_url = m['publisher_url'] model.publishing_url = m['publishing_url'] @@ -1101,39 +1104,40 @@ class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model): model.title = m['title'] model.version = m['version'] model.pipeline_name = m['pipeline_name'] - continue - model = SpaCyNLPPipelineModel( - compatible_service_versions=m['compatible_service_versions'], - description=m['description'], - publisher=m['publisher'], - publisher_url=m['publisher_url'], - publishing_url=m['publishing_url'], - publishing_year=m['publishing_year'], - is_public=True, - title=m['title'], - user=nopaque_user, - version=m['version'], - pipeline_name=m['pipeline_name'] - ) - db.session.add(model) - db.session.flush(objects=[model]) - db.session.refresh(model) - model.filename = m['url'].split('/')[-1] - r = requests.get(m['url'], stream=True) - pbar = tqdm( - desc=f'{model.title} ({model.filename})', - unit="B", - unit_scale=True, - unit_divisor=1024, - total=int(r.headers['Content-Length']) - ) - pbar.clear() - with open(model.path, 'wb') as f: - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - pbar.update(len(chunk)) - f.write(chunk) - pbar.close() + else: + model = SpaCyNLPPipelineModel( + compatible_service_versions=m['compatible_service_versions'], + description=m['description'], + filename=m['url'].split('/')[-1], + publisher=m['publisher'], + publisher_url=m['publisher_url'], + publishing_url=m['publishing_url'], + publishing_year=m['publishing_year'], + is_public=True, + title=m['title'], + user=nopaque_user, + version=m['version'], + pipeline_name=m['pipeline_name'] + ) + db.session.add(model) + db.session.flush(objects=[model]) + db.session.refresh(model) + if not os.path.exists(model.path) or force_download: + r = requests.get(m['url'], stream=True) + pbar = tqdm( + desc=f'{model.title} ({model.filename})', + unit="B", + unit_scale=True, + unit_divisor=1024, + total=int(r.headers['Content-Length']) + ) + pbar.clear() + with open(model.path, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + pbar.update(len(chunk)) + f.write(chunk) + pbar.close() db.session.commit() def delete(self):