From fa4a798351ce91fcb59d6cb22ef33c3e5cdf62e5 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Wed, 31 Jul 2019 11:13:55 +0200 Subject: [PATCH] Use language models from repository. Remove workaround for the legacy German Fraktur model. --- Dockerfile | 23 ++++++++++------------- README.md | 30 +----------------------------- ocr | 8 +------- 3 files changed, 12 insertions(+), 49 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8645ea4..704d5a2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,21 +42,18 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \ apt-get update && \ apt-get install -y --no-install-recommends \ - tesseract-ocr && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata + tesseract-ocr \ + tesseract-ocr-deu \ + tesseract-ocr-eng \ + tesseract-ocr-enm \ + tesseract-ocr-fra \ + tesseract-ocr-frk \ + tesseract-ocr-frm \ + tesseract-ocr-ita \ + tesseract-ocr-por \ + tesseract-ocr-spa COPY hocrtotei /usr/local/bin COPY ocr /usr/local/bin -RUN mkdir /input /output && \ - chmod a+rw /input /output - ENTRYPOINT ["ocr"] diff --git a/README.md b/README.md index 062100c..8e56982 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `git `-l languagecode` * Tells tesseract which language will be used. -* options = deu (German), deu_frak (German Fraktur), eng (English), enm (Middle englisch), fra (French), frm (Middle french), ita (Italian), por (Portuguese), spa (Spanish) +* options = deu (German), eng (English), enm (Middle englisch), fra (French), frk (German Fraktur), frm (Middle french), ita (Italian), por (Portuguese), spa (Spanish) * required = True `--keep-intermediates` @@ -94,31 +94,3 @@ docker run \ --nCores 8 \ --skip-binarisation ``` - -# Additional language models for OCR -Additional language models can be easily installed. Just add them analogical to the existing models to the `Dockerfile`. - -The standard language models for various languages can be found under https://github.com/tesseract-ocr/tessdata. Click on one of the languages and copy the link from the download button. The URL for Afrikaans (afr) would be for example https://github.com/tesseract-ocr/tessdata/raw/4.00/afr.traineddata. - -The more accurate but slower language models can be found under https://github.com/tesseract-ocr/tessdata_best. Click on one of the languages and copy the link from the download button. The URL for Afrikaans (afr) would be for example https://github.com/tesseract-ocr/tessdata_best/raw/master/afr.traineddata. - -Language models for fraktur fonts can also be found in the standard tessdata repository https://github.com/tesseract-ocr/tessdata. - -The `Dockerfile` section for the language models with added language support for Afrikaans would look like this: - -``` -RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \ - wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \ - apt-get update && \ - apt-get install -y --no-install-recommends tesseract-ocr && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/afr.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata -``` diff --git a/ocr b/ocr index 07e3913..28ecbd9 100755 --- a/ocr +++ b/ocr @@ -30,7 +30,7 @@ def parse_arguments(): parser.add_argument( '-l', choices=[ - 'deu', 'deu_frak', 'eng', 'enm', 'fra', 'frm', 'ita', 'por', 'spa' + 'deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa' ], dest='lang', required=True @@ -240,12 +240,6 @@ class OCRWorkflow(WorkflowRunner): ' the available core number. ''' ocr_job_n_cores = min(4, self.n_cores) - ''' - ' WORKAROUND: Tesseract only uses one core for the deu_frak language - ' model, so the workflow will also only reserve one in this case. - ''' - if self.lang == "deu_frak": - ocr_job_n_cores = 1 for index, job in enumerate(self.jobs): files = os.listdir(os.path.join(job['output_dir'], 'tmp')) if self.skip_binarisation: