Use language models from repository. Remove workaround for the legacy German Fraktur model.

2026-07-04 07:55:37 +00:00 · 2019-07-31 11:13:55 +02:00
parent 1a3d7175fe
commit fa4a798351
3 changed files with 12 additions and 49 deletions
@@ -42,21 +42,18 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et
    wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
-    tesseract-ocr  && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
+    tesseract-ocr  \
+    tesseract-ocr-deu \
+    tesseract-ocr-eng \
+    tesseract-ocr-enm \
+    tesseract-ocr-fra \
+    tesseract-ocr-frk \
+    tesseract-ocr-frm \
+    tesseract-ocr-ita \
+    tesseract-ocr-por \
+    tesseract-ocr-spa

 COPY hocrtotei /usr/local/bin
 COPY ocr /usr/local/bin

-RUN mkdir /input /output && \
-    chmod a+rw /input /output
-
 ENTRYPOINT ["ocr"]
@@ -60,7 +60,7 @@ If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `git

 `-l languagecode`
 * Tells tesseract which language will be used.
-* options = deu (German), deu_frak (German Fraktur), eng (English), enm (Middle englisch), fra (French), frm (Middle french), ita (Italian), por (Portuguese), spa (Spanish)
+* options = deu (German), eng (English), enm (Middle English), fra (French), frk (German Fraktur), frm (Middle French), ita (Italian), por (Portuguese), spa (Spanish)
 * required = True

 `--keep-intermediates`
@@ -94,31 +94,3 @@ docker run \
        --nCores 8 \
        --skip-binarisation
 ```
-
-# Additional language models for OCR
-Additional language models can be easily installed. Just add them analogical to the existing models to the `Dockerfile`.
-
-The standard language models for various languages can be found under https://github.com/tesseract-ocr/tessdata. Click on one of the languages and copy the link from the download button. The URL for Afrikaans (afr) would be for example https://github.com/tesseract-ocr/tessdata/raw/4.00/afr.traineddata.
-
-The more accurate but slower language models can be found under https://github.com/tesseract-ocr/tessdata_best. Click on one of the languages and copy the link from the download button. The URL for Afrikaans (afr) would be for example https://github.com/tesseract-ocr/tessdata_best/raw/master/afr.traineddata.
-
-Language models for fraktur fonts can also be found in the standard tessdata repository https://github.com/tesseract-ocr/tessdata.
-
-The `Dockerfile` section for the language models with added language support for Afrikaans would look like this:
-
-```
-RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
-    wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends tesseract-ocr && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/afr.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
-    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
-```
@@ -30,7 +30,7 @@ def parse_arguments():
    parser.add_argument(
        '-l',
        choices=[
-            'deu', 'deu_frak', 'eng', 'enm', 'fra', 'frm', 'ita', 'por', 'spa'
+            'deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa'
        ],
        dest='lang',
        required=True
@@ -240,12 +240,6 @@ class OCRWorkflow(WorkflowRunner):
        ' the available core number.
        '''
        ocr_job_n_cores = min(4, self.n_cores)
-        '''
-        ' WORKAROUND: Tesseract only uses one core for the deu_frak language
-        ' model, so the workflow will also only reserve one in this case.
-        '''
-        if self.lang == "deu_frak":
-            ocr_job_n_cores = 1
        for index, job in enumerate(self.jobs):
            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
            if self.skip_binarisation: