mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-26 15:14:18 +00:00
Use language models from repository. Remove workaround for the legacy German Fraktur model.
This commit is contained in:
parent
1a3d7175fe
commit
fa4a798351
23
Dockerfile
23
Dockerfile
@ -42,21 +42,18 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et
|
|||||||
wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \
|
wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \
|
||||||
apt-get update && \
|
apt-get update && \
|
||||||
apt-get install -y --no-install-recommends \
|
apt-get install -y --no-install-recommends \
|
||||||
tesseract-ocr && \
|
tesseract-ocr \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-deu \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-eng \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-enm \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-fra \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-frk \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-frm \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-ita \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
tesseract-ocr-por \
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
|
tesseract-ocr-spa
|
||||||
|
|
||||||
COPY hocrtotei /usr/local/bin
|
COPY hocrtotei /usr/local/bin
|
||||||
COPY ocr /usr/local/bin
|
COPY ocr /usr/local/bin
|
||||||
|
|
||||||
RUN mkdir /input /output && \
|
|
||||||
chmod a+rw /input /output
|
|
||||||
|
|
||||||
ENTRYPOINT ["ocr"]
|
ENTRYPOINT ["ocr"]
|
||||||
|
30
README.md
30
README.md
@ -60,7 +60,7 @@ If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `git
|
|||||||
|
|
||||||
`-l languagecode`
|
`-l languagecode`
|
||||||
* Tells tesseract which language will be used.
|
* Tells tesseract which language will be used.
|
||||||
* options = deu (German), deu_frak (German Fraktur), eng (English), enm (Middle English), fra (French), frm (Middle French), ita (Italian), por (Portuguese), spa (Spanish)
|
* options = deu (German), eng (English), enm (Middle English), fra (French), frk (German Fraktur), frm (Middle French), ita (Italian), por (Portuguese), spa (Spanish)
|
||||||
* required = True
|
* required = True
|
||||||
|
|
||||||
`--keep-intermediates`
|
`--keep-intermediates`
|
||||||
@ -94,31 +94,3 @@ docker run \
|
|||||||
--nCores 8 \
|
--nCores 8 \
|
||||||
--skip-binarisation
|
--skip-binarisation
|
||||||
```
|
```
|
||||||
|
|
||||||
# Additional language models for OCR
|
|
||||||
Additional language models can be installed easily. Just add them to the `Dockerfile`, analogous to the existing models.
|
|
||||||
|
|
||||||
The standard language models for various languages can be found under https://github.com/tesseract-ocr/tessdata. Click on one of the languages and copy the link from the download button. The URL for Afrikaans (afr) would be for example https://github.com/tesseract-ocr/tessdata/raw/4.00/afr.traineddata.
|
|
||||||
|
|
||||||
The more accurate but slower language models can be found under https://github.com/tesseract-ocr/tessdata_best. Click on one of the languages and copy the link from the download button. The URL for Afrikaans (afr) would be for example https://github.com/tesseract-ocr/tessdata_best/raw/master/afr.traineddata.
|
|
||||||
|
|
||||||
Language models for fraktur fonts can also be found in the standard tessdata repository https://github.com/tesseract-ocr/tessdata.
|
|
||||||
|
|
||||||
The `Dockerfile` section for the language models with added language support for Afrikaans would look like this:
|
|
||||||
|
|
||||||
```
|
|
||||||
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
|
|
||||||
wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \
|
|
||||||
apt-get update && \
|
|
||||||
apt-get install -y --no-install-recommends tesseract-ocr && \
|
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/afr.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata \
|
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
|
|
||||||
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
|
|
||||||
```
|
|
||||||
|
8
ocr
8
ocr
@ -30,7 +30,7 @@ def parse_arguments():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-l',
|
'-l',
|
||||||
choices=[
|
choices=[
|
||||||
'deu', 'deu_frak', 'eng', 'enm', 'fra', 'frm', 'ita', 'por', 'spa'
|
'deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa'
|
||||||
],
|
],
|
||||||
dest='lang',
|
dest='lang',
|
||||||
required=True
|
required=True
|
||||||
@ -240,12 +240,6 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
' the available core number.
|
' the available core number.
|
||||||
'''
|
'''
|
||||||
ocr_job_n_cores = min(4, self.n_cores)
|
ocr_job_n_cores = min(4, self.n_cores)
|
||||||
'''
|
|
||||||
' WORKAROUND: Tesseract only uses one core for the deu_frak language
|
|
||||||
' model, so the workflow will also only reserve one in this case.
|
|
||||||
'''
|
|
||||||
if self.lang == "deu_frak":
|
|
||||||
ocr_job_n_cores = 1
|
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
|
files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
|
||||||
if self.skip_binarisation:
|
if self.skip_binarisation:
|
||||||
|
Loading…
Reference in New Issue
Block a user