First work on version 1.0.0

This commit is contained in:
Patrick Jentsch 2021-02-19 13:04:03 +01:00
parent 07635dcdfa
commit ca7df6d0ed
5 changed files with 73 additions and 80 deletions

View File

@ -1,7 +1,7 @@
FROM debian:buster-slim FROM debian:buster-slim
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>" LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"
ENV LANG=C.UTF-8 ENV LANG=C.UTF-8
@ -16,26 +16,22 @@ ENV PYFLOW_RELEASE=1.1.20
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" . ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \ RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
&& cd "pyflow-${PYFLOW_RELEASE}" \ && cd "pyflow-${PYFLOW_RELEASE}" \
&& apt-get update \
&& apt-get install --no-install-recommends --yes \ && apt-get install --no-install-recommends --yes \
python2.7 \ python2.7 \
&& rm -r /var/lib/apt/lists/* \
&& python2.7 setup.py build install \ && python2.7 setup.py build install \
&& cd .. \ && cd .. \
&& rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz" && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
## Install ocropy ## ## Install ocropy ##
ENV OCROPY_RELEASE 1.3.3 ENV OCROPY_RELEASE=1.3.3
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" . ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \ RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
&& cd "ocropy-${OCROPY_RELEASE}" \ && cd "ocropy-${OCROPY_RELEASE}" \
&& apt-get update \
&& apt-get install --no-install-recommends --yes \ && apt-get install --no-install-recommends --yes \
python-pil \ python-pil \
python-tk \ python-tk \
$(cat PACKAGES) \ $(cat PACKAGES) \
&& rm -r /var/lib/apt/lists/* \
&& python2.7 setup.py install \ && python2.7 setup.py install \
&& cd .. \ && cd .. \
&& rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz" && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
@ -46,7 +42,6 @@ ENV TESSERACT_RELEASE=4.1.1
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" . ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
&& cd "tesseract-${TESSERACT_RELEASE}" \ && cd "tesseract-${TESSERACT_RELEASE}" \
&& apt-get update \
&& apt-get install --no-install-recommends --yes \ && apt-get install --no-install-recommends --yes \
autoconf \ autoconf \
automake \ automake \
@ -59,7 +54,6 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
make \ make \
pkg-config \ pkg-config \
zlib1g-dev \ zlib1g-dev \
&& rm -r /var/lib/apt/lists/* \
&& ./autogen.sh \ && ./autogen.sh \
&& ./configure \ && ./configure \
&& make \ && make \
@ -67,30 +61,34 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
&& ldconfig \ && ldconfig \
&& cd - > /dev/null \ && cd - > /dev/null \
&& rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz" && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
ADD "https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata" \ ENV TESSDATA_BEST_RELEASE=4.1.0
"https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata" \ ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" .
"https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata" \ RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/frk.traineddata" \ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/deu.traineddata" "/usr/local/share/tessdata/" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata" \ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/eng.traineddata" "/usr/local/share/tessdata/" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata" \ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/enm.traineddata" "/usr/local/share/tessdata/" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata" \ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/fra.traineddata" "/usr/local/share/tessdata/" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata" \ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frk.traineddata" "/usr/local/share/tessdata/" \
"/usr/local/share/tessdata/" && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frm.traineddata" "/usr/local/share/tessdata/" \
RUN chmod 644 /usr/local/share/tessdata/*.traineddata && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ita.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/por.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/spa.traineddata" "/usr/local/share/tessdata/" \
&& rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz"
## Further dependencies ## ## Further dependencies ##
RUN apt-get update \ RUN apt-get install --no-install-recommends --yes \
&& apt-get install --no-install-recommends --yes \
ghostscript \ ghostscript \
python-pip \ python-pip \
python3.7 \ python3.7 \
zip \ zip \
&& rm -r /var/lib/apt/lists/* \
&& pip install natsort && pip install natsort
RUN rm -r /var/lib/apt/lists/*
## Install Pipeline ## ## Install Pipeline ##
COPY hocrtotei ocr /usr/local/bin/ COPY hocrtotei ocr /usr/local/bin/

View File

@ -1,62 +1,49 @@
# OCR # OCR - Optical Character Recognition
## Build image This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; for that purpose, a convenient wrapper script is provided.
1. Clone this repository and navigate into it: ## Software used in this pipeline implementation
``` - Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
## Use this image
1. Create input and output directories for the pipeline.
``` bash
mkdir -p /<my_data_location>/input /<my_data_location>/output
``` ```
2. Build image: 2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
```
docker build -t sfb1288inf/ocr:latest .
```
Alternatively build from the GitLab repository without cloning: 3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
1. Build image:
```
docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
``` ```
# Option one: Use the wrapper script
## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/1.0.0/wrapper/ocr, make it executable and add it to your ${PATH}
cd /<my_data_location>
ocr -i input -l <language_code> -o output <pipeline_arguments>
## Download prebuilt image # Option two: Classic Docker style
The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
1. Download image:
```
docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest
```
## Run
1. Create input and output directories for the OCR software:
```
mkdir -p /<mydatalocation>/files_for_ocr /<mydatalocation>/files_from_ocr
```
2. Place your files inside the `/<mydatalocation>/files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. Files should all contain text of the same language.
3. Start the OCR process.
```
docker run \ docker run \
--rm \ --rm \
-it \ -it \
-u $(id -u $USER):$(id -g $USER) \ -u $(id -u $USER):$(id -g $USER) \
-v /<mydatalocation>/files_for_ocr:/input \ -v /<my_data_location>/input:/input \
-v /<mydatalocation>/files_from_ocr:/output \ -v /<my_data_location>/output:/output \
sfb1288inf/ocr:latest \ gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
-i /input \ -i /input \
-l <languagecode> \ -l <language_code> \
-o /output -o /output \
<optional_pipeline_arguments>
``` ```
The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part.
If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`. 4. Check your results in the `/<my_data_location>/output` directory.
```
4. Check your results in the `/<mydatalocation>/files_from_ocr` directory. ### Pipeline arguments
### OCR arguments
`-l languagecode` `-l languagecode`
* Tells tesseract which language will be used. * Tells tesseract which language will be used.
@ -78,15 +65,15 @@ kept.
* Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used. * Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
* default = False * default = False
Example with all arguments used: ``` bash
``` # Example with all arguments used
docker run \ docker run \
--rm \ --rm \
-it \ -it \
-u $(id -u $USER):$(id -g $USER) \ -u $(id -u $USER):$(id -g $USER) \
-v "$HOME"/ocr/files_for_ocr:/input \ -v "$HOME"/ocr/input:/input \
-v "$HOME"/ocr/files_from_ocr:/output \ -v "$HOME"/ocr/output:/output \
sfb1288inf/ocr:latest \ gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
-i /input \ -i /input \
-l eng \ -l eng \
-o /output \ -o /output \

View File

@ -1,11 +1,13 @@
#!/usr/bin/env python3.7 #!/usr/bin/env python3.7
# coding=utf-8 # coding=utf-8
"""Merges hOCR files into a TEI file."""
from xml.sax.saxutils import escape from xml.sax.saxutils import escape
from argparse import ArgumentParser from argparse import ArgumentParser
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
parser = ArgumentParser(description='Merges hOCR files to one P5 file.') parser = ArgumentParser(description='Merges hOCR files into a TEI file.')
parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+') parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+')
parser.add_argument('o', metavar='TEI-destfile',) parser.add_argument('o', metavar='TEI-destfile',)
args = parser.parse_args() args = parser.parse_args()

17
ocr
View File

@ -1,13 +1,10 @@
#!/usr/bin/env python2.7 #!/usr/bin/env python2.7
# coding=utf-8 # coding=utf-8
""" """An OCR pipeline for PDF file processing."""
ocr
Usage: For usage instructions run with option --help __author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>'
Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de> __version__ = '1.0.0'
Stephan Porada <sporada@uni-bielefeld.de>
"""
from argparse import ArgumentParser from argparse import ArgumentParser
from natsort import natsorted from natsort import natsorted
@ -22,7 +19,10 @@ TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa
def parse_args(): def parse_args():
parser = ArgumentParser(description='OCR Pipeline utilizing tesseract.') parser = ArgumentParser(
description='An OCR pipeline for PDF file processing.',
prog='OCR pipeline'
)
parser.add_argument('-i', '--input-directory', parser.add_argument('-i', '--input-directory',
help='Input directory (only PDF files get processed)', help='Input directory (only PDF files get processed)',
required=True) required=True)
@ -45,6 +45,9 @@ def parse_args():
help='Zips all results in different archives depending' help='Zips all results in different archives depending'
' on result types. Also zips everything into one ' ' on result types. Also zips everything into one '
'archive.') 'archive.')
parser.add_argument('-v', '--version',
action='version',
version='%(prog)s {}'.format(__version__))
return parser.parse_args() return parser.parse_args()

View File

@ -1,11 +1,14 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# coding=utf-8 # coding=utf-8
"""A wrapper to execute the OCR pipeline in a Docker container"""
from argparse import ArgumentParser from argparse import ArgumentParser
import os import os
import subprocess import subprocess
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest' CONTAINER_IMAGE_TAG = '1.0.0'
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:{}'.format(CONTAINER_IMAGE_TAG) # noqa
CONTAINER_INPUT_DIR = '/input' CONTAINER_INPUT_DIR = '/input'
CONTAINER_INTERMEDIATE_DIR = '/intermediate' CONTAINER_INTERMEDIATE_DIR = '/intermediate'
CONTAINER_OUTPUT_DIR = '/output' CONTAINER_OUTPUT_DIR = '/output'