diff --git a/Dockerfile b/Dockerfile index a051cd9..2e07b90 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM debian:buster-slim -LABEL authors="Patrick Jentsch , Stephan Porada " +LABEL authors="Patrick Jentsch " ENV LANG=C.UTF-8 @@ -16,26 +16,22 @@ ENV PYFLOW_RELEASE=1.1.20 ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" . RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \ && cd "pyflow-${PYFLOW_RELEASE}" \ - && apt-get update \ && apt-get install --no-install-recommends --yes \ python2.7 \ - && rm -r /var/lib/apt/lists/* \ && python2.7 setup.py build install \ && cd .. \ && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz" ## Install ocropy ## -ENV OCROPY_RELEASE 1.3.3 +ENV OCROPY_RELEASE=1.3.3 ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" . RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \ && cd "ocropy-${OCROPY_RELEASE}" \ - && apt-get update \ && apt-get install --no-install-recommends --yes \ python-pil \ python-tk \ $(cat PACKAGES) \ - && rm -r /var/lib/apt/lists/* \ && python2.7 setup.py install \ && cd .. \ && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz" @@ -46,7 +42,6 @@ ENV TESSERACT_RELEASE=4.1.1 ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" . 
RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \ && cd "tesseract-${TESSERACT_RELEASE}" \ - && apt-get update \ && apt-get install --no-install-recommends --yes \ autoconf \ automake \ @@ -59,7 +54,6 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \ make \ pkg-config \ zlib1g-dev \ - && rm -r /var/lib/apt/lists/* \ && ./autogen.sh \ && ./configure \ && make \ @@ -67,30 +61,34 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \ && ldconfig \ && cd - > /dev/null \ && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz" -ADD "https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata" \ - "https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata" \ - "https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata" \ - "https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata" \ - "https://github.com/tesseract-ocr/tessdata_best/raw/master/frk.traineddata" \ - "https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata" \ - "https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata" \ - "https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata" \ - "https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata" \ - "/usr/local/share/tessdata/" -RUN chmod 644 /usr/local/share/tessdata/*.traineddata + +ENV TESSDATA_BEST_RELEASE=4.1.0 +ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" . 
+RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \ + && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/deu.traineddata" "/usr/local/share/tessdata/" \ + && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/eng.traineddata" "/usr/local/share/tessdata/" \ + && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/enm.traineddata" "/usr/local/share/tessdata/" \ + && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/fra.traineddata" "/usr/local/share/tessdata/" \ + && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frk.traineddata" "/usr/local/share/tessdata/" \ + && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frm.traineddata" "/usr/local/share/tessdata/" \ + && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ita.traineddata" "/usr/local/share/tessdata/" \ + && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/por.traineddata" "/usr/local/share/tessdata/" \ + && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/spa.traineddata" "/usr/local/share/tessdata/" \ + && rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz" ## Further dependencies ## -RUN apt-get update \ - && apt-get install --no-install-recommends --yes \ +RUN apt-get install --no-install-recommends --yes \ ghostscript \ python-pip \ python3.7 \ zip \ - && rm -r /var/lib/apt/lists/* \ && pip install natsort +RUN rm -r /var/lib/apt/lists/* + + ## Install Pipeline ## COPY hocrtotei ocr /usr/local/bin/ diff --git a/README.md b/README.md index 8e56982..c371cff 100644 --- a/README.md +++ b/README.md @@ -1,62 +1,49 @@ -# OCR +# OCR - Optical Character Recognition -## Build image +This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; for that purpose, a convenient wrapper script is provided. -1. 
Clone this repository and navigate into it: -``` -git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr +## Software used in this pipeline implementation +- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian +- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3 +- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20 +- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1 +- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0 + + +## Use this image + +1. Create input and output directories for the pipeline. +``` bash +mkdir -p //input //output ``` -2. Build image: -``` -docker build -t sfb1288inf/ocr:latest . -``` +2. Place your PDF files inside `//input`. Files should all contain text of the same language. -Alternatively build from the GitLab repository without cloning: - -1. Build image: -``` -docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git +3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details. ``` +# Option one: Use the wrapper script +## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/1.0.0/wrapper/ocr, make it executable and add it to your ${PATH} +cd / +ocr -i input -l -o output -## Download prebuilt image - -The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers. - -1. Download image: -``` -docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest -``` - -## Run - -1. Create input and output directories for the OCR software: -``` -mkdir -p //files_for_ocr //files_from_ocr -``` - -2. Place your files inside the `//files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. 
Files should all contain text of the same language. - -3. Start the OCR process. -``` +# Option two: Classic Docker style docker run \ --rm \ -it \ -u $(id -u $USER):$(id -g $USER) \ - -v //files_for_ocr:/input \ - -v //files_from_ocr:/output \ - sfb1288inf/ocr:latest \ + -v //input:/input \ + -v //output:/output \ + gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \ -i /input \ - -l \ - -o /output + -l \ + -o /output \ + ``` -The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part. -If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`. +4. Check your results in the `//output` directory. +``` -4. Check your results in the `//files_from_ocr` directory. - -### OCR arguments +### Pipeline arguments `-l languagecode` * Tells tesseract which language will be used. @@ -78,15 +65,15 @@ kept. * Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used. 
* default = False -Example with all arguments used: -``` +``` bash +# Example with all arguments used docker run \ --rm \ -it \ -u $(id -u $USER):$(id -g $USER) \ - -v "$HOME"/ocr/files_for_ocr:/input \ - -v "$HOME"/ocr/files_from_ocr:/output \ - sfb1288inf/ocr:latest \ + -v "$HOME"/ocr/input:/input \ + -v "$HOME"/ocr/output:/output \ + gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \ -i /input \ -l eng \ -o /output \ diff --git a/hocrtotei b/hocrtotei index 5f33a93..142f4f5 100755 --- a/hocrtotei +++ b/hocrtotei @@ -1,11 +1,13 @@ #!/usr/bin/env python3.7 # coding=utf-8 +"""Merges hOCR files into a TEI file.""" + from xml.sax.saxutils import escape from argparse import ArgumentParser import xml.etree.ElementTree as ET -parser = ArgumentParser(description='Merges hOCR files to one P5 file.') +parser = ArgumentParser(description='Merges hOCR files into a TEI file.') parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+') parser.add_argument('o', metavar='TEI-destfile',) args = parser.parse_args() diff --git a/ocr b/ocr index 0cedc33..9c7f64b 100755 --- a/ocr +++ b/ocr @@ -1,13 +1,10 @@ #!/usr/bin/env python2.7 # coding=utf-8 -""" -ocr +"""An OCR pipeline for PDF file processing.""" -Usage: For usage instructions run with option --help -Authors: Patrick Jentsch -""" +__author__ = 'Patrick Jentsch ' +__version__ = '1.0.0' from argparse import ArgumentParser from natsort import natsorted @@ -22,7 +19,10 @@ TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa def parse_args(): - parser = ArgumentParser(description='OCR Pipeline utilizing tesseract.') + parser = ArgumentParser( + description='An OCR pipeline for PDF file processing.', + prog='OCR pipeline' + ) parser.add_argument('-i', '--input-directory', help='Input directory (only PDF files get processed)', required=True) @@ -45,6 +45,9 @@ def parse_args(): help='Zips all results in different archives depending' ' on result types. 
Also zips everything into one ' 'archive.') + parser.add_argument('-v', '--version', + action='version', + version='%(prog)s {}'.format(__version__)) return parser.parse_args() diff --git a/wrapper/ocr b/wrapper/ocr index 5c908a9..3ed3e18 100755 --- a/wrapper/ocr +++ b/wrapper/ocr @@ -1,11 +1,14 @@ #!/usr/bin/env python3 # coding=utf-8 +"""A wrapper to execute the OCR pipeline in a Docker container""" + from argparse import ArgumentParser import os import subprocess -CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest' +CONTAINER_IMAGE_TAG = '1.0.0' +CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:{}'.format(CONTAINER_IMAGE_TAG) # noqa CONTAINER_INPUT_DIR = '/input' CONTAINER_INTERMEDIATE_DIR = '/intermediate' CONTAINER_OUTPUT_DIR = '/output'