From ca7df6d0edade6a319c84d749870c1aeb9efd6c6 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
Date: Fri, 19 Feb 2021 13:04:03 +0100
Subject: [PATCH] First work on version 1.0.0

---
 Dockerfile  | 42 +++++++++++++-------------
 README.md   | 85 +++++++++++++++++++++++------------------------------
 hocrtotei   |  4 ++-
 ocr         | 17 ++++++-----
 wrapper/ocr |  5 +++-
 5 files changed, 73 insertions(+), 80 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index a051cd9..2e07b90 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
 FROM debian:buster-slim
 
 
-LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"
+LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"
 
 
 ENV LANG=C.UTF-8
@@ -16,26 +16,22 @@ ENV PYFLOW_RELEASE=1.1.20
 ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
 RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
  && cd "pyflow-${PYFLOW_RELEASE}" \
- && apt-get update \
  && apt-get install --no-install-recommends --yes \
       python2.7 \
- && rm -r /var/lib/apt/lists/* \
  && python2.7 setup.py build install \
  && cd .. \
  && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
 
 
 ## Install ocropy ##
-ENV OCROPY_RELEASE 1.3.3
+ENV OCROPY_RELEASE=1.3.3
 ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
 RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
  && cd "ocropy-${OCROPY_RELEASE}" \
- && apt-get update \
  && apt-get install --no-install-recommends --yes \
       python-pil \
       python-tk \
       $(cat PACKAGES) \
- && rm -r /var/lib/apt/lists/* \
  && python2.7 setup.py install \
  && cd .. \
  && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
@@ -46,7 +42,6 @@ ENV TESSERACT_RELEASE=4.1.1
 ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
 RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
  && cd "tesseract-${TESSERACT_RELEASE}" \
- && apt-get update \
  && apt-get install --no-install-recommends --yes \
       autoconf \
       automake \
@@ -59,7 +54,6 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
       make \
       pkg-config \
       zlib1g-dev \
- && rm -r /var/lib/apt/lists/* \
  && ./autogen.sh \
  && ./configure \
  && make \
@@ -67,30 +61,34 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
  && ldconfig \
  && cd - > /dev/null \
  && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
-ADD "https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata" \
-    "https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata" \
-    "https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata" \
-    "https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata" \
-    "https://github.com/tesseract-ocr/tessdata_best/raw/master/frk.traineddata" \
-    "https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata" \
-    "https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata" \
-    "https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata" \
-    "https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata" \
-    "/usr/local/share/tessdata/"
-RUN chmod 644 /usr/local/share/tessdata/*.traineddata
+
+ENV TESSDATA_BEST_RELEASE=4.1.0
+ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" .
+RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/deu.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/eng.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/enm.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/fra.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frk.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frm.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ita.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/por.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/spa.traineddata" "/usr/local/share/tessdata/" \
+ && rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz"
 
 
 ## Further dependencies ##
-RUN apt-get update \
- && apt-get install --no-install-recommends --yes \
+RUN apt-get install --no-install-recommends --yes \
       ghostscript \
       python-pip \
       python3.7 \
       zip \
- && rm -r /var/lib/apt/lists/* \
  && pip install natsort
 
 
+RUN rm -r /var/lib/apt/lists/*
+
+
 ## Install Pipeline ##
 COPY hocrtotei ocr /usr/local/bin/
 
diff --git a/README.md b/README.md
index 8e56982..c371cff 100644
--- a/README.md
+++ b/README.md
@@ -1,62 +1,49 @@
-# OCR
+# OCR - Optical Character Recognition
 
-## Build image
+This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; a convenient wrapper script is provided for that purpose.
 
-1. Clone this repository and navigate into it:
-```
-git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr
+## Software used in this pipeline implementation
+- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
+- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
+- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
+- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
+- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
+
+
+## Use this image
+
+1. Create input and output directories for the pipeline.
+``` bash
+mkdir -p /<my_data_location>/input /<my_data_location>/output
 ```
 
-2. Build image:
-```
-docker build -t sfb1288inf/ocr:latest .
-```
+2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
 
-Alternatively build from the GitLab repository without cloning:
-
-1. Build image:
-```
-docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
+3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
 ```
+# Option one: Use the wrapper script
+## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/1.0.0/wrapper/ocr, make it executable and add it to your ${PATH}
+cd /<my_data_location>
+ocr -i input -l <language_code> -o output <pipeline_arguments>
 
-## Download prebuilt image
-
-The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
-
-1. Download image:
-```
-docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest
-```
-
-## Run
-
-1. Create input and output directories for the OCR software:
-```
-mkdir -p /<mydatalocation>/files_for_ocr /<mydatalocation>/files_from_ocr
-```
-
-2. Place your files inside the `/<mydatalocation>/files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. Files should all contain text of the same language.
-
-3. Start the OCR process.
-```
+# Option two: Classic Docker style
 docker run \
     --rm \
     -it \
     -u $(id -u $USER):$(id -g $USER) \
-    -v /<mydatalocation>/files_for_ocr:/input \
-    -v /<mydatalocation>/files_from_ocr:/output \
-    sfb1288inf/ocr:latest \
+    -v /<my_data_location>/input:/input \
+    -v /<my_data_location>/output:/output \
+    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
         -i /input \
-        -l <languagecode> \
-        -o /output
+        -l <language_code> \
+        -o /output \
+        <optional_pipeline_arguments>
 ```
-The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part.
 
-If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`.
+4. Check your results in the `/<my_data_location>/output` directory.
+
 
-4. Check your results in the `/<mydatalocation>/files_from_ocr` directory.
-
-### OCR arguments
+### Pipeline arguments
 
 `-l languagecode`
 * Tells tesseract which language will be used.
@@ -78,15 +65,15 @@ kept.
 * Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
 * default = False
 
-Example with all arguments used:
-```
+``` bash
+# Example with all arguments used
 docker run \
     --rm \
     -it \
     -u $(id -u $USER):$(id -g $USER) \
-    -v "$HOME"/ocr/files_for_ocr:/input \
-    -v "$HOME"/ocr/files_from_ocr:/output \
-    sfb1288inf/ocr:latest \
+    -v "$HOME"/ocr/input:/input \
+    -v "$HOME"/ocr/output:/output \
+    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
         -i /input \
         -l eng \
         -o /output \
diff --git a/hocrtotei b/hocrtotei
index 5f33a93..142f4f5 100755
--- a/hocrtotei
+++ b/hocrtotei
@@ -1,11 +1,13 @@
 #!/usr/bin/env python3.7
 # coding=utf-8
 
+"""Merges hOCR files into a TEI file."""
+
 from xml.sax.saxutils import escape
 from argparse import ArgumentParser
 import xml.etree.ElementTree as ET
 
-parser = ArgumentParser(description='Merges hOCR files to one P5 file.')
+parser = ArgumentParser(description='Merges hOCR files into a TEI file.')
 parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+')
 parser.add_argument('o', metavar='TEI-destfile',)
 args = parser.parse_args()
diff --git a/ocr b/ocr
index 0cedc33..9c7f64b 100755
--- a/ocr
+++ b/ocr
@@ -1,13 +1,10 @@
 #!/usr/bin/env python2.7
 # coding=utf-8
 
-"""
-ocr
+"""An OCR pipeline for PDF file processing."""
 
-Usage: For usage instructions run with option --help
-Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de
-         Stephan Porada <sporada@uni-bielefeld.de>
-"""
+__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>'
+__version__ = '1.0.0'
 
 from argparse import ArgumentParser
 from natsort import natsorted
@@ -22,7 +19,10 @@ TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa
 
 
 def parse_args():
-    parser = ArgumentParser(description='OCR Pipeline utilizing tesseract.')
+    parser = ArgumentParser(
+        description='An OCR pipeline for PDF file processing.',
+        prog='OCR pipeline'
+    )
     parser.add_argument('-i', '--input-directory',
                         help='Input directory (only PDF files get processed)',
                         required=True)
@@ -45,6 +45,9 @@ def parse_args():
                         help='Zips all results in different archives depending'
                              ' on result types. Also zips everything into one '
                              'archive.')
+    parser.add_argument('-v', '--version',
+                        action='version',
+                        version='%(prog)s {}'.format(__version__))
     return parser.parse_args()
 
 
diff --git a/wrapper/ocr b/wrapper/ocr
index 5c908a9..3ed3e18 100755
--- a/wrapper/ocr
+++ b/wrapper/ocr
@@ -1,11 +1,14 @@
 #!/usr/bin/env python3
 # coding=utf-8
 
+"""A wrapper to execute the OCR pipeline in a Docker container."""
+
 from argparse import ArgumentParser
 import os
 import subprocess
 
-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest'
+CONTAINER_IMAGE_TAG = '1.0.0'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:{}'.format(CONTAINER_IMAGE_TAG)  # noqa
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_INTERMEDIATE_DIR = '/intermediate'
 CONTAINER_OUTPUT_DIR = '/output'