Update to Tesseract 5.0.0, Set version 0.1.0

Patrick Jentsch 2022-01-04 11:42:55 +01:00
parent a0760487ae
commit e1b78b6ba4
7 changed files with 574 additions and 338 deletions

Dockerfile

@@ -9,8 +9,14 @@ ENV LANG=C.UTF-8
 RUN apt-get update \
  && apt-get install --no-install-recommends --yes \
-    wget \
+    ghostscript \
+    procps \
+    python3.7 \
+    python3-pip \
+    rename \
+    wget \
+    zip \
+ && python3 -m pip install lxml

 # Install the OCR pipeline and it's dependencies #
 ## Install pyFlow ##
@ -43,7 +49,7 @@ RUN wget --no-check-certificate --quiet \
## Install Tesseract OCR ## ## Install Tesseract OCR ##
ENV TESSERACT_VERSION=4.1.1 ENV TESSERACT_VERSION=5.0.0
RUN wget --no-check-certificate --quiet \ RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \ "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \ && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
@@ -61,37 +67,20 @@ RUN wget --no-check-certificate --quiet \
     pkg-config \
     zlib1g-dev \
  && ./autogen.sh \
- && ./configure \
+ && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
  && make \
  && make install \
  && ldconfig \
  && cd - > /dev/null \
  && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
-ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
-ENV TESSDATA_BEST_VERSION=4.1.0
-RUN wget --no-check-certificate --quiet \
- "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
- && tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
- && for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \
- && rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
-
-## Further dependencies ##
-RUN apt-get install --no-install-recommends --yes \
-    procps \
-    ghostscript \
-    python3.7 \
-    rename \
-    zip
-
-## Install Pipeline ##
-COPY hocrtotei ocr /usr/local/bin/
 RUN rm -r /var/lib/apt/lists/*
+
+## Install Pipeline ##
+COPY hocr2tei hocr-combine ocr /usr/local/bin/

 ENTRYPOINT ["ocr"]
 CMD ["--help"]

LICENSE (new file)

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

@@ -1,6 +1,6 @@
 # OCR - Optical Character Recognition
-This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided.
+This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
 ## Software used in this pipeline implementation
@@ -8,37 +8,26 @@ This software implements a heavily parallelized pipeline to recognize text in PD
 - Software from Debian Buster's free repositories
 - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
 - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
-- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
-- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
+- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0

-## Use this image
+## Installation

-1. Create input and output directories for the pipeline.
-``` bash
-mkdir -p /<my_data_location>/input /<my_data_location>/output
-```
+1. Install Docker and Python 3.
+2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
+3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0 ocr`
+4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
+5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
+6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.

-2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
+## Use the Pipeline

+1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
+2. Clear your `/<my_data_location>/output` directory.
 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
-```
-# Option one: Use the wrapper script
-## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/development/wrapper/ocr, make it executeable and add it to your ${PATH}
-cd /<my_data_location>
-ocr -i input -l <language_code> -o output <optional_pipeline_arguments>
-# or
-# Option two: Classic Docker style
-docker run \
-    --rm \
-    -it \
-    -u $(id -u $USER):$(id -g $USER) \
-    -v /<my_data_location>/input:/input \
-    -v /<my_data_location>/output:/output \
-    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:development \
-    -i /ocr_pipeline/input \
-    -l <language_code> \
-    -o /ocr_pipeline/output \
-    <optional_pipeline_arguments>
-```
+```bash
+cd /<my_data_location>
+ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments>
+# or
+ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments>
+```
 4. Check your results in the `/<my_data_location>/output` directory.
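
The new README drops the "classic Docker style" example, but the equivalent still works. A sketch assuming a single, hypothetically named model file `eng.traineddata`; the mounts mirror exactly what `wrapper/ocr` sets up:

```bash
# Run the pipeline container directly, mounting input, output and one model
# into the container's tessdata directory.
docker run --rm -it -u $(id -u $USER):$(id -g $USER) \
    -v /<my_data_location>/input:/input \
    -v /<my_data_location>/output:/output \
    -v /<my_data_location>/models/eng.traineddata:/usr/local/share/tessdata/eng.traineddata \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0 \
    -i /input -o /output -l eng <optional_pipeline_arguments>
```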

hocr-combine (new file)

@@ -0,0 +1,35 @@
#!/usr/bin/env python3.7
# coding=utf-8

"""Combine multiple hOCR files."""

from argparse import ArgumentParser
from lxml import html

parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument('file', help='Input file(s)', nargs='+')
parser.add_argument('-o', '--output-file', help='Output file', required=True)
args = parser.parse_args()

# Collect input files; arguments starting with "@" are list files whose
# lines are themselves input file paths.
files = []
for file in args.file:
    if file.startswith('@'):
        with open(file[1:], 'r') as f:
            files += [x for x in f.read().split("\n") if x != '']
    else:
        files.append(file)

if len(files) == 0:
    exit(1)

# Use the first file as the base document and append all further ocr_page
# divs to its body.
hocr = html.parse(files[0])
hocr_body = hocr.find('body')
for file in files[1:]:
    for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
        hocr_body.append(ocr_page)

with open(args.output_file, 'wb') as f:
    hocr.write(f, encoding='UTF-8', method='html')
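
Two ways to invoke it, mirroring how the pipeline's `hocr_combine` task builds an `@`-prefixed list file with `ls -dv` (file names are illustrative):

```bash
# Pass page-level hOCR files directly...
hocr-combine page-1.hocr page-2.hocr --output-file combined.hocr
# ...or collect them into a list file and reference it with "@":
ls -dv hocr/* > inputs.txt
hocr-combine @inputs.txt --output-file combined.hocr
```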

hocrtotei → hocr2tei (Executable file → Normal file)

@@ -3,16 +3,18 @@
 """"Convert hOCR to TEI XML."""
-from xml.sax.saxutils import escape
 from argparse import ArgumentParser
+from lxml import html
+from xml.sax.saxutils import escape
 import re
-import xml.etree.ElementTree as ET

 parser = ArgumentParser(description='Convert hOCR to TEI XML.')
-parser.add_argument('input', metavar='Path to hOCR input file')
-parser.add_argument('output', metavar='Path to TEI output file')
+parser.add_argument('file', help='Input file')
+parser.add_argument('-o', '--output-file', help='Output file', required=True)
 args = parser.parse_args()

 tei = ''
 tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
 tei += '  <teiHeader>\n'
@@ -30,28 +32,27 @@ tei += '  </fileDesc>\n'
 tei += '  </teiHeader>\n'
 tei += '  <text>\n'
 tei += '    <body>\n'
-# Conversion start
-hocr = ET.parse(args.input)
-for page in hocr.findall('.//*[@class="ocr_page"]'):
-    page_properties = page.attrib.get('title')
-    facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1)
-    page_number = re.search(r'ppageno (\d+)', page_properties).group(1)
-    tei += '      <pb facs="{}" n="{}"/>\n'.format(facsimile, page_number)
-    for para in page.findall('.//*[@class="ocr_par"]'):
+hocr = html.parse(args.file)
+for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
+    ocr_page_title_attrib = ocr_page.attrib.get('title')
+    facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
+    page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
+    tei += f'      <pb facs="{facsimile}" n="{page_number}"/>\n'
+    for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
         tei += '      <p>\n'
-        for line in para.findall('.//*[@class="ocr_line"]'):
+        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
             tei += '        <lb/>'
             indent = ''
-            for word in line.findall('.//*[@class="ocrx_word"]'):
-                if word.text is not None:
-                    tei += indent + escape(word.text.strip())
+            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
+                if ocrx_word.text is not None:
+                    tei += indent + escape(ocrx_word.text)
                     indent = ' '
             tei += '\n'
         tei += '      </p>\n'
-# Conversion end
 tei += '    </body>\n'
 tei += '  </text>\n'
 tei += '</TEI>\n'
-with open(args.output, 'w') as tei_file:
-    tei_file.write(tei)
+
+with open(args.output_file, 'w') as f:
+    f.write(tei)
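
Standalone usage, matching the invocation the pipeline's `CreateTEIWorkflow` issues (file names are illustrative):

```bash
# Convert a combined hOCR file to TEI XML.
hocr2tei document.hocr --output-file document.xml
```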

ocr

@@ -1,11 +1,9 @@
 #!/usr/bin/env python2.7
 # coding=utf-8

-"""OCR pipeline for PDF file processing."""
+''' OCR pipeline for PDF file processing. '''
+__version__ = '0.1.0'

-__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
-             'Stephan Porada <porada@posteo.de>'
-__version__ = '1.0.0'

 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
@@ -14,145 +12,402 @@ import os
 import sys


-class OCRPipelineJob:
-    """An OCR pipeline job class
+class PipelineJob:
+    '''
+    OCR pipeline job class.

     Each input file of the pipeline is represented as an OCR pipeline job,
     which holds all necessary information for the pipeline to process it.

     Arguments:
     file -- Path to the file
-    output_dir -- Path to a directory, where job results a stored
-    """
+    output_dir -- Path to a directory, where job results are stored
+    '''

     def __init__(self, file, output_dir):
         self.file = file
-        self.name = os.path.basename(file).rsplit('.', 1)[0]
+        self.name = os.path.basename(file)[:-4]
         self.output_dir = output_dir
-        self.page_dir = os.path.join(output_dir, 'pages')
+        self.tmp_dir = os.path.join(output_dir, 'tmp')

+class SplitInputWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # gs                                             #
+        ' ##################################################
+        '''
+        n_cores = min(2, self.getNCores())
+        mem_mb = min(n_cores * 512, self.getMemMb())
+        cmd = 'gs'
+        cmd += ' -dBATCH'
+        cmd += ' -dNOPAUSE'
+        cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000)
+        cmd += ' -dNumRenderingThreads={}'.format(n_cores)
+        cmd += ' -dQUIET'
+        cmd += ' -r300'
+        cmd += ' -sDEVICE=png16m'
+        cmd += ' -sOutputFile="{}/page-%d.png"'.format(
+            os.path.join(self.job.tmp_dir, 'images')
+        )
+        cmd += ' "{}"'.format(self.job.file)
+        self.addTask(
+            'gs',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class BinarizationWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # ocropus-nlbin                                  #
+        ' ##################################################
+        '''
+        # TODO: Update to newer ocropus-nlbin and start one task per page
+        n_cores = self.getNCores()
+        mem_mb = min(512 * n_cores, self.getMemMb())
+        cmd = 'ls -dv "{}/"* > "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images'),
+            os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
+        )
+        cmd += ' && '
+        cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa
+        cmd += ' --nocheck'
+        cmd += ' --output "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images'))
+        cmd += ' --parallel "{}"'.format(n_cores)
+        cmd += ' && '
+        cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa
+        ocropus_nlbin_task = self.addTask(
+            'ocropus_nlbin',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+        '''
+        ' ##################################################
+        ' # cleanup                                        #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(128, self.getMemMb())
+        cmd = 'cd "{}"'.format(os.path.join(self.job.tmp_dir, 'images'))
+        cmd += ' && '
+        cmd += 'mkdir tmp'
+        cmd += ' && '
+        cmd += 'mv *.bin.png tmp'
+        cmd += ' && '
+        cmd += 'rm *.png'
+        cmd += ' && '
+        cmd += 'mv tmp/* .'
+        cmd += ' && '
+        cmd += 'rmdir tmp'
+        cmd += ' && '
+        cmd += 'rename \'s/^0*/page-/\' *'
+        cmd += ' && '
+        cmd += 'rename \'s/.bin.png$/.png/\' *'
+        cmd += ' && '
+        cmd += 'cd -'
+        self.addTask(
+            'cleanup',
+            command=cmd,
+            dependencies=ocropus_nlbin_task,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class OCRWorkflow(WorkflowRunner):
+    def __init__(self, job, lang):
+        self.job = job
+        self.lang = lang
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # tesseract                                      #
+        ' ##################################################
+        '''
+        tesseract_tasks = []
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'images'))):  # noqa
+            cmd = 'tesseract "{}" "{}"'.format(
+                os.path.join(self.job.tmp_dir, 'images', file),
+                os.path.join(self.job.tmp_dir, file[:-4])
+            )
+            cmd += ' -l "{}"'.format(self.lang)
+            cmd += ' hocr pdf txt'
+            cmd += ' || '
+            cmd += 'echo "${?}"'
+            task = self.addTask(
+                'tesseract_-_{}'.format(i),
+                command=cmd,
+                env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
+                memMb=mem_mb,
+                nCores=n_cores
+            )
+            tesseract_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # move_files                                     #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(128, self.getMemMb())
+        for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
+            cmd = 'mv "{}/"*.{} "{}"'.format(
+                self.job.tmp_dir,
+                file_extension,
+                os.path.join(self.job.tmp_dir, file_extension)
+            )
+            self.addTask(
+                'move_{}_files'.format(file_extension),
+                command=cmd,
+                dependencies=tesseract_tasks,
+                memMb=mem_mb,
+                nCores=n_cores
+            )
+        cmd = 'mv "{}" "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images'),
+            os.path.join(self.job.output_dir)
+        )
+        self.addTask(
+            'move_image_files',
+            command=cmd,
+            dependencies=tesseract_tasks,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class CreateHOCRWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # fix-hocr                                       #
+        ' ##################################################
+        '''
+        fix_hocr_tasks = []
+        n_cores = 1
+        mem_mb = min(256, self.getMemMb())
+        for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'hocr'))):  # noqa
+            cmd = 'sed -i \'s>{}>images>g\' "{}"'.format(
+                os.path.join(self.job.tmp_dir, 'images'),
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>ppageno [0-9]\\+>ppageno {}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>page_[0-9]\\+>page_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>block_[0-9]\\+>block_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>par_[0-9]\\+>par_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>line_[0-9]\\+>line_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>word_[0-9]\\+>word_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            task = self.addTask(
+                'fix-hocr_-_{}'.format(i),
+                command=cmd,
+                memMb=mem_mb,
+                nCores=n_cores
+            )
+            fix_hocr_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # hocr-combine                                   #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        cmd = 'ls -dv "{}/"* > "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'hocr'),
+            os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
+        )
+        cmd += ' && '
+        cmd += 'hocr-combine "@{}"'.format(
+            os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
+        )
+        cmd += ' --output-file "{}.hocr"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        cmd += ' && '
+        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'hocr'))
+        self.addTask(
+            'hocr_combine',
+            command=cmd,
+            dependencies=fix_hocr_tasks,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class CreatePDFWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # pdf_combine                                    #
+        ' ##################################################
+        '''
+        n_cores = min(2, self.getNCores())
+        mem_mb = min(n_cores * 256, self.getMemMb())
+        cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'pdf'))
+        cmd += ' | '
+        cmd += 'xargs gs'
+        cmd += ' -dBATCH'
+        cmd += ' -dNOPAUSE'
+        cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000)
+        cmd += ' -dNumRenderingThreads={}'.format(n_cores)
+        cmd += ' -dPDFSETTINGS=/ebook'
+        cmd += ' -dQUIET'
+        cmd += ' -sDEVICE=pdfwrite'
+        cmd += ' -sOutputFile="{}.pdf"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        cmd += ' && '
+        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
+        self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
+
+
+class CreateTEIWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # hocr2tei                                       #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        cmd = 'hocr2tei "{}.hocr"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        cmd += ' --output-file "{}.xml"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
+
+
+class CreateTxtWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # txt_combine                                    #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'txt'))
+        cmd += ' | '
+        cmd += 'xargs cat'
+        cmd += ' > '
+        cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name))  # noqa
+        cmd += ' && '
+        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
+        self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores)


-class OCRPipeline(WorkflowRunner):
-    def __init__(self, input_dir, lang, output_dir, binarize, zip):
+class MainWorkflow(WorkflowRunner):
+    def __init__(self, input_dir, lang, output_dir, binarize):
         self.input_dir = input_dir
         self.lang = lang
         self.output_dir = output_dir
         self.binarize = binarize
-        self.zip = zip
-        self.jobs = collect_jobs(self.input_dir, self.output_dir)
+        self.jobs = self.collect_jobs()
+
+    def collect_jobs(self):
+        jobs = []
+        for file in os.listdir(self.input_dir):
+            if os.path.isdir(os.path.join(self.input_dir, file)):
+                continue
+            if file.lower().endswith('.pdf'):
+                job = PipelineJob(
+                    os.path.join(self.input_dir, file),
+                    os.path.join(self.output_dir, file)
+                )
+                jobs.append(job)
+        return jobs
     def workflow(self):
         if not self.jobs:
             return

-        '''
-        ' ##################################################
-        ' # setup output directory                         #
-        ' ##################################################
-        '''
-        setup_output_directory_tasks = []
-        for i, job in enumerate(self.jobs):
-            cmd = 'mkdir -p "{}"'.format(job.page_dir)
-            lbl = 'setup_output_directory_-_{}'.format(i)
-            task = self.addTask(command=cmd, label=lbl)
-            setup_output_directory_tasks.append(task)
+        # Create output and temporary directories
+        for job in self.jobs:
+            os.mkdir(job.output_dir)
+            os.mkdir(job.tmp_dir)
+            os.mkdir(os.path.join(job.tmp_dir, 'hocr'))
+            os.mkdir(os.path.join(job.tmp_dir, 'pdf'))
+            os.mkdir(os.path.join(job.tmp_dir, 'images'))
+            os.mkdir(os.path.join(job.tmp_dir, 'txt'))

         '''
         ' ##################################################
-        ' # split input                                    #
+        ' # split-input                                    #
         ' ##################################################
         '''
-        split_input_tasks = []
-        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
         for i, job in enumerate(self.jobs):
-            input_file = job.file
-            output_file = '{}/page-%d.tif'.format(job.page_dir)
-            cmd = 'gs'
-            cmd += ' -dBATCH'
-            cmd += ' -dNOPAUSE'
-            cmd += ' -dNumRenderingThreads={}'.format(n_cores)
-            cmd += ' -dQUIET'
-            cmd += ' -r300'
-            cmd += ' -sDEVICE=tiff24nc'
-            cmd += ' -sCompression=lzw'
-            cmd += ' "-sOutputFile={}"'.format(output_file)
-            cmd += ' "{}"'.format(input_file)
-            deps = 'setup_output_directory_-_{}'.format(i)
-            lbl = 'split_input_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl,
-                                nCores=n_cores)
-            split_input_tasks.append(task)
+            self.addWorkflowTask(
+                'split_input_-_{}'.format(i),
+                SplitInputWorkflow(job)
+            )

         if self.binarize:
-            '''
-            ' ##################################################
-            ' # pre binarization                               #
-            ' ##################################################
-            '''
-            pre_binarization_tasks = []
-            for i, job in enumerate(self.jobs):
-                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
-                cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
-                deps = 'split_input_-_{}'.format(i)
-                lbl = 'pre_binarization_-_{}'.format(i)
-                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-                pre_binarization_tasks.append(task)

             '''
             ' ##################################################
             ' # binarization                                   #
             ' ##################################################
             '''
-            binarization_tasks = []
-            n_cores = self.getNCores()
-            mem_mb = self.getMemMb()
             for i, job in enumerate(self.jobs):
-                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
-                cmd = 'ocropus-nlbin "@{}"'.format(input_file)
-                cmd += ' --nocheck'
-                cmd += ' --output "{}"'.format(job.page_dir)
-                cmd += ' --parallel "{}"'.format(n_cores)
-                deps = 'pre_binarization_-_{}'.format(i)
-                lbl = 'binarization_-_{}'.format(i)
-                task = self.addTask(command=cmd, dependencies=deps, label=lbl,
-                                    memMb=mem_mb, nCores=n_cores)
-                binarization_tasks.append(task)
+                self.addWorkflowTask(
+                    'binarization_-_{}'.format(i),
+                    BinarizationWorkflow(job),
+                    dependencies='split_input_-_{}'.format(i)
+                )

-            '''
-            ' ##################################################
-            ' # post binarization                              #
-            ' ##################################################
-            '''
-            post_binarization_tasks = []
-            for i, job in enumerate(self.jobs):
-                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
-                cmd = 'rm "{}"'.format(input_file)
-                cmd += ' && '
-                cmd += 'cd "{}"'.format(job.page_dir)
-                cmd += ' && '
-                cmd += 'rm *.{nrm.png,tif}'
-                cmd += ' && '
-                cmd += 'rename \'s/^0*/page-/\' *'
-                cmd += ' && '
-                cmd += 'cd -'
-                deps = 'binarization_-_{}'.format(i)
-                lbl = 'post_binarization_-_{}'.format(i)
-                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-                post_binarization_tasks.append(task)
-
-        '''
-        ' ##################################################
-        ' # pre ocr                                        #
-        ' ##################################################
-        '''
-        pre_ocr_tasks = []
-        for i, job in enumerate(self.jobs):
-            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
-            cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
-            deps = 'post_binarization_-_{}'.format(i) if self.binarize else 'split_input_-_{}'.format(i)  # noqa
-            lbl = 'pre_ocr_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            pre_ocr_tasks.append(task)

         '''
         ' ##################################################
@@ -160,175 +415,117 @@ class OCRPipeline(WorkflowRunner):
         ' ##################################################
         '''
         ocr_tasks = []
-        n_cores = min(4, self.getNCores())
-        mem_mb = min(n_cores * 2048, self.getMemMb())
         for i, job in enumerate(self.jobs):
-            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
-            output_file_base = os.path.join(job.output_dir, job.name)
-            cmd = 'tesseract "{}" "{}"'.format(input_file, output_file_base)
-            cmd += ' -l "{}"'.format(self.lang)
-            cmd += ' hocr pdf txt'
-            deps = 'pre_ocr_-_{}'.format(i)
-            lbl = 'ocr_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps,
-                                env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
-                                label=lbl, memMb=mem_mb, nCores=n_cores)
+            if self.binarize:
+                deps = 'binarization_-_{}'.format(i)
+            else:
+                deps = 'split_input_-_{}'.format(i)
+            task = self.addWorkflowTask(
+                'ocr_-_{}'.format(i),
+                OCRWorkflow(job, self.lang),
+                dependencies=deps
+            )
             ocr_tasks.append(task)

         '''
         ' ##################################################
-        ' # post ocr                                       #
+        ' # create-hocr                                    #
         ' ##################################################
         '''
-        post_ocr_tasks = []
+        create_hocr_tasks = []
         for i, job in enumerate(self.jobs):
-            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
-            output_file_base = os.path.join(job.output_dir, job.name)
-            cmd = 'rm "{}"'.format(input_file)
-            cmd += ' && '
-            cmd += 'sed -i \'s+{}+pages+g\' "{}.hocr"'.format(job.page_dir, output_file_base)  # noqa
-            deps = 'ocr_-_{}'.format(i)
-            lbl = 'post_ocr_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            post_ocr_tasks.append(task)
+            task = self.addWorkflowTask(
+                'create_hocr_-_{}'.format(i),
+                CreateHOCRWorkflow(job),
+                dependencies='ocr_-_{}'.format(i)
+            )
+            create_hocr_tasks.append(task)

         '''
         ' ##################################################
-        ' # hocr to tei                                    #
+        ' # create-pdf                                     #
         ' ##################################################
         '''
-        hocr_to_tei_tasks = []
+        create_pdf_tasks = []
         for i, job in enumerate(self.jobs):
-            output_file_base = os.path.join(job.output_dir, job.name)
-            cmd = 'hocrtotei "{}.hocr" "{}.xml"'.format(output_file_base, output_file_base)  # noqa
-            deps = 'post_ocr_-_{}'.format(i)
-            lbl = 'hocr_to_tei_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            hocr_to_tei_tasks.append(task)
+            task = self.addWorkflowTask(
+                'create_pdf_-_{}'.format(i),
+                CreatePDFWorkflow(job),
+                dependencies='ocr_-_{}'.format(i)
+            )
+            create_pdf_tasks.append(task)

         '''
         ' ##################################################
-        ' # zip creation                                   #
+        ' # create-tei                                     #
         ' ##################################################
         '''
-        zip_creation_tasks = []
-        if self.zip is not None:
-            # zip all files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.all.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = hocr_to_tei_tasks
-            lbl = 'zip_creation_-_all'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip PDF files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.pdf.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.pdf"'
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = ocr_tasks
-            lbl = 'zip_creation_-_pdf'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip TXT files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.txt.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.txt"'
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = ocr_tasks
-            lbl = 'zip_creation_-_txt'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip XML files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.xml.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.xml"'
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = hocr_to_tei_tasks
-            lbl = 'zip_creation_-_xml'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip PoCo bundles
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.poco.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = post_ocr_tasks
-            lbl = 'zip_creation_-_poco'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
+        create_tei_tasks = []
+        for i, job in enumerate(self.jobs):
+            task = self.addWorkflowTask(
+                'create_tei_-_{}'.format(i),
+                CreateTEIWorkflow(job),
+                dependencies='create_hocr_-_{}'.format(i)
+            )
+            create_tei_tasks.append(task)

+        '''
+        ' ##################################################
+        ' # create-txt                                     #
+        ' ##################################################
+        '''
+        create_txt_tasks = []
+        for i, job in enumerate(self.jobs):
+            task = self.addWorkflowTask(
+                'create_txt_-_{}'.format(i),
+                CreateTxtWorkflow(job),
+                dependencies='ocr_-_{}'.format(i)
+            )
+            create_txt_tasks.append(task)
+        # Remove temporary directories when all tasks are completed
+        self.waitForTasks()
+        for job in self.jobs:
+            os.rmdir(job.tmp_dir)

-def collect_jobs(input_dir, output_dir):
-    jobs = []
-    for file in os.listdir(input_dir):
-        if os.path.isdir(os.path.join(input_dir, file)):
-            continue
-        if file.lower().endswith('.pdf'):
-            job = OCRPipelineJob(os.path.join(input_dir, file),
-                                 os.path.join(output_dir, file))
-            jobs.append(job)
-    return jobs

 def parse_args():
-    parser = ArgumentParser(description='OCR pipeline for PDF file processing',
-                            prog='OCR pipeline')
-    parser.add_argument('-i', '--input-dir',
-                        help='Input directory',
-                        required=True)
-    parser.add_argument('-o', '--output-dir',
-                        help='Output directory',
-                        required=True)
-    parser.add_argument('-l', '--language',
-                        choices=list(map(lambda x: x[:-12], filter(lambda x: x.endswith('.traineddata'), os.listdir('/usr/local/share/tessdata')))),  # noqa
-                        help='Language of the input '
-                             '(3-character ISO 639-2 language codes)',
-                        required=True)
-    parser.add_argument('--binarize',
-                        action='store_true',
-                        help='Add binarization as a preprocessing step')
-    parser.add_argument('--log-dir',
-                        help='Logging directory')
-    parser.add_argument('--mem-mb',
-                        help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa
-                        type=int)
-    parser.add_argument('--n-cores',
-                        default=min(4, multiprocessing.cpu_count()),
-                        help='Number of CPU threads to be used (Default: min(4, number of CPUs))',  # noqa
-                        type=int)
-    parser.add_argument('--zip',
-                        help='Create one zip file per filetype')
-    parser.add_argument('-v', '--version',
-                        action='version',
-                        help='Returns the current version of the OCR pipeline',
-                        version='%(prog)s {}'.format(__version__))
+    parser = ArgumentParser(description='OCR pipeline for PDF file processing')
+    parser.add_argument(
+        '-i', '--input-dir', help='Input directory', required=True)
+    parser.add_argument(
+        '-o', '--output-dir', help='Output directory', required=True)
+    parser.add_argument(
+        '-l', '--language',
+        choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata')
+                 if x.endswith('.traineddata') and len(x) > 12],
+        help='Language of the input (3-character ISO 639-2 language codes)',
+        required=True
+    )
+    parser.add_argument(
+        '--binarize',
+        action='store_true',
+        help='Add binarization as a preprocessing step'
+    )
+    parser.add_argument(
+        '--log-dir', help='Logging directory (Default: --output-dir)')
+    parser.add_argument(
+        '--mem-mb',
+        help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))',  # noqa
+        type=int
+    )
+    parser.add_argument(
+        '--n-cores',
+        default=min(4, multiprocessing.cpu_count()),
+        help='Number of CPU threads to be used (Default: min(4, CPU count))',
+        type=int
+    )
+    parser.add_argument(
+        '-v', '--version',
+        action='version',
+        help='Returns the current version of the OCR pipeline',
+        version='%(prog)s {}'.format(__version__)
+    )
     args = parser.parse_args()

     # Set some tricky default values and check for insufficient input
@@ -338,20 +535,18 @@ def parse_args():
         raise Exception('--n-cores must be greater or equal 1')
     if args.mem_mb is None:
         max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
-        args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
-    if args.mem_mb < 2048:
-        raise Exception('--mem-mb must be greater or equal 2048')
-    if args.zip is not None and args.zip.lower().endswith('.zip'):
-        # Remove .zip file extension if provided
-        args.zip = args.zip[:-4]
-        args.zip = args.zip if args.zip else 'output'
+        args.mem_mb = min(args.n_cores * 512, max_mem_mb)
+    if args.mem_mb < 512:
+        raise Exception('--mem-mb must be greater or equal 512')
     return args

 def main():
     args = parse_args()
-    ocr_pipeline = OCRPipeline(args.input_dir, args.language, args.output_dir, args.binarize, args.zip)  # noqa
-    retval = ocr_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa
+    ocr_pipeline = MainWorkflow(
+        args.input_dir, args.language, args.output_dir, args.binarize)
+    retval = ocr_pipeline.run(
+        dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
     sys.exit(retval)
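
An illustrative in-container invocation under the new defaults (the language/model `eng` is hypothetical): note that `--mem-mb` now defaults to min(`--n-cores` * 512, available memory) instead of the old 2048 MB per core, because each tesseract task is single-page and single-threaded, and that the `--zip` option is gone:

```bash
# Process all PDFs in /input with optional binarization, capped at
# 4 workers and 2048 MB of memory overall.
ocr -i /input -o /output -l eng --binarize --n-cores 4 --mem-mb 2048
```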

wrapper/ocr

@@ -6,9 +6,10 @@ import os
 import subprocess
 import sys

-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0'
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_OUTPUT_DIR = '/output'
+CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
 CONTAINER_LOG_DIR = '/logs'
 UID = str(os.getuid())
 GID = str(os.getgid())
@@ -16,20 +17,25 @@ GID = str(os.getgid())
 parser = ArgumentParser(add_help=False)
 parser.add_argument('-i', '--input-dir')
 parser.add_argument('-o', '--output-dir')
+parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+')
 parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()

-cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
+cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
 if args.input_dir is not None:
-    mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR
+    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['-i', CONTAINER_INPUT_DIR]
 if args.output_dir is not None:
-    mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR
+    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
+if args.models is not None:
+    for model in args.models:
+        mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}'  # noqa
+        cmd += ['-v', mapping]
 if args.log_dir is not None:
-    mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR
+    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
 cmd.append(CONTAINER_IMAGE)
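
With the new `-m`/`--model` option, the wrapper bind-mounts each given `.traineddata` file into the container's tessdata directory, so the pipeline's `-l` choices pick it up automatically. A sketch of a full wrapper call (the model file name is hypothetical):

```bash
# Mount one model and keep pyFlow's logs in a separate directory.
cd /<my_data_location>
ocr -i input -o output -m models/eng.traineddata -l eng --log-dir logs
```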