Compare commits

20 Commits

Author SHA1 Message Date
ca1803ab8a Mark required arguments in scripts as required 2022-02-03 10:40:50 +01:00
4518ca1c83 Codestyle enhacements 2022-01-27 13:40:23 +01:00
aeab9b7802 Fix enumeration in readme 2022-01-18 13:46:52 +01:00
00c4b17018 Codestyle update 2022-01-18 13:45:17 +01:00
c057d324cf Cleanup and change some output options 2022-01-17 15:07:46 +01:00
f51a8c4546 Change output files file format 2022-01-14 10:56:16 +01:00
c640d9743f Add output_files.json (lists all output files) generation. 2022-01-05 11:25:00 +01:00
e3fd679b38 Mark all scripts as executeable 2022-01-04 13:21:38 +01:00
8a3816121c fix image tag 2022-01-04 12:10:26 +01:00
e1b78b6ba4 Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00
a0760487ae Don't process files in subdirectories 2021-04-12 13:22:28 +02:00
a798457c43 Add mising --log-dir argument to wrapper script 2021-04-12 09:53:59 +02:00
e2da0fb839 Tweak the README and pipeline help. 2021-03-26 10:03:59 +01:00
e78f667438 Use more descriptive argument names then i and o (now: input and output) 2021-03-18 10:32:55 +01:00
41f70da8eb Update the hocrtotei script 2021-03-17 16:58:13 +01:00
6db7f70446 Add back german language models 2021-03-17 14:26:24 +01:00
947658a7d8 Change intermediate image name in order to fix issues with building multiple branches/tags at the same time 2021-03-15 14:11:23 +01:00
acbf61be05 Cleanup and make use of globbing for input files for binarization and ocr 2021-03-15 12:45:05 +01:00
104598039e Dockerfile codestyle 2021-02-24 15:28:04 +01:00
da29659a9b Add back missing author mention 2021-02-24 15:17:42 +01:00
9 changed files with 831 additions and 566 deletions

View File

@ -1,8 +1,5 @@
image: docker:19.03.13
variables:
DOCKER_TLS_CERTDIR: "/certs"
services:
- docker:19.03.13-dind
@ -10,6 +7,10 @@ stages:
- build
- push
variables:
DOCKER_TLS_CERTDIR: "/certs"
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
.reg_setup:
before_script:
- apk add --no-cache curl
@ -28,8 +29,6 @@ build_image:
stage: build
tags:
- docker
variables:
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
push_master:
extends:
@ -47,7 +46,6 @@ push_master:
- docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
push_other:
extends:
@ -68,4 +66,3 @@ push_other:
- docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA

View File

@ -1,47 +1,59 @@
FROM debian:buster-slim
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
ENV LANG=C.UTF-8
RUN apt-get update
RUN apt-get update \
&& apt-get install --no-install-recommends --yes \
ghostscript \
procps \
python3.7 \
python3-pip \
rename \
wget \
zip \
&& python3 -m pip install lxml
# Install pipeline dependencies #
# Install the OCR pipeline and its dependencies #
## Install pyFlow ##
ENV PYFLOW_RELEASE=1.1.20
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
&& cd "pyflow-${PYFLOW_RELEASE}" \
ENV PYFLOW_VERSION=1.1.20
RUN wget --no-check-certificate --quiet \
"https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
&& cd "pyflow-${PYFLOW_VERSION}" \
&& apt-get install --no-install-recommends --yes \
python2.7 \
&& python2.7 setup.py build install \
&& cd .. \
&& rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
&& cd - > /dev/null \
&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
## Install ocropy ##
ENV OCROPY_RELEASE=1.3.3
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
&& cd "ocropy-${OCROPY_RELEASE}" \
ENV OCROPY_VERSION=1.3.3
RUN wget --no-check-certificate --quiet \
"https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
&& tar -xzf "v${OCROPY_VERSION}.tar.gz" \
&& cd "ocropy-${OCROPY_VERSION}" \
&& apt-get install --no-install-recommends --yes \
python2.7 \
python-pil \
python-tk \
$(cat PACKAGES) \
&& python2.7 setup.py install \
&& cd .. \
&& rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
&& cd - > /dev/null \
&& rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
## Install Tesseract OCR ##
ENV TESSERACT_RELEASE=4.1.1
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
&& cd "tesseract-${TESSERACT_RELEASE}" \
ENV TESSERACT_VERSION=5.0.0
RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \
&& cd "tesseract-${TESSERACT_VERSION}" \
&& apt-get install --no-install-recommends --yes \
autoconf \
automake \
@ -55,47 +67,19 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
pkg-config \
zlib1g-dev \
&& ./autogen.sh \
&& ./configure \
&& ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
&& make \
&& make install \
&& ldconfig \
&& cd - > /dev/null \
&& rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
ENV TESSDATA_BEST_RELEASE=4.1.0
ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" .
RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ara.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/chi_tra.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/dan.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/deu.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ell.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/eng.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/enm.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/fra.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frk.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frm.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ita.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/por.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/rus.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/spa.traineddata" "/usr/local/share/tessdata/" \
&& rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz"
## Further dependencies ##
RUN apt-get install --no-install-recommends --yes \
ghostscript \
python-pip \
python3.7 \
zip \
&& pip install natsort
&& rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
RUN rm -r /var/lib/apt/lists/*
## Install Pipeline ##
COPY hocrtotei ocr /usr/local/bin/
COPY hocr2tei hocr-combine ocr /usr/local/bin/
ENTRYPOINT ["ocr"]

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

106
README.md
View File

@ -1,83 +1,49 @@
# OCR - Optical Character Recognition
This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided.
This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
## Software used in this pipeline implementation
- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
- Software from Debian Buster's free repositories
- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0
## Installation
## Use this image
1. Install Docker and Python 3.
2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr`
4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
1. Create input and output directories for the pipeline.
``` bash
mkdir -p /<my_data_location>/input /<my_data_location>/output
```
## Use the Pipeline
2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
```
# Option one: Use the wrapper script
## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/1.0.0/wrapper/ocr, make it executable and add it to your ${PATH}
1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
2. Clear your `/<my_data_location>/output` directory.
3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
```bash
cd /<my_data_location>
ocr -i input -l <language_code> -o output <optional_pipeline_arguments>
# Option two: Classic Docker style
docker run \
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v /<my_data_location>/input:/input \
-v /<my_data_location>/output:/output \
gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
-i /input \
-l <language_code>
-o /output \
<optional_pipeline_arguments>
# <model_code> is the model filename without the ".traineddata" suffix
ocr \
--input-dir input \
--output-dir output \
--model-file models/<model>
-m <model_code> <optional_pipeline_arguments>
# More than one model
ocr \
--input-dir input \
--output-dir output \
--model-file models/<model1>
--model-file models/<model2>
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
# Instead of multiple --model-file statements, you can also use
ocr \
--input-dir input \
--output-dir output \
--model-file models/*
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
```
4. Check your results in the `/<my_data_location>/output` directory.
```
### Pipeline arguments
`-l languagecode`
* Tells tesseract which language will be used.
* options = ara (Arabic), chi_tra (Chinese - Traditional), dan (Danish), deu (German), ell (Greek, Modern (1453-)), eng (English), enm (Middle English), fra (French), frk (German Fraktur), frm (Middle French), ita (Italian), por (Portuguese), rus (Russian), spa (Spanish)
* required = True
`--keep-intermediates`
* If set, all intermediate files created during the OCR process will be
kept.
* default = False
* required = False
`--nCores corenumber`
* Sets the number of CPU cores being used during the OCR process.
* default = min(4, multiprocessing.cpu_count())
* required = False
`--skip-binarisation`
* Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
* default = False
``` bash
# Example with all arguments used
docker run \
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v "$HOME"/ocr/input:/input \
-v "$HOME"/ocr/output:/output \
gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
-i /input \
-l eng \
-o /output \
--keep_intermediates \
--nCores 8 \
--skip-binarisation
```

44
hocr-combine Executable file
View File

@ -0,0 +1,44 @@
#!/usr/bin/env python3.7
# coding=utf-8
''' Combine multiple hOCR files. '''
from argparse import ArgumentParser
from lxml import html

parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    nargs='+',
    required=True
)
parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
)
args = parser.parse_args()

# Expand the argument list: a value prefixed with "@" names a file that
# contains one input path per line; any other value is taken as a path
# itself.
# NOTE: the accumulator must be initialized BEFORE the loop — resetting it
# inside the loop would silently discard every input except the last one.
input_files = []
for input_file in args.input_file:
    if input_file.startswith('@'):
        with open(input_file[1:], 'r') as f:
            input_files += [x for x in f.read().split("\n") if x != '']
    else:
        input_files.append(input_file)

# Nothing to combine: signal failure to the caller.
if len(input_files) == 0:
    exit(1)

# Use the first document as the skeleton and graft every further
# ocr_page <div> into its <body>.
hocr = html.parse(input_files[0])
hocr_body = hocr.find('body')
for input_file in input_files[1:]:
    for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
        hocr_body.append(ocr_page)

# Serialize as HTML (hOCR is an HTML dialect), in binary mode so lxml
# controls the encoding.
with open(args.output_file, 'wb') as f:
    hocr.write(f, encoding='UTF-8', method='html')

68
hocr2tei Executable file
View File

@ -0,0 +1,68 @@
#!/usr/bin/env python3.7
# coding=utf-8
''' Convert hOCR to TEI XML. '''
from argparse import ArgumentParser
from lxml import html
from xml.sax.saxutils import escape
import re

parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    required=True
)
parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
)
args = parser.parse_args()

# Static TEI skeleton with an intentionally minimal (empty) header.
tei = ''
tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
tei += '  <teiHeader>\n'
tei += '    <fileDesc>\n'
tei += '      <titleStmt>\n'
tei += '        <title></title>\n'
tei += '      </titleStmt>\n'
tei += '      <publicationStmt>\n'
tei += '        <p></p>\n'
tei += '      </publicationStmt>\n'
tei += '      <sourceDesc>\n'
tei += '        <p></p>\n'
tei += '      </sourceDesc>\n'
tei += '    </fileDesc>\n'
tei += '  </teiHeader>\n'
tei += '  <text>\n'
tei += '    <body>\n'

hocr = html.parse(args.input_file)
# One <pb/> per hOCR page. Tesseract encodes the source image path and the
# physical page number inside the ocr_page div's "title" attribute, e.g.
# title='image "page-1.png"; bbox ...; ppageno 1'.
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
    ocr_page_title_attrib = ocr_page.attrib.get('title')
    facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
    page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
    # BUG FIX: the image path is emitted inside a double-quoted XML
    # attribute and may contain &, < or " — escape it like the word text
    # (page_number is \d+ only and needs no escaping).
    facsimile = escape(facsimile, {'"': '&quot;'})
    tei += f'      <pb facs="{facsimile}" n="{page_number}"/>\n'
    for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
        tei += '      <p>\n'
        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
            # <lb/> has no newline: the line's words follow on the same
            # output line, separated by single spaces.
            tei += '        <lb/>'
            is_first_word_in_line = True
            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
                if ocrx_word.text is not None:
                    if not is_first_word_in_line:
                        tei += ' '
                    tei += escape(ocrx_word.text)
                    is_first_word_in_line = False
            tei += '\n'
        tei += '      </p>\n'
tei += '    </body>\n'
tei += '  </text>\n'
tei += '</TEI>\n'

with open(args.output_file, 'w') as f:
    f.write(tei)

View File

@ -1,49 +0,0 @@
#!/usr/bin/env python3.7
# coding=utf-8
"""Merges hOCR files into a TEI file."""
from xml.sax.saxutils import escape
from argparse import ArgumentParser
import xml.etree.ElementTree as ET

parser = ArgumentParser(description='Merges hOCR files into a TEI file.')
parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+')
parser.add_argument('o', metavar='TEI-destfile',)
args = parser.parse_args()

# BUG FIX: open via a context manager so the output file is flushed and
# closed even if parsing one of the inputs raises; the original left the
# handle dangling on error.
with open(args.o, 'w') as output_file:
    # Minimal TEI skeleton with empty header sections.
    output_file.write(
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        + '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
        + '  <teiHeader>\n'
        + '    <fileDesc>\n'
        + '      <titleStmt/>\n'
        + '      <publicationStmt/>\n'
        + '      <sourceDesc/>\n'
        + '    </fileDesc>\n'
        + '    <encodingDesc/>\n'
        + '    <profileDesc/>\n'
        + '  </teiHeader>\n'
        + '  <text>\n'
        + '    <body>\n'
    )
    # Each input file becomes one page; <pb/> numbering follows the argument
    # order, starting at 1.
    for index, input_file in enumerate(args.i):
        tree = ET.parse(input_file)
        output_file.write('      <pb n="%i"/>\n' % (index + 1))
        for para in tree.findall('.//*[@class="ocr_par"]'):
            output_file.write('      <p>\n')
            for line in para.findall('.//*[@class="ocr_line"]'):
                first_word_in_line = True
                for word in line.findall('.//*[@class="ocrx_word"]'):
                    if word.text is not None:
                        # NOTE(review): both branches render as a single
                        # space here — the first-word branch presumably held
                        # wider indentation before whitespace was collapsed
                        # by extraction; confirm against repository history.
                        output_file.write((' ' if first_word_in_line else ' ') + escape(word.text.strip()))
                        first_word_in_line = False
                # Close the line only if it actually contained words.
                if not first_word_in_line:
                    output_file.write('<lb/>\n')
            output_file.write('      </p>\n')
    output_file.write(
        '    </body>\n'
        + '  </text>\n'
        + '</TEI>'
    )

961
ocr

File diff suppressed because it is too large Load Diff

View File

@ -1,43 +1,44 @@
#!/usr/bin/env python3
# coding=utf-8
"""A wrapper to execute the OCR pipeline in a Docker container"""
from argparse import ArgumentParser
import os
import subprocess
import sys

CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0'
# Fixed mount points inside the container; host paths are mapped onto these.
CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output'
CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
CONTAINER_LOG_DIR = '/logs'
# Run the container as the invoking user so output files are not root-owned.
UID = str(os.getuid())
GID = str(os.getgid())

# add_help=False: unknown flags (including --help) fall through to
# remaining_args and are forwarded to the pipeline inside the container.
parser = ArgumentParser(add_help=False)
parser.add_argument('-i', '--input-dir')
parser.add_argument('-o', '--output-dir')
parser.add_argument('-t', '--model-file', action='extend', nargs='+')
parser.add_argument('--log-dir')
args, remaining_args = parser.parse_known_args()

cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
if args.input_dir is not None:
    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['-i', CONTAINER_INPUT_DIR]
if args.output_dir is not None:
    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
if args.model_file is not None:
    # Mount each model file individually into the tessdata directory.
    for model_file in args.model_file:
        mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}'  # noqa
        cmd += ['-v', mapping]
if args.log_dir is not None:
    # BUG FIX: this literal was missing its f prefix, so the raw text
    # "{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}" was handed to
    # `docker -v` instead of the expanded host:container paths.
    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
cmd.append(CONTAINER_IMAGE)
cmd += remaining_args

# Propagate the container's exit status to our caller.
sys.exit(subprocess.run(cmd).returncode)