Mark required arguments in scripts as required

Codestyle enhacements
Fix enumeration in readme
2025-07-09 18:23:17 +00:00 · 2022-02-03 10:40:50 +01:00 · 2022-01-27 13:40:23 +01:00 · 2022-01-18 13:46:52 +01:00 · 2022-01-18 13:45:17 +01:00 · 2022-01-17 15:07:46 +01:00
9 changed files with 831 additions and 566 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -1,8 +1,5 @@
 image: docker:19.03.13
 variables:
  DOCKER_TLS_CERTDIR: "/certs"
 services:
  - docker:19.03.13-dind
@ -10,6 +7,10 @@ stages:
  - build
  - push
 variables:
  DOCKER_TLS_CERTDIR: "/certs"
  INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
 .reg_setup:
  before_script:
    - apk add --no-cache curl
@ -28,8 +29,6 @@ build_image:
  stage: build
  tags:
    - docker
  variables:
    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
 push_master:
  extends:
@ -47,7 +46,6 @@ push_master:
    - docker
  variables:
    IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
 push_other:
  extends:
@ -68,4 +66,3 @@ push_other:
    - docker
  variables:
    IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
--- a/86
+++ b/86
@ -1,47 +1,59 @@
 FROM debian:buster-slim
-LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"
+LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
 ENV LANG=C.UTF-8
-RUN apt-get update
+RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ghostscript \
      procps \
      python3.7 \
      python3-pip \
      rename \
      wget \
      zip \
 && python3 -m pip install lxml
-
+# Install the OCR pipeline and its dependencies #
 # Install pipeline dependencies #
 ## Install pyFlow ##
-ENV PYFLOW_RELEASE=1.1.20
+ENV PYFLOW_VERSION=1.1.20
-ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
+RUN wget --no-check-certificate --quiet \
-RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
+      "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
- && cd "pyflow-${PYFLOW_RELEASE}" \
+ && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
 && cd "pyflow-${PYFLOW_VERSION}" \
 && apt-get install --no-install-recommends --yes \
      python2.7 \
 && python2.7 setup.py build install \
- && cd .. \
+ && cd - > /dev/null \
- && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
+ && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
 ## Install ocropy ##
-ENV OCROPY_RELEASE=1.3.3
+ENV OCROPY_VERSION=1.3.3
-ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
+RUN wget --no-check-certificate --quiet \
-RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
+      "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
- && cd "ocropy-${OCROPY_RELEASE}" \
+ && tar -xzf "v${OCROPY_VERSION}.tar.gz" \
 && cd "ocropy-${OCROPY_VERSION}" \
 && apt-get install --no-install-recommends --yes \
      python2.7 \
      python-pil \
      python-tk \
      $(cat PACKAGES) \
 && python2.7 setup.py install \
- && cd .. \
+ && cd - > /dev/null \
- && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
+ && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
 ## Install Tesseract OCR ##
-ENV TESSERACT_RELEASE=4.1.1
+ENV TESSERACT_VERSION=5.0.0
-ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
+RUN wget --no-check-certificate --quiet \
-RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
+      "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
- && cd "tesseract-${TESSERACT_RELEASE}" \
+ && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
 && cd "tesseract-${TESSERACT_VERSION}" \
 && apt-get install --no-install-recommends --yes \
      autoconf \
      automake \
@ -55,47 +67,19 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
      pkg-config \
      zlib1g-dev \
 && ./autogen.sh \
- && ./configure \
+ && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
 && make \
 && make install \
 && ldconfig \
 && cd - > /dev/null \
- && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
+ && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
 ENV TESSDATA_BEST_RELEASE=4.1.0
 ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" .
 RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ara.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/chi_tra.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/dan.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/deu.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ell.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/eng.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/enm.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/fra.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frk.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frm.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ita.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/por.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/rus.traineddata" "/usr/local/share/tessdata/" \
 && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/spa.traineddata" "/usr/local/share/tessdata/" \
 && rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz"
 ## Further dependencies ##
 RUN apt-get install --no-install-recommends --yes \
      ghostscript \
      python-pip \
      python3.7 \
      zip \
 && pip install natsort
 RUN rm -r /var/lib/apt/lists/*
 ## Install Pipeline ##
-COPY hocrtotei ocr /usr/local/bin/
+COPY hocr2tei hocr-combine ocr /usr/local/bin/
 ENTRYPOINT ["ocr"]
--- a/21
+++ b/21
@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/README.md
+++ b/README.md
@ -1,83 +1,49 @@
 # OCR - Optical Character Recognition
-This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided.
+This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
 ## Software used in this pipeline implementation
- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
+
 - Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
  - Software from Debian Buster's free repositories
 - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
 - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
+- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0
 - tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
 ## Installation
-## Use this image
+1. Install Docker and Python 3.
 2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
 3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr`
 4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
 5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
 6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
-1. Create input and output directories for the pipeline.
+## Use the Pipeline
 ``` bash
 mkdir -p /<my_data_location>/input /<my_data_location>/output
 ```
-2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
+1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
-
+2. Clear your `/<my_data_location>/output` directory.
-3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
+3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
-```
+```bash
 # Option one: Use the wrapper script
 ## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/1.0.0/wrapper/ocr, make it executable and add it to your ${PATH}
 cd /<my_data_location>
-ocr -i input -l <language_code> -o output <optional_pipeline_arguments>
+# <model_code> is the model filename without the ".traineddata" suffix
-
+ocr \
-# Option two: Classic Docker style
+  --input-dir input \
-docker run \
+  --output-dir output \
-    --rm \
+  --model-file models/<model> \
-    -it \
+  -m <model_code> <optional_pipeline_arguments>
-    -u $(id -u $USER):$(id -g $USER) \
+# More than one model
-    -v /<my_data_location>/input:/input \
+ocr \
-    -v /<my_data_location>/output:/output \
+  --input-dir input \
-    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
+  --output-dir output \
-        -i /input \
+  --model-file models/<model1> \
-        -l <language_code>
+  --model-file models/<model2> \
-        -o /output \
+  -m <model1_code>+<model2_code> <optional_pipeline_arguments>
-        <optional_pipeline_arguments>
+# Instead of multiple --model-file statements, you can also use
 ocr \
  --input-dir input \
  --output-dir output \
  --model-file models/* \
  -m <model1_code>+<model2_code> <optional_pipeline_arguments>
 ```
 4. Check your results in the `/<my_data_location>/output` directory.
 ```
 ### Pipeline arguments
 `-l languagecode`
 * Tells tesseract which language will be used.
 * options = ara (Arabic), chi_tra (Chinese - Traditional), dan (Danish), deu (German), ell (Greek, Modern (1453-)), eng (English), enm (Middle English), fra (French), frk (German Fraktur), frm (Middle French), ita (Italian), por (Portuguese), rus (Russian), spa (Spanish)
 * required = True
 `--keep-intermediates`
 * If set, all intermediate files created during the OCR process will be
 kept.
 * default = False
 * required = False
 `--nCores corenumber`
 * Sets the number of CPU cores being used during the OCR process.
 * default = min(4, multiprocessing.cpu_count())
 * required = False
 `--skip-binarisation`
 * Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
 * default = False
 ``` bash
 # Example with all arguments used
 docker run \
    --rm \
    -it \
    -u $(id -u $USER):$(id -g $USER) \
    -v "$HOME"/ocr/input:/input \
    -v "$HOME"/ocr/output:/output \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
        -i /input \
        -l eng \
        -o /output \
        --keep_intermediates \
        --nCores 8 \
        --skip-binarisation
 ```
--- a/44
+++ b/44
@ -0,0 +1,44 @@
 #!/usr/bin/env python3.7
 # coding=utf-8
 ''' Combine multiple hOCR files. '''
 from argparse import ArgumentParser
 from lxml import html
 parser = ArgumentParser(description='Combine multiple hOCR files.')
 parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    nargs='+',
    required=True
 )
 parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
 )
 args = parser.parse_args()
 # Collect every hOCR path to merge. The accumulator is created once, before
 # the loop, so paths from all -i/--input-file arguments are kept (creating
 # it inside the loop would discard everything but the last argument).
 input_files = []
 for input_file in args.input_file:
    if input_file.startswith('@'):
        # "@<path>" names a list file: one hOCR path per line, blank lines
        # are ignored.
        with open(input_file[1:], 'r') as f:
            input_files += [x for x in f.read().split("\n") if x != '']
    else:
        input_files.append(input_file)
 # Nothing to combine (e.g. only empty list files were given): fail.
 if len(input_files) == 0:
    exit(1)
 hocr = html.parse(input_files[0])
 hocr_body = hocr.find('body')
 for input_file in input_files[1:]:
    for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
        hocr_body.append(ocr_page)
 with open(args.output_file, 'wb') as f:
    hocr.write(f, encoding='UTF-8', method='html')
--- a/68
+++ b/68
@ -0,0 +1,68 @@
 #!/usr/bin/env python3.7
 # coding=utf-8
 ''' Convert hOCR to TEI XML. '''
 from argparse import ArgumentParser
 from lxml import html
 from xml.sax.saxutils import escape
 import re
 parser = ArgumentParser(description='Convert hOCR to TEI XML.')
 parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    required=True
 )
 parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
 )
 args = parser.parse_args()
 tei = ''
 tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
 tei += '  <teiHeader>\n'
 tei += '    <fileDesc>\n'
 tei += '      <titleStmt>\n'
 tei += '        <title></title>\n'
 tei += '      </titleStmt>\n'
 tei += '      <publicationStmt>\n'
 tei += '        <p></p>\n'
 tei += '      </publicationStmt>\n'
 tei += '      <sourceDesc>\n'
 tei += '        <p></p>\n'
 tei += '      </sourceDesc>\n'
 tei += '    </fileDesc>\n'
 tei += '  </teiHeader>\n'
 tei += '  <text>\n'
 tei += '    <body>\n'
 # Parse the hOCR input and append one TEI fragment per recognised page to
 # the `tei` string that is being built up.
 hocr = html.parse(args.input_file)
 for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
    # The page's `title` attribute is searched for `image "<name>"` and
    # `ppageno <digits>`.
    # NOTE(review): if the attribute is missing or either pattern does not
    # match, re.search()/.group(1) raises instead of skipping the page.
    ocr_page_title_attrib = ocr_page.attrib.get('title')
    facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
    page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
    # Page break marker carrying the image name and the page number.
    # NOTE(review): `facsimile` is inserted into the attribute unescaped.
    tei += f'      <pb facs="{facsimile}" n="{page_number}"/>\n'
    for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
        tei += '      <p>\n'
        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
            # Each hOCR line starts with a <lb/> followed by its words.
            tei += '        <lb/>'
            is_first_word_in_line = True
            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
                # Words without text are skipped; the remaining words are
                # XML-escaped and separated by a single space.
                if ocrx_word.text is not None:
                    if not is_first_word_in_line:
                        tei += ' '
                    tei += escape(ocrx_word.text)
                    is_first_word_in_line = False
            tei += '\n'
        tei += '      </p>\n'
 tei += '    </body>\n'
 tei += '  </text>\n'
 tei += '</TEI>\n'
 # The generated TEI has no XML declaration, so consumers will decode it as
 # UTF-8; write it as UTF-8 explicitly instead of relying on the locale's
 # default encoding.
 with open(args.output_file, 'w', encoding='utf-8') as f:
    f.write(tei)
--- a/49
+++ b/49
@ -1,49 +0,0 @@
 #!/usr/bin/env python3.7
 # coding=utf-8
 """"Merges hOCR files into a TEI file."""
 from xml.sax.saxutils import escape
 from argparse import ArgumentParser
 import xml.etree.ElementTree as ET
 parser = ArgumentParser(description='Merges hOCR files into a TEI file.')
 parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+')
 parser.add_argument('o', metavar='TEI-destfile',)
 args = parser.parse_args()
 output_file = open(args.o, 'w')
 output_file.write(
      '<?xml version="1.0" encoding="UTF-8"?>\n'
    + '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
    + '    <teiHeader>\n'
    + '        <fileDesc>\n'
    + '            <titleStmt/>\n'
    + '            <publicationStmt/>\n'
    + '            <sourceDesc/>\n'
    + '        </fileDesc>\n'
    + '        <encodingDesc/>\n'
    + '        <profileDesc/>\n'
    + '    </teiHeader>\n'
    + '    <text>\n'
    + '        <body>\n'
 )
 for index, input_file in enumerate(args.i):
    tree = ET.parse(input_file)
    output_file.write('            <pb n="%i"/>\n' % (index + 1))
    for para in tree.findall('.//*[@class="ocr_par"]'):
        output_file.write('            <p>\n')
        for line in para.findall('.//*[@class="ocr_line"]'):
            first_word_in_line = True
            for word in line.findall('.//*[@class="ocrx_word"]'):
                if word.text is not None:
                    output_file.write(('                ' if first_word_in_line else ' ') + escape(word.text.strip()))
                    first_word_in_line = False
            if not first_word_in_line:
                output_file.write('<lb/>\n')
        output_file.write('            </p>\n')
 output_file.write(
      '        </body>\n'
    + '    </text>\n'
    + '</TEI>')
 output_file.close()
--- a/961
+++ b/961
--- a/wrapper/ocr
+++ b/wrapper/ocr
@ -1,43 +1,44 @@
 #!/usr/bin/env python3
 # coding=utf-8
 """A wrapper to execute the OCR pipeline in a Docker container"""
 from argparse import ArgumentParser
 import os
 import subprocess
 import sys
-CONTAINER_IMAGE_TAG = '1.0.0'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0'
 CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:{}'.format(CONTAINER_IMAGE_TAG)  # noqa
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_INTERMEDIATE_DIR = '/intermediate'
 CONTAINER_OUTPUT_DIR = '/output'
 CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
 CONTAINER_LOG_DIR = '/logs'
 UID = str(os.getuid())
 GID = str(os.getgid())
 parser = ArgumentParser(add_help=False)
-parser.add_argument('-i', '--input-directory')
+parser.add_argument('-i', '--input-dir')
-parser.add_argument('-o', '--output-directory')
+parser.add_argument('-o', '--output-dir')
-parser.add_argument('--intermediate-directory')
+parser.add_argument('-t', '--model-file', action='extend', nargs='+')
 parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()
-cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
+cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
-if args.intermediate_directory is not None:
+if args.input_dir is not None:
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.intermediate_directory),
+    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
-                                 CONTAINER_INTERMEDIATE_DIR)]
+    cmd += ['-v', mapping]
-    remaining_args.insert(0, CONTAINER_INTERMEDIATE_DIR)
+    remaining_args += ['-i', CONTAINER_INPUT_DIR]
-    remaining_args.insert(0, '--intermediate-directory')
+if args.output_dir is not None:
-if args.output_directory is not None:
+    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
+    cmd += ['-v', mapping]
-                                 CONTAINER_OUTPUT_DIR)]
+    remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
-    remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
+if args.model_file is not None:
-    remaining_args.insert(0, '-o')
+    for model_file in args.model_file:
-if args.input_directory is not None:
+        mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}'  # noqa
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory),
+        cmd += ['-v', mapping]
-                                 CONTAINER_INPUT_DIR)]
+if args.log_dir is not None:
-    remaining_args.insert(0, CONTAINER_INPUT_DIR)
+    mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
-    remaining_args.insert(0, '-i')
+    cmd += ['-v', mapping]
    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
 cmd.append(CONTAINER_IMAGE)
 cmd += remaining_args
-subprocess.run(cmd)
+sys.exit(subprocess.run(cmd).returncode)
Author	SHA1	Message	Date
Patrick Jentsch	ca1803ab8a	Mark required arguments in scripts as required	2022-02-03 10:40:50 +01:00
Patrick Jentsch	4518ca1c83	Codestyle enhacements	2022-01-27 13:40:23 +01:00
Patrick Jentsch	aeab9b7802	Fix enumeration in readme	2022-01-18 13:46:52 +01:00
Patrick Jentsch	00c4b17018	Codestyle update	2022-01-18 13:45:17 +01:00
Patrick Jentsch	c057d324cf	Cleanup and change some output options	2022-01-17 15:07:46 +01:00
Patrick Jentsch	f51a8c4546	Change output files file format	2022-01-14 10:56:16 +01:00
Patrick Jentsch	c640d9743f	Add output_files.json (lists all output files) generation.	2022-01-05 11:25:00 +01:00
Patrick Jentsch	e3fd679b38	Mark all scripts as executeable	2022-01-04 13:21:38 +01:00
Patrick Jentsch	8a3816121c	fix image tag	2022-01-04 12:10:26 +01:00
Patrick Jentsch	e1b78b6ba4	Update to Tesseract 5.0.0, Set version 0.1.0	2022-01-04 11:42:55 +01:00
Patrick Jentsch	a0760487ae	Don't process files in subdirectories	2021-04-12 13:22:28 +02:00
Patrick Jentsch	a798457c43	Add mising --log-dir argument to wrapper script	2021-04-12 09:53:59 +02:00
Patrick Jentsch	e2da0fb839	Tweak the README and pipeline help.	2021-03-26 10:03:59 +01:00
Patrick Jentsch	e78f667438	Use more descriptive argument names then i and o (now: input and output)	2021-03-18 10:32:55 +01:00
Patrick Jentsch	41f70da8eb	Update the hocrtotei script	2021-03-17 16:58:13 +01:00
Patrick Jentsch	6db7f70446	Add back german language models	2021-03-17 14:26:24 +01:00
Patrick Jentsch	947658a7d8	Change intermediate image name in order to fix issues with building multiple branches/tags at the same time	2021-03-15 14:11:23 +01:00
Patrick Jentsch	acbf61be05	Cleanup and make use of globbing for input files for binarization and ocr	2021-03-15 12:45:05 +01:00
Patrick Jentsch	104598039e	Dockerfile codestyle	2021-02-24 15:28:04 +01:00
Patrick Jentsch	da29659a9b	Add back missing author mention	2021-02-24 15:17:42 +01:00