Compare commits

20 Commits

Author SHA1 Message Date
ca1803ab8a Mark required arguments in scripts as required 2022-02-03 10:40:50 +01:00
4518ca1c83 Codestyle enhacements 2022-01-27 13:40:23 +01:00
aeab9b7802 Fix enumeration in readme 2022-01-18 13:46:52 +01:00
00c4b17018 Codestyle update 2022-01-18 13:45:17 +01:00
c057d324cf Cleanup and change some output options 2022-01-17 15:07:46 +01:00
f51a8c4546 Change output files file format 2022-01-14 10:56:16 +01:00
c640d9743f Add output_files.json (lists all output files) generation. 2022-01-05 11:25:00 +01:00
e3fd679b38 Mark all scripts as executeable 2022-01-04 13:21:38 +01:00
8a3816121c fix image tag 2022-01-04 12:10:26 +01:00
e1b78b6ba4 Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00
a0760487ae Don't process files in subdirectories 2021-04-12 13:22:28 +02:00
a798457c43 Add mising --log-dir argument to wrapper script 2021-04-12 09:53:59 +02:00
e2da0fb839 Tweak the README and pipeline help. 2021-03-26 10:03:59 +01:00
e78f667438 Use more descriptive argument names then i and o (now: input and output) 2021-03-18 10:32:55 +01:00
41f70da8eb Update the hocrtotei script 2021-03-17 16:58:13 +01:00
6db7f70446 Add back german language models 2021-03-17 14:26:24 +01:00
947658a7d8 Change intermediate image name in order to fix issues with building multiple branches/tags at the same time 2021-03-15 14:11:23 +01:00
acbf61be05 Cleanup and make use of globbing for input files for binarization and ocr 2021-03-15 12:45:05 +01:00
104598039e Dockerfile codestyle 2021-02-24 15:28:04 +01:00
da29659a9b Add back missing author mention 2021-02-24 15:17:42 +01:00
9 changed files with 831 additions and 566 deletions

View File

@ -1,8 +1,5 @@
image: docker:19.03.13
variables:
DOCKER_TLS_CERTDIR: "/certs"
services:
- docker:19.03.13-dind
@ -10,6 +7,10 @@ stages:
- build
- push
variables:
DOCKER_TLS_CERTDIR: "/certs"
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
.reg_setup:
before_script:
- apk add --no-cache curl
@ -28,8 +29,6 @@ build_image:
stage: build
tags:
- docker
variables:
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
push_master:
extends:
@ -47,7 +46,6 @@ push_master:
- docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
push_other:
extends:
@ -68,4 +66,3 @@ push_other:
- docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA

View File

@ -1,47 +1,59 @@
FROM debian:buster-slim
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
ENV LANG=C.UTF-8
RUN apt-get update
RUN apt-get update \
&& apt-get install --no-install-recommends --yes \
ghostscript \
procps \
python3.7 \
python3-pip \
rename \
wget \
zip \
&& python3 -m pip install lxml
# Install pipeline dependencies #
# Install the OCR pipeline and its dependencies #
## Install pyFlow ##
ENV PYFLOW_RELEASE=1.1.20
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
&& cd "pyflow-${PYFLOW_RELEASE}" \
ENV PYFLOW_VERSION=1.1.20
RUN wget --no-check-certificate --quiet \
"https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
&& cd "pyflow-${PYFLOW_VERSION}" \
&& apt-get install --no-install-recommends --yes \
python2.7 \
&& python2.7 setup.py build install \
&& cd .. \
&& rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
&& cd - > /dev/null \
&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
## Install ocropy ##
ENV OCROPY_RELEASE=1.3.3
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
&& cd "ocropy-${OCROPY_RELEASE}" \
ENV OCROPY_VERSION=1.3.3
RUN wget --no-check-certificate --quiet \
"https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
&& tar -xzf "v${OCROPY_VERSION}.tar.gz" \
&& cd "ocropy-${OCROPY_VERSION}" \
&& apt-get install --no-install-recommends --yes \
python2.7 \
python-pil \
python-tk \
$(cat PACKAGES) \
&& python2.7 setup.py install \
&& cd .. \
&& rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
&& cd - > /dev/null \
&& rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
## Install Tesseract OCR ##
ENV TESSERACT_RELEASE=4.1.1
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
&& cd "tesseract-${TESSERACT_RELEASE}" \
ENV TESSERACT_VERSION=5.0.0
RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \
&& cd "tesseract-${TESSERACT_VERSION}" \
&& apt-get install --no-install-recommends --yes \
autoconf \
automake \
@ -55,47 +67,19 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
pkg-config \
zlib1g-dev \
&& ./autogen.sh \
&& ./configure \
&& ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
&& make \
&& make install \
&& ldconfig \
&& cd - > /dev/null \
&& rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
ENV TESSDATA_BEST_RELEASE=4.1.0
ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" .
RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ara.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/chi_tra.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/dan.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/deu.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ell.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/eng.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/enm.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/fra.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frk.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frm.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ita.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/por.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/rus.traineddata" "/usr/local/share/tessdata/" \
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/spa.traineddata" "/usr/local/share/tessdata/" \
&& rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz"
## Further dependencies ##
RUN apt-get install --no-install-recommends --yes \
ghostscript \
python-pip \
python3.7 \
zip \
&& pip install natsort
&& rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
RUN rm -r /var/lib/apt/lists/*
## Install Pipeline ##
COPY hocrtotei ocr /usr/local/bin/
COPY hocr2tei hocr-combine ocr /usr/local/bin/
ENTRYPOINT ["ocr"]

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

106
README.md
View File

@ -1,83 +1,49 @@
# OCR - Optical Character Recognition
This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided.
This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
## Software used in this pipeline implementation
- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
- Software from Debian Buster's free repositories
- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0
## Installation
## Use this image
1. Install Docker and Python 3.
2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr`
4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
1. Create input and output directories for the pipeline.
``` bash
mkdir -p /<my_data_location>/input /<my_data_location>/output
```
## Use the Pipeline
2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
```
# Option one: Use the wrapper script
## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/1.0.0/wrapper/ocr, make it executable and add it to your ${PATH}
1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
2. Clear your `/<my_data_location>/output` directory.
3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
```bash
cd /<my_data_location>
ocr -i input -l <language_code> -o output <optional_pipeline_arguments>
# Option two: Classic Docker style
docker run \
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v /<my_data_location>/input:/input \
-v /<my_data_location>/output:/output \
gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
-i /input \
-l <language_code>
-o /output \
<optional_pipeline_arguments>
# <model_code> is the model filename without the ".traineddata" suffix
ocr \
--input-dir input \
--output-dir output \
--model-file models/<model>
-m <model_code> <optional_pipeline_arguments>
# More than one model
ocr \
--input-dir input \
--output-dir output \
--model-file models/<model1>
--model-file models/<model2>
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
# Instead of multiple --model-file statements, you can also use
ocr \
--input-dir input \
--output-dir output \
--model-file models/*
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
```
4. Check your results in the `/<my_data_location>/output` directory.
```
### Pipeline arguments
`-l languagecode`
* Tells tesseract which language will be used.
* options = ara (Arabic), chi_tra (Chinese - Traditional), dan (Danish), deu (German), ell (Greek, Modern (1453-)), eng (English), enm (Middle English), fra (French), frk (German Fraktur), frm (Middle French), ita (Italian), por (Portuguese), rus (Russian), spa (Spanish)
* required = True
`--keep-intermediates`
* If set, all intermediate files created during the OCR process will be
kept.
* default = False
* required = False
`--nCores corenumber`
* Sets the number of CPU cores being used during the OCR process.
* default = min(4, multiprocessing.cpu_count())
* required = False
`--skip-binarisation`
* Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
* default = False
``` bash
# Example with all arguments used
docker run \
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v "$HOME"/ocr/input:/input \
-v "$HOME"/ocr/output:/output \
gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
-i /input \
-l eng \
-o /output \
--keep_intermediates \
--nCores 8 \
--skip-binarisation
```

44
hocr-combine Executable file
View File

@ -0,0 +1,44 @@
#!/usr/bin/env python3.7
# coding=utf-8
''' Combine multiple hOCR files. '''
from argparse import ArgumentParser
from lxml import html

parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    nargs='+',
    required=True
)
parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
)
args = parser.parse_args()

# Expand the argument list: a value prefixed with "@" names a file that
# contains one input path per line; any other value is taken as a path
# itself.
# NOTE: the accumulator must be initialized BEFORE the loop — resetting it
# inside the loop would silently discard every input except the last one.
input_files = []
for input_file in args.input_file:
    if input_file.startswith('@'):
        with open(input_file[1:], 'r') as f:
            input_files += [x for x in f.read().split("\n") if x != '']
    else:
        input_files.append(input_file)

# Nothing to combine: signal failure to the caller.
if len(input_files) == 0:
    exit(1)

# Use the first document as the skeleton and graft every further
# ocr_page <div> into its <body>.
hocr = html.parse(input_files[0])
hocr_body = hocr.find('body')
for input_file in input_files[1:]:
    for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
        hocr_body.append(ocr_page)

# Serialize as HTML (hOCR is an HTML dialect), in binary mode so lxml
# controls the encoding.
with open(args.output_file, 'wb') as f:
    hocr.write(f, encoding='UTF-8', method='html')

68
hocr2tei Executable file
View File

@ -0,0 +1,68 @@
#!/usr/bin/env python3.7
# coding=utf-8
''' Convert hOCR to TEI XML. '''
from argparse import ArgumentParser
from lxml import html
from xml.sax.saxutils import escape
import re

parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    required=True
)
parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
)
args = parser.parse_args()

# Static TEI skeleton with an intentionally minimal (empty) header.
tei = ''
tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
tei += '  <teiHeader>\n'
tei += '    <fileDesc>\n'
tei += '      <titleStmt>\n'
tei += '        <title></title>\n'
tei += '      </titleStmt>\n'
tei += '      <publicationStmt>\n'
tei += '        <p></p>\n'
tei += '      </publicationStmt>\n'
tei += '      <sourceDesc>\n'
tei += '        <p></p>\n'
tei += '      </sourceDesc>\n'
tei += '    </fileDesc>\n'
tei += '  </teiHeader>\n'
tei += '  <text>\n'
tei += '    <body>\n'

hocr = html.parse(args.input_file)
# One <pb/> per hOCR page. Tesseract encodes the source image path and the
# physical page number inside the ocr_page div's "title" attribute, e.g.
# title='image "page-1.png"; bbox ...; ppageno 1'.
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
    ocr_page_title_attrib = ocr_page.attrib.get('title')
    facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
    page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
    # BUG FIX: the image path is emitted inside a double-quoted XML
    # attribute and may contain &, < or " — escape it like the word text
    # (page_number is \d+ only and needs no escaping).
    facsimile = escape(facsimile, {'"': '&quot;'})
    tei += f'      <pb facs="{facsimile}" n="{page_number}"/>\n'
    for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
        tei += '      <p>\n'
        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
            # <lb/> has no newline: the line's words follow on the same
            # output line, separated by single spaces.
            tei += '        <lb/>'
            is_first_word_in_line = True
            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
                if ocrx_word.text is not None:
                    if not is_first_word_in_line:
                        tei += ' '
                    tei += escape(ocrx_word.text)
                    is_first_word_in_line = False
            tei += '\n'
        tei += '      </p>\n'
tei += '    </body>\n'
tei += '  </text>\n'
tei += '</TEI>\n'

with open(args.output_file, 'w') as f:
    f.write(tei)

View File

@ -1,49 +0,0 @@
#!/usr/bin/env python3.7
# coding=utf-8
"""Merges hOCR files into a TEI file."""
from xml.sax.saxutils import escape
from argparse import ArgumentParser
import xml.etree.ElementTree as ET

parser = ArgumentParser(description='Merges hOCR files into a TEI file.')
parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+')
parser.add_argument('o', metavar='TEI-destfile',)
args = parser.parse_args()

# BUG FIX: open via a context manager so the output file is flushed and
# closed even if parsing one of the inputs raises; the original left the
# handle dangling on error.
with open(args.o, 'w') as output_file:
    # Minimal TEI skeleton with empty header sections.
    output_file.write(
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        + '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
        + '  <teiHeader>\n'
        + '    <fileDesc>\n'
        + '      <titleStmt/>\n'
        + '      <publicationStmt/>\n'
        + '      <sourceDesc/>\n'
        + '    </fileDesc>\n'
        + '    <encodingDesc/>\n'
        + '    <profileDesc/>\n'
        + '  </teiHeader>\n'
        + '  <text>\n'
        + '    <body>\n'
    )
    # Each input file becomes one page; <pb/> numbering follows the argument
    # order, starting at 1.
    for index, input_file in enumerate(args.i):
        tree = ET.parse(input_file)
        output_file.write('      <pb n="%i"/>\n' % (index + 1))
        for para in tree.findall('.//*[@class="ocr_par"]'):
            output_file.write('      <p>\n')
            for line in para.findall('.//*[@class="ocr_line"]'):
                first_word_in_line = True
                for word in line.findall('.//*[@class="ocrx_word"]'):
                    if word.text is not None:
                        # NOTE(review): both branches render as a single
                        # space here — the first-word branch presumably held
                        # wider indentation before whitespace was collapsed
                        # by extraction; confirm against repository history.
                        output_file.write((' ' if first_word_in_line else ' ') + escape(word.text.strip()))
                        first_word_in_line = False
                # Close the line only if it actually contained words.
                if not first_word_in_line:
                    output_file.write('<lb/>\n')
            output_file.write('      </p>\n')
    output_file.write(
        '    </body>\n'
        + '  </text>\n'
        + '</TEI>'
    )

961
ocr

File diff suppressed because it is too large Load Diff

View File

@ -1,43 +1,44 @@
#!/usr/bin/env python3
# coding=utf-8
"""A wrapper to execute the OCR pipeline in a Docker container"""
from argparse import ArgumentParser
import os
import subprocess
import sys

CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0'
# Fixed mount points inside the container; host paths are mapped onto these.
CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output'
CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
CONTAINER_LOG_DIR = '/logs'
# Run the container as the invoking user so output files are not root-owned.
UID = str(os.getuid())
GID = str(os.getgid())

# add_help=False: unknown flags (including --help) fall through to
# remaining_args and are forwarded to the pipeline inside the container.
parser = ArgumentParser(add_help=False)
parser.add_argument('-i', '--input-dir')
parser.add_argument('-o', '--output-dir')
parser.add_argument('-t', '--model-file', action='extend', nargs='+')
parser.add_argument('--log-dir')
args, remaining_args = parser.parse_known_args()

cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
if args.input_dir is not None:
    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['-i', CONTAINER_INPUT_DIR]
if args.output_dir is not None:
    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
if args.model_file is not None:
    # Mount each model file individually into the tessdata directory.
    for model_file in args.model_file:
        mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}'  # noqa
        cmd += ['-v', mapping]
if args.log_dir is not None:
    # BUG FIX: this literal was missing its f prefix, so the raw text
    # "{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}" was handed to
    # `docker -v` instead of the expanded host:container paths.
    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
cmd.append(CONTAINER_IMAGE)
cmd += remaining_args

# Propagate the container's exit status to our caller.
sys.exit(subprocess.run(cmd).returncode)