34 Commits
1.0 ... 1.0.0b

SHA1 Message Date
a0760487ae Don't process files in subdirectories 2021-04-12 13:22:28 +02:00
a798457c43 Add missing --log-dir argument to wrapper script 2021-04-12 09:53:59 +02:00
e2da0fb839 Tweak the README and pipeline help. 2021-03-26 10:03:59 +01:00
e78f667438 Use more descriptive argument names than i and o (now: input and output) 2021-03-18 10:32:55 +01:00
41f70da8eb Update the hocrtotei script 2021-03-17 16:58:13 +01:00
6db7f70446 Add back german language models 2021-03-17 14:26:24 +01:00
947658a7d8 Change intermediate image name in order to fix issues with building multiple branches/tags at the same time 2021-03-15 14:11:23 +01:00
acbf61be05 Cleanup and make use of globbing for input files for binarization and ocr 2021-03-15 12:45:05 +01:00
104598039e Dockerfile codestyle 2021-02-24 15:28:04 +01:00
da29659a9b Add back missing author mention 2021-02-24 15:17:42 +01:00
613bceb4ff Add new models 2021-02-23 11:11:50 +01:00
ca7df6d0ed First work on version 1.0.0 2021-02-19 13:04:03 +01:00
07635dcdfa Use "buster" instead of "10" in FROM 2020-10-08 23:17:48 +02:00
c0069d5453 Use new Dockerfile structure 2020-10-08 23:09:10 +02:00
e941f64ee4 test new ci config 2020-10-07 16:44:38 +02:00
cb68d6de2d One thread per page ocr patch 2020-10-07 13:46:22 +02:00
4b84488fe6 fix gitlab ci 2020-09-23 16:58:07 +02:00
7d52ad9f68 Update 2020-09-23 15:52:24 +02:00
ac4b5c2fd8 Add possibility to use an intermediate dir 2020-09-22 17:44:32 +02:00
6d90d43699 fix cleanup attempt 2020-09-21 15:36:03 +02:00
4bd0d3bb01 Use commit_sha for intermediate image 2020-09-21 15:02:04 +02:00
15061bfaaf add tag to clean stage 2020-09-21 15:00:09 +02:00
7cc8ebd666 compile tesseract in container 2020-09-21 14:46:03 +02:00
82285a8e6c better multithreading 2020-07-02 11:49:35 +02:00
7322a5bc7c More GhostScript, less dependencies! 2020-07-02 11:47:43 +02:00
2b63ba9e59 Remove unused dependencies and use ghostscript for image split 2020-07-01 11:03:34 +02:00
aee9628e5e fix pipeline 2020-06-23 15:19:27 +02:00
ec5b4eb521 Add PDF compression 2020-06-16 09:31:34 +02:00
b77ca5914f Set relative file paths in hocr 2020-06-10 11:48:58 +02:00
018939ae55 Add PoCo zips part 1 2020-06-09 16:58:22 +02:00
64fe706126 Keep uncompressed output files after zip jobs. 2020-05-13 09:11:01 +02:00
a75b32ca1d Bump versions 2020-04-06 09:21:52 +02:00
364e3d626d Fix zip creation 2020-04-04 15:37:21 +02:00
36a86887b0 Update OCR Pipeline 2020-04-03 17:35:30 +02:00
6 changed files with 506 additions and 712 deletions

.gitlab-ci.yml

@@ -1,44 +1,68 @@
-image: docker:stable
+image: docker:19.03.13

 services:
-  - docker:stable-dind
+  - docker:19.03.13-dind

-variables:
-  DOCKER_DRIVER: overlay2
-
 stages:
   - build
   - push

-before_script:
-  - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+variables:
+  DOCKER_TLS_CERTDIR: "/certs"
+  INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA

-Build:
+.reg_setup:
+  before_script:
+    - apk add --no-cache curl
+    - curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg
+    - echo "$REG_SHA256  /usr/local/bin/reg" | sha256sum -c -
+    - chmod a+x /usr/local/bin/reg
+  variables:
+    REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228
+    REG_VERSION: 0.16.1
+
+build_image:
   script:
-    - docker build --pull -t $CI_REGISTRY_IMAGE:tmp .
-    - docker push $CI_REGISTRY_IMAGE:tmp
+    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+    - docker build -t $INTERMEDIATE_IMAGE_TAG .
+    - docker push $INTERMEDIATE_IMAGE_TAG
   stage: build
   tags:
     - docker

-Push latest:
+push_master:
+  extends:
+    - .reg_setup
   only:
     - master
   script:
-    - docker pull $CI_REGISTRY_IMAGE:tmp
-    - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest
-    - docker push $CI_REGISTRY_IMAGE:latest
+    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+    - docker pull $INTERMEDIATE_IMAGE_TAG
+    - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
+    - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
+    - docker push $IMAGE_TAG
   stage: push
   tags:
     - docker
+  variables:
+    IMAGE_TAG: $CI_REGISTRY_IMAGE:latest

-Push tag:
+push_other:
+  extends:
+    - .reg_setup
+  except:
+    - master
   only:
+    - branches
     - tags
   script:
-    - docker pull $CI_REGISTRY_IMAGE:tmp
-    - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
-    - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
+    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+    - docker pull $INTERMEDIATE_IMAGE_TAG
+    - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
+    - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
+    - docker push $IMAGE_TAG
   stage: push
   tags:
     - docker
+  variables:
+    IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
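The `.reg_setup` template above pins the downloaded `reg` binary to a fixed SHA-256 digest before marking it executable, so a tampered release fails the job at setup time. A minimal Python sketch of the same verify-before-use pattern, reusing the version and digest from the CI file (the `fetch_verified` helper is ours, not part of the repository):

```python
import hashlib
import urllib.request

REG_VERSION = '0.16.1'
REG_SHA256 = 'ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228'
REG_URL = ('https://github.com/genuinetools/reg/releases/download/'
           'v{}/reg-linux-amd64'.format(REG_VERSION))


def fetch_verified(url, expected_sha256):
    """Download url and return its bytes, refusing data with a wrong digest."""
    data = urllib.request.urlopen(url).read()
    digest = hashlib.sha256(data).hexdigest()
    if digest != expected_sha256:
        raise ValueError('checksum mismatch: {}'.format(digest))
    return data


# reg_binary = fetch_verified(REG_URL, REG_SHA256)
```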

Dockerfile

@ -1,73 +1,96 @@
FROM debian:9-slim FROM debian:buster-slim
# Define image metadata LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
ENV LANG=C.UTF-8 ENV LANG=C.UTF-8
# Install prerequisites
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y --no-install-recommends \ && apt-get install --no-install-recommends --yes \
apt-transport-https \ wget
ca-certificates \
gnupg2 \
imagemagick \
poppler-utils \
python2.7 \
python3.5 \
wget \
zip \
&& rm -rf /var/lib/apt/lists/*
ENV OCROPY_VERSION 1.3.3
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" . # Install the OCR pipeline and it's dependencies #
RUN tar -xzf "v${OCROPY_VERSION}.tar.gz" \ ## Install pyFlow ##
ENV PYFLOW_VERSION=1.1.20
RUN wget --no-check-certificate --quiet \
"https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
&& cd "pyflow-${PYFLOW_VERSION}" \
&& apt-get install --no-install-recommends --yes \
python2.7 \
&& python2.7 setup.py build install \
&& cd - > /dev/null \
&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
## Install ocropy ##
ENV OCROPY_VERSION=1.3.3
RUN wget --no-check-certificate --quiet \
"https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
&& tar -xzf "v${OCROPY_VERSION}.tar.gz" \
&& cd "ocropy-${OCROPY_VERSION}" \ && cd "ocropy-${OCROPY_VERSION}" \
&& apt-get update \ && apt-get install --no-install-recommends --yes \
&& apt-get install -y --no-install-recommends \ python2.7 \
python-pil \ python-pil \
python-tk \ python-tk \
$(cat PACKAGES) \ $(cat PACKAGES) \
&& rm -rf /var/lib/apt/lists/* \
&& python2.7 setup.py install \ && python2.7 setup.py install \
&& cd .. \ && cd - > /dev/null \
&& rm -rf \ && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
"ocropy-${OCROPY_VERSION}" \
"v${OCROPY_VERSION}.tar.gz"
ENV PYFLOW_VERSION=1.1.20
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" .
RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
&& cd "pyflow-${PYFLOW_VERSION}" \
&& python2.7 setup.py build install \
&& cd .. \
&& rm -rf \
"pyflow-${PYFLOW_VERSION}" \
"pyflow-${PYFLOW_VERSION}.tar.gz"
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list \
&& wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
tesseract-ocr \
tesseract-ocr-deu \
tesseract-ocr-eng \
tesseract-ocr-enm \
tesseract-ocr-fra \
tesseract-ocr-frk \
tesseract-ocr-frm \
tesseract-ocr-ita \
tesseract-ocr-por \
tesseract-ocr-spa \
&& rm -rf /var/lib/apt/lists/*
# Install OCR pipeline ## Install Tesseract OCR ##
COPY hocrtotei /usr/local/bin ENV TESSERACT_VERSION=4.1.1
COPY ocr /usr/local/bin RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \
&& cd "tesseract-${TESSERACT_VERSION}" \
&& apt-get install --no-install-recommends --yes \
autoconf \
automake \
g++ \
libjpeg62-turbo-dev \
libleptonica-dev \
libtiff5-dev \
libtool \
libpng-dev \
make \
pkg-config \
zlib1g-dev \
&& ./autogen.sh \
&& ./configure \
&& make \
&& make install \
&& ldconfig \
&& cd - > /dev/null \
&& rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
ENV TESSDATA_BEST_VERSION=4.1.0
RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
&& tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
&& for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \
&& rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
## Further dependencies ##
RUN apt-get install --no-install-recommends --yes \
procps \
ghostscript \
python3.7 \
rename \
zip
## Install Pipeline ##
COPY hocrtotei ocr /usr/local/bin/
RUN rm -r /var/lib/apt/lists/*
ENTRYPOINT ["ocr"] ENTRYPOINT ["ocr"]
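Note that the `TESSERACT_MODELS` loop above decides exactly which `*.traineddata` files land in `/usr/local/share/tessdata`, and the pipeline later derives the legal values of its `-l` option from that directory (see the `ocr` script below). A small sketch of that derivation, assuming the tessdata path used in the Dockerfile:

```python
import os

TESSDATA_DIR = '/usr/local/share/tessdata'  # path used in the Dockerfile


def installed_languages(tessdata_dir=TESSDATA_DIR):
    """Return the language codes of all installed Tesseract models."""
    return sorted(f[:-len('.traineddata')]
                  for f in os.listdir(tessdata_dir)
                  if f.endswith('.traineddata'))


# Inside the image this yields the codes from TESSERACT_MODELS,
# e.g. ['ara', 'chi_tra', 'dan', 'deu', ...].
```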

README.md

@@ -1,96 +1,44 @@
-# OCR
+# OCR - Optical Character Recognition

-## Build image
+This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided.

-1. Clone this repository and navigate into it:
-```
-git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr
-```
-
-2. Build image:
-```
-docker build -t sfb1288inf/ocr:latest .
-```
-
-Alternatively build from the GitLab repository without cloning:
-1. Build image:
-```
-docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
-```
-
-## Download prebuilt image
-
-The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
-
-1. Download image:
-```
-docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest
-```
-
-## Run
-
-1. Create input and output directories for the OCR software:
-```
-mkdir -p /<mydatalocation>/files_for_ocr /<mydatalocation>/files_from_ocr
-```
-
-2. Place your files inside the `/<mydatalocation>/files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. Files should all contain text of the same language.
-
-3. Start the OCR process.
-```
-docker run \
-    --rm \
-    -it \
-    -u $(id -u $USER):$(id -g $USER) \
-    -v /<mydatalocation>/files_for_ocr:/input \
-    -v /<mydatalocation>/files_from_ocr:/output \
-    sfb1288inf/ocr:latest \
-        -i /input \
-        -l <languagecode> \
-        -o /output
-```
-The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part.
-
-If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`.
-
-4. Check your results in the `/<mydatalocation>/files_from_ocr` directory.
-
-### OCR arguments
-
-`-l languagecode`
-* Tells tesseract which language will be used.
-* options = deu (German), eng (English), enm (Middle englisch), fra (French), frk (German Fraktur), frm (Middle french), ita (Italian), por (Portuguese), spa (Spanish)
-* required = True
-
-`--keep-intermediates`
-* If set, all intermediate files created during the OCR process will be kept.
-* default = False
-* required = False
-
-`--nCores corenumber`
-* Sets the number of CPU cores being used during the OCR process.
-* default = min(4, multiprocessing.cpu_count())
-* required = False
-
-`--skip-binarisation`
-* Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
-* default = False
-
-Example with all arguments used:
-```
-docker run \
-    --rm \
-    -it \
-    -u $(id -u $USER):$(id -g $USER) \
-    -v "$HOME"/ocr/files_for_ocr:/input \
-    -v "$HOME"/ocr/files_from_ocr:/output \
-    sfb1288inf/ocr:latest \
-        -i /input \
-        -l eng \
-        -o /output \
-        --keep_intermediates \
-        --nCores 8 \
-        --skip-binarisation
-```
+## Software used in this pipeline implementation
+
+- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
+  - Software from Debian Buster's free repositories
+- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
+- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
+- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
+- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
+
+## Use this image
+
+1. Create input and output directories for the pipeline.
+``` bash
+mkdir -p /<my_data_location>/input /<my_data_location>/output
+```
+
+2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
+
+3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
+```
+# Option one: Use the wrapper script
+## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/development/wrapper/ocr, make it executeable and add it to your ${PATH}
+cd /<my_data_location>
+ocr -i input -l <language_code> -o output <optional_pipeline_arguments>
+
+# Option two: Classic Docker style
+docker run \
+    --rm \
+    -it \
+    -u $(id -u $USER):$(id -g $USER) \
+    -v /<my_data_location>/input:/input \
+    -v /<my_data_location>/output:/output \
+    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:development \
+        -i /ocr_pipeline/input \
+        -l <language_code> \
+        -o /ocr_pipeline/output \
+        <optional_pipeline_arguments>
+```
+
+4. Check your results in the `/<my_data_location>/output` directory.

hocrtotei

@@ -1,58 +1,57 @@
-#!/usr/bin/env python3.5
+#!/usr/bin/env python3.7
 # coding=utf-8

+""""Convert hOCR to TEI XML."""
+
 from xml.sax.saxutils import escape
-import argparse
+from argparse import ArgumentParser
+import re
 import xml.etree.ElementTree as ET

-parser = argparse.ArgumentParser(
-    description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
-)
-parser.add_argument(
-    'i',
-    metavar='hOCR-sourcefile',
-    nargs='+'
-)
-parser.add_argument(
-    'o',
-    metavar='TEI-destfile',
-)
+parser = ArgumentParser(description='Convert hOCR to TEI XML.')
+parser.add_argument('input', metavar='Path to hOCR input file')
+parser.add_argument('output', metavar='Path to TEI output file')
 args = parser.parse_args()

-output_file = open(args.o, 'w')
-
-output_file.write(
-    '<?xml version="1.0" encoding="UTF-8"?>\n'
-    + '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
-    + '  <teiHeader>\n'
-    + '    <fileDesc>\n'
-    + '      <titleStmt/>\n'
-    + '      <publicationStmt/>\n'
-    + '      <sourceDesc/>\n'
-    + '    </fileDesc>\n'
-    + '    <encodingDesc/>\n'
-    + '    <profileDesc/>\n'
-    + '  </teiHeader>\n'
-    + '  <text>\n'
-    + '    <body>\n'
-)
-for index, input_file in enumerate(args.i):
-    tree = ET.parse(input_file)
-    output_file.write('      <pb n="%i"/>\n' % (index + 1))
-    for para in tree.findall('.//*[@class="ocr_par"]'):
-        output_file.write('      <p>\n')
-        for line in para.findall('.//*[@class="ocr_line"]'):
-            first_word_in_line = True
-            for word in line.findall('.//*[@class="ocrx_word"]'):
-                if word.text is not None:
-                    output_file.write(('        ' if first_word_in_line else ' ') + escape(word.text.strip()))
-                    first_word_in_line = False
-            if not first_word_in_line:
-                output_file.write('<lb/>\n')
-        output_file.write('      </p>\n')
-output_file.write(
-    '    </body>\n'
-    + '  </text>\n'
-    + '</TEI>')
-output_file.close()
+tei = ''
+tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
+tei += '  <teiHeader>\n'
+tei += '    <fileDesc>\n'
+tei += '      <titleStmt>\n'
+tei += '        <title></title>\n'
+tei += '      </titleStmt>\n'
+tei += '      <publicationStmt>\n'
+tei += '        <p></p>\n'
+tei += '      </publicationStmt>\n'
+tei += '      <sourceDesc>\n'
+tei += '        <p></p>\n'
+tei += '      </sourceDesc>\n'
+tei += '    </fileDesc>\n'
+tei += '  </teiHeader>\n'
+tei += '  <text>\n'
+tei += '    <body>\n'
+# Conversion start
+hocr = ET.parse(args.input)
+for page in hocr.findall('.//*[@class="ocr_page"]'):
+    page_properties = page.attrib.get('title')
+    facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1)
+    page_number = re.search(r'ppageno (\d+)', page_properties).group(1)
+    tei += '      <pb facs="{}" n="{}"/>\n'.format(facsimile, page_number)
+    for para in page.findall('.//*[@class="ocr_par"]'):
+        tei += '      <p>\n'
+        for line in para.findall('.//*[@class="ocr_line"]'):
+            tei += '        <lb/>'
+            indent = ''
+            for word in line.findall('.//*[@class="ocrx_word"]'):
+                if word.text is not None:
+                    tei += indent + escape(word.text.strip())
+                    indent = ' '
+            tei += '\n'
+        tei += '      </p>\n'
+# Conversion end
+tei += '    </body>\n'
+tei += '  </text>\n'
+tei += '</TEI>\n'
+
+with open(args.output, 'w') as tei_file:
+    tei_file.write(tei)
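As context for the rewritten converter: Tesseract's hOCR output encodes the page image and the page number in the `title` attribute of each `ocr_page` element, which is where the new `facs` and `n` attributes of the TEI `<pb/>` come from. A self-contained sketch of those lookups on a hand-written hOCR fragment (the file name is made up):

```python
import re
import xml.etree.ElementTree as ET

# Minimal hand-written hOCR exercising the element classes the script queries.
HOCR = '''<html>
<body>
  <div class="ocr_page" title='image "pages/page-1.bin.png"; bbox 0 0 100 100; ppageno 1'>
    <p class="ocr_par">
      <span class="ocr_line">
        <span class="ocrx_word">Hello</span>
        <span class="ocrx_word">world</span>
      </span>
    </p>
  </div>
</body>
</html>'''

root = ET.fromstring(HOCR)
for page in root.findall('.//*[@class="ocr_page"]'):
    title = page.attrib.get('title')
    facsimile = re.search(r'image \"(.*?)\"', title).group(1)
    page_number = re.search(r'ppageno (\d+)', title).group(1)
    # Prints: <pb facs="pages/page-1.bin.png" n="1"/>
    print('<pb facs="{}" n="{}"/>'.format(facsimile, page_number))
```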

ocr

@@ -1,556 +1,357 @@
 #!/usr/bin/env python2.7
 # coding=utf-8

-"""
-ocr
-
-Usage: For usage instructions run with option --help
-Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
-"""
+"""OCR pipeline for PDF file processing."""
+
+__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
+             'Stephan Porada <porada@posteo.de>'
+__version__ = '1.0.0'

-import argparse
+from argparse import ArgumentParser
+from pyflow import WorkflowRunner
 import multiprocessing
 import os
-import re
 import sys
-from pyflow import WorkflowRunner

-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description='''Performs OCR of (historical) documents utilizing OCRopus
-                    for preprocessing and Tesseract OCR for OCR. The results
-                    are served as hOCR, PDF, raw text and TEI compliant XML
-                    files.\n
-                    Software requirements: imagemagick, ocropus, pdftoppm,
-                    pdfunite, poppler-utils, pyflow, python2.7, python3.5,
-                    tesseract'''
-    )
-    parser.add_argument(
-        '-i',
-        dest='input_dir',
-        required=True
-    )
-    parser.add_argument(
-        '-l',
-        choices=[
-            'deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa'
-        ],
-        dest='lang',
-        required=True
-    )
-    parser.add_argument(
-        '-o',
-        dest='output_dir',
-        required=True
-    )
-    parser.add_argument(
-        '--skip-binarisation',
-        action='store_true',
-        default=False,
-        dest='skip_binarisation',
-        help='skip ocropy binarisation',
-        required=False
-    )
-    parser.add_argument(
-        '--keep-intermediates',
-        action='store_true',
-        default=False,
-        dest='keep_intermediates',
-        help='keep intermediate files',
-        required=False
-    )
-    parser.add_argument(
-        '--nCores',
-        default=min(4, multiprocessing.cpu_count()),
-        dest='n_cores',
-        help='total number of cores available',
-        required=False,
-        type=int
-    )
-    parser.add_argument(
-        '--zip',
-        default='ocr-result-files',
-        dest='zip',
-        type=str,
-        help='package result files in zip bundles and asign an filename prefix',
-        required=False
-    )
-    return parser.parse_args()
+class OCRPipelineJob:
+    """An OCR pipeline job class
+
+    Each input file of the pipeline is represented as an OCR pipeline job,
+    which holds all necessary information for the pipeline to process it.
+
+    Arguments:
+    file -- Path to the file
+    output_dir -- Path to a directory, where job results a stored
+    """
+
+    def __init__(self, file, output_dir):
+        self.file = file
+        self.name = os.path.basename(file).rsplit('.', 1)[0]
+        self.output_dir = output_dir
+        self.page_dir = os.path.join(output_dir, 'pages')

-class OCRWorkflow(WorkflowRunner):
-    def __init__(self, args):
-        self.jobs = analyze_jobs(args.input_dir, args.output_dir)
-        self.skip_binarisation = args.skip_binarisation
-        self.keep_intermediates = args.keep_intermediates
-        self.lang = args.lang
-        self.n_cores = args.n_cores
-        self.output_dir = args.output_dir
-        self.zip = args.zip
+class OCRPipeline(WorkflowRunner):
+    def __init__(self, input_dir, lang, output_dir, binarize, zip):
+        self.input_dir = input_dir
+        self.lang = lang
+        self.output_dir = output_dir
+        self.binarize = binarize
+        self.zip = zip
+        self.jobs = collect_jobs(self.input_dir, self.output_dir)

     def workflow(self):
-        if len(self.jobs) == 0:
+        if not self.jobs:
             return

-        '''
-        ' ##################################################
-        ' # Create output directories #
-        ' ##################################################
-        '''
-        create_output_directories_jobs = []
-        for index, job in enumerate(self.jobs):
-            cmd = 'mkdir -p "%s"' % (
-                os.path.join(job['output_dir'], 'tmp')
-            )
-            if self.keep_intermediates:
-                cmd += ' "%s" "%s" "%s" "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp', 'hocr'),
-                    os.path.join(job['output_dir'], 'tmp', 'pdf'),
-                    os.path.join(job['output_dir'], 'tmp', 'tiff'),
-                    os.path.join(job['output_dir'], 'tmp', 'txt')
-                )
-            if not self.skip_binarisation:
-                cmd += ' "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp', 'bin.png')
-                )
-            create_output_directories_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    label='create_output_directories_job_-_%i' % (index)
-                )
-            )
-
-        '''
-        ' ##################################################
-        ' # Split #
-        ' ##################################################
-        '''
-        split_jobs = []
-        split_job_n_cores = min(
-            self.n_cores,
-            max(1, int(self.n_cores / len(self.jobs)))
-        )
-        for index, job in enumerate(self.jobs):
-            if job['filename'].endswith(('.tif', '.tiff')):
-                '''
-                ' This command also works for PDF input but ocropus-nlbin
-                ' is not able to handle the TIFF output of it.
-                '''
-                cmd = 'convert -density 300 "%s" -compress LZW -scene 1 "%s/page-%%d.tif"' % (
-                    job['path'],
-                    os.path.join(job['output_dir'], 'tmp')
-                )
-            else:
-                cmd = 'pdftoppm -r 300 -tiff -tiffcompression lzw "%s" "%s"' % (
-                    job['path'],
-                    os.path.join(job['output_dir'], 'tmp', 'page')
-                )
-            split_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies='create_output_directories_job_-_%i' % (index),
-                    label='split_job_-_%i' % (index),
-                    nCores=split_job_n_cores
-                )
-            )
-
-        if not self.skip_binarisation:
-            '''
-            ' The binarisation_jobs are created based of the output files of
-            ' the split_jobs. So wait until they are finished.
-            '''
-            self.waitForTasks()
-
-            '''
-            ' ##################################################
-            ' # Binarise #
-            ' ##################################################
-            '''
-            binarisation_jobs = []
-            '''
-            ' We run ocropus-nlbin with either four or, if there are less then
-            ' four cores available for this workflow, the available core
-            ' number.
-            '''
-            binarisation_job_n_cores = min(4, self.n_cores)
-            for index, job in enumerate(self.jobs):
-                files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-                files = filter(lambda x: x.endswith('.tif'), files)
-                files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-                files = map(
-                    lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                    files
-                )
-                cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    binarisation_job_n_cores,
-                    ' '.join(files)
-                )
-                binarisation_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies='split_job_-_%i' % (index),
-                        label='binarisation_job_-_%i' % (index),
-                        nCores=binarisation_job_n_cores
-                    )
-                )
-
-            '''
-            ' The post_binarisation_jobs are created based of the output files
-            ' of the binarisation_jobs. So wait until they are finished.
-            '''
-            self.waitForTasks()
-
-            '''
-            ' ##################################################
-            ' # Normalise file names from binarisation #
-            ' ##################################################
-            '''
-            post_binarisation_jobs = []
-            for index, job in enumerate(self.jobs):
-                number = 0
-                files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-                files = filter(lambda x: x.endswith('.bin.png'), files)
-                files.sort()
-                for file in files:
-                    cmd = 'mv "%s" "%s"' % (
-                        os.path.join(job['output_dir'], 'tmp', file),
-                        os.path.join(
-                            job['output_dir'],
-                            'tmp',
-                            'page-%i.bin.png' % (int(file.split('.', 1)[0]))
-                        )
-                    )
-                    post_binarisation_jobs.append(
-                        self.addTask(
-                            command=cmd,
-                            dependencies='binarisation_job_-_%i' % (index),
-                            label='post_binarisation_job_-_%i-%i' % (
-                                index,
-                                number
-                            )
-                        )
-                    )
-                    number += 1
-
-        '''
-        ' The ocr_jobs are created based of the output files of either the
-        ' split_jobs or post_binarisation_jobs. So wait until they are
-        ' finished.
-        '''
-        self.waitForTasks()
-
-        '''
-        ' ##################################################
-        ' # Optical Character Recognition #
-        ' ##################################################
-        '''
-        ocr_jobs = []
-        '''
-        ' Tesseract runs fastest with four cores. So we run it with either four
-        ' or, if there are less then four cores available for this workflow,
-        ' the available core number.
-        '''
-        ocr_job_n_cores = min(4, self.n_cores)
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            if self.skip_binarisation:
-                files = filter(lambda x: x.endswith('.tif'), files)
-            else:
-                files = filter(lambda x: x.endswith('.bin.png'), files)
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: os.path.join(job['output_dir'], 'tmp', x),
-                files
-            )
-            number = 0
-            for file in files:
-                cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
-                    file,
-                    os.path.join(
-                        job['output_dir'],
-                        'tmp',
-                        file.rsplit('.', 1 if self.skip_binarisation else 2)[0]
-                    ),
-                    self.lang
-                )
-                if self.skip_binarisation:
-                    ocr_job_dependencies = 'split_job_-_%i' % (index)
-                else:
-                    ocr_job_dependencies = filter(
-                        lambda x: x == 'post_binarisation_job_-_%i-%i' % (
-                            index,
-                            number
-                        ),
-                        post_binarisation_jobs
-                    )
-                ocr_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies=ocr_job_dependencies,
-                        label='ocr_job_-_%i-%i' % (index, number),
-                        nCores=ocr_job_n_cores
-                    )
-                )
-                number += 1
-
-        '''
-        ' The following jobs are created based of the output files of the
-        ' ocr_jobs. So wait until they are finished.
-        '''
-        self.waitForTasks()
-
-        '''
-        ' ##################################################
-        ' # Create TEI P5 files #
-        ' ##################################################
-        '''
-        hocr_to_tei_jobs = []
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            files = filter(lambda x: x.endswith('.hocr'), files)
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                files
-            )
-            cmd = 'hocrtotei %s "%s"' % (
-                ' '.join(files),
-                os.path.join(
-                    job['output_dir'],
-                    os.path.join(job['output_dir'], job['name'] + '.xml')
-                )
-            )
-            hocr_to_tei_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=filter(
-                        lambda x: x.startswith('ocr_job_-_%i' % (index)),
-                        ocr_jobs
-                    ),
-                    label='hocr_to_tei_job_-_%i' % (index)
-                )
-            )
-
-        '''
-        ' ##################################################
-        ' # Merge PDF files #
-        ' ##################################################
-        '''
-        pdf_merge_jobs = []
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            files = filter(lambda x: x.endswith('.pdf'), files)
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                files
-            )
-            cmd = 'pdfunite %s "%s"' % (
-                ' '.join(files),
-                os.path.join(
-                    job['output_dir'],
-                    os.path.join(job['output_dir'], job['name'] + '.pdf')
-                )
-            )
-            pdf_merge_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=filter(
-                        lambda x: x.startswith('ocr_job_-_%i' % (index)),
-                        ocr_jobs
-                    ),
-                    label='pdf_merge_job_-_%i' % (index)
-                )
-            )
-
-        '''
-        ' ##################################################
-        ' # Merge text files #
-        ' ##################################################
-        '''
-        txt_merge_jobs = []
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            files = filter(lambda x: x.endswith('.txt'), files)
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                files
-            )
-            cmd = 'cat %s > "%s"' % (
-                ' '.join(files),
-                os.path.join(
-                    job['output_dir'],
-                    os.path.join(job['output_dir'], job['name'] + '.txt')
-                )
-            )
-            txt_merge_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=filter(
-                        lambda x: x.startswith('ocr_job_-_%i' % (index)),
-                        ocr_jobs
-                    ),
-                    label='txt_merge_job_-_%i' % (index)
-                )
-            )
-
-        if self.zip:
-            all_zip_jobs = []
-            all_zip_job_dependencies = (hocr_to_tei_jobs
-                                        + pdf_merge_jobs
-                                        + txt_merge_jobs)
-            cmd = 'cd "%s" && zip "%s"-all-ocr-files.zip */*.{pdf,txt,xml} -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            all_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=all_zip_job_dependencies,
-                    label='all_zip_job'
-                )
-            )
-            pdf_zip_jobs = []
-            pdf_zip_job_dependencies = all_zip_jobs
-            cmd = 'cd "%s" && zip -m "%s"-ocr-pdf.zip */*.pdf -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            pdf_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=pdf_zip_job_dependencies,
-                    label='pdf_zip_job'
-                )
-            )
-            txt_zip_jobs = []
-            txt_zip_job_dependencies = all_zip_jobs
-            cmd = 'cd "%s" && zip -m "%s"-ocr-txt.zip */*.txt -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            txt_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=txt_zip_job_dependencies,
-                    label='txt_zip_job'
-                )
-            )
-            xml_zip_jobs = []
-            xml_zip_job_dependencies = all_zip_jobs
-            cmd = 'cd "%s" && zip -m "%s"-ocr-xml.zip */*.xml -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            xml_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=xml_zip_job_dependencies,
-                    label='xml_zip_job'
-                )
-            )
-
-        '''
-        ' ##################################################
-        ' # Cleanup #
-        ' ##################################################
-        '''
-        cleanup_jobs = []
-        if self.keep_intermediates:
-            for index, job in enumerate(self.jobs):
-                cleanup_job_dependencies = [
-                    'hocr_to_tei_job_-_%i' % (index),
-                    'pdf_merge_job_-_%i' % (index),
-                    'txt_merge_job_-_%i' % (index)
-                ]
-                cmd = 'mv "%s"/*.hocr "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'hocr'),
-                )
-                cmd += ' && mv "%s"/*.pdf "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'pdf'),
-                )
-                cmd += ' && mv "%s"/*.tif "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'tiff'),
-                )
-                cmd += ' && mv "%s"/*.txt "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'txt'),
-                )
-                if not self.skip_binarisation:
-                    cmd += ' && mv "%s"/*.bin.png "%s"' % (
-                        os.path.join(job['output_dir'], 'tmp'),
-                        os.path.join(job['output_dir'], 'tmp', 'bin.png'),
-                    )
-                    cmd += ' && rm "%s"/*.nrm.png' % (
-                        os.path.join(job['output_dir'], 'tmp')
-                    )
-                cleanup_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies=cleanup_job_dependencies,
-                        label='cleanup_job_-_%i' % (index)
-                    )
-                )
-        else:
-            for index, job in enumerate(self.jobs):
-                cleanup_job_dependencies = [
-                    'hocr_to_tei_job_-_%i' % (index),
-                    'pdf_merge_job_-_%i' % (index),
-                    'txt_merge_job_-_%i' % (index)
-                ]
-                cmd = 'rm -r "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp')
-                )
-                cleanup_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies=cleanup_job_dependencies,
-                        label='cleanup_job_-_%i' % (index)
-                    )
-                )
+        '''
+        ' ##################################################
+        ' # setup output directory #
+        ' ##################################################
+        '''
+        setup_output_directory_tasks = []
+        for i, job in enumerate(self.jobs):
+            cmd = 'mkdir -p "{}"'.format(job.page_dir)
+            lbl = 'setup_output_directory_-_{}'.format(i)
+            task = self.addTask(command=cmd, label=lbl)
+            setup_output_directory_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # split input #
+        ' ##################################################
+        '''
+        split_input_tasks = []
+        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
+        for i, job in enumerate(self.jobs):
+            input_file = job.file
+            output_file = '{}/page-%d.tif'.format(job.page_dir)
+            cmd = 'gs'
+            cmd += ' -dBATCH'
+            cmd += ' -dNOPAUSE'
+            cmd += ' -dNumRenderingThreads={}'.format(n_cores)
+            cmd += ' -dQUIET'
+            cmd += ' -r300'
+            cmd += ' -sDEVICE=tiff24nc'
+            cmd += ' -sCompression=lzw'
+            cmd += ' "-sOutputFile={}"'.format(output_file)
+            cmd += ' "{}"'.format(input_file)
+            deps = 'setup_output_directory_-_{}'.format(i)
+            lbl = 'split_input_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl,
+                                nCores=n_cores)
+            split_input_tasks.append(task)
+
+        if self.binarize:
+            '''
+            ' ##################################################
+            ' # pre binarization #
+            ' ##################################################
+            '''
+            pre_binarization_tasks = []
+            for i, job in enumerate(self.jobs):
+                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
+                cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
+                deps = 'split_input_-_{}'.format(i)
+                lbl = 'pre_binarization_-_{}'.format(i)
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+                pre_binarization_tasks.append(task)
+
+            '''
+            ' ##################################################
+            ' # binarization #
+            ' ##################################################
+            '''
+            binarization_tasks = []
+            n_cores = self.getNCores()
+            mem_mb = self.getMemMb()
+            for i, job in enumerate(self.jobs):
+                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
+                cmd = 'ocropus-nlbin "@{}"'.format(input_file)
+                cmd += ' --nocheck'
+                cmd += ' --output "{}"'.format(job.page_dir)
+                cmd += ' --parallel "{}"'.format(n_cores)
+                deps = 'pre_binarization_-_{}'.format(i)
+                lbl = 'binarization_-_{}'.format(i)
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl,
+                                    memMb=mem_mb, nCores=n_cores)
+                binarization_tasks.append(task)
+
+            '''
+            ' ##################################################
+            ' # post binarization #
+            ' ##################################################
+            '''
+            post_binarization_tasks = []
+            for i, job in enumerate(self.jobs):
+                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
+                cmd = 'rm "{}"'.format(input_file)
+                cmd += ' && '
+                cmd += 'cd "{}"'.format(job.page_dir)
+                cmd += ' && '
+                cmd += 'rm *.{nrm.png,tif}'
+                cmd += ' && '
+                cmd += 'rename \'s/^0*/page-/\' *'
+                cmd += ' && '
+                cmd += 'cd -'
+                deps = 'binarization_-_{}'.format(i)
+                lbl = 'post_binarization_-_{}'.format(i)
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+                post_binarization_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # pre ocr #
+        ' ##################################################
+        '''
+        pre_ocr_tasks = []
+        for i, job in enumerate(self.jobs):
+            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
+            cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
+            deps = 'post_binarization_-_{}'.format(i) if self.binarize else 'split_input_-_{}'.format(i)  # noqa
+            lbl = 'pre_ocr_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            pre_ocr_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # ocr #
+        ' ##################################################
+        '''
+        ocr_tasks = []
+        n_cores = min(4, self.getNCores())
+        mem_mb = min(n_cores * 2048, self.getMemMb())
+        for i, job in enumerate(self.jobs):
+            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
+            output_file_base = os.path.join(job.output_dir, job.name)
+            cmd = 'tesseract "{}" "{}"'.format(input_file, output_file_base)
+            cmd += ' -l "{}"'.format(self.lang)
+            cmd += ' hocr pdf txt'
+            deps = 'pre_ocr_-_{}'.format(i)
+            lbl = 'ocr_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps,
+                                env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
+                                label=lbl, memMb=mem_mb, nCores=n_cores)
+            ocr_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # post ocr #
+        ' ##################################################
+        '''
+        post_ocr_tasks = []
+        for i, job in enumerate(self.jobs):
+            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
+            output_file_base = os.path.join(job.output_dir, job.name)
+            cmd = 'rm "{}"'.format(input_file)
+            cmd += ' && '
+            cmd += 'sed -i \'s+{}+pages+g\' "{}.hocr"'.format(job.page_dir, output_file_base)  # noqa
+            deps = 'ocr_-_{}'.format(i)
+            lbl = 'post_ocr_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            post_ocr_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # hocr to tei #
+        ' ##################################################
+        '''
+        hocr_to_tei_tasks = []
+        for i, job in enumerate(self.jobs):
+            output_file_base = os.path.join(job.output_dir, job.name)
+            cmd = 'hocrtotei "{}.hocr" "{}.xml"'.format(output_file_base, output_file_base)  # noqa
+            deps = 'post_ocr_-_{}'.format(i)
+            lbl = 'hocr_to_tei_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            hocr_to_tei_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # zip creation #
+        ' ##################################################
+        '''
+        zip_creation_tasks = []
+        if self.zip is not None:
+            # zip all files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.all.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = hocr_to_tei_tasks
+            lbl = 'zip_creation_-_all'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip PDF files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.pdf.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.pdf"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = ocr_tasks
+            lbl = 'zip_creation_-_pdf'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip TXT files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.txt.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.txt"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = ocr_tasks
+            lbl = 'zip_creation_-_txt'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip XML files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.xml.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.xml"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = hocr_to_tei_tasks
+            lbl = 'zip_creation_-_xml'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip PoCo bundles
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.poco.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = post_ocr_tasks
+            lbl = 'zip_creation_-_poco'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)

-def analyze_jobs(input_dir, output_dir):
+def collect_jobs(input_dir, output_dir):
     jobs = []
     for file in os.listdir(input_dir):
         if os.path.isdir(os.path.join(input_dir, file)):
-            jobs += analyze_jobs(
-                os.path.join(input_dir, file),
-                os.path.join(output_dir, file)
-            )
-        elif file.endswith(('.pdf', '.tif', '.tiff')):
-            jobs.append(
-                {
-                    'filename': file,
-                    'name': file.rsplit('.', 1)[0],
-                    'output_dir': os.path.join(output_dir, file),
-                    'path': os.path.join(input_dir, file)
-                }
-            )
+            continue
+        if file.lower().endswith('.pdf'):
+            job = OCRPipelineJob(os.path.join(input_dir, file),
+                                 os.path.join(output_dir, file))
+            jobs.append(job)
     return jobs

+def parse_args():
+    parser = ArgumentParser(description='OCR pipeline for PDF file processing',
+                            prog='OCR pipeline')
+    parser.add_argument('-i', '--input-dir',
+                        help='Input directory',
+                        required=True)
+    parser.add_argument('-o', '--output-dir',
+                        help='Output directory',
+                        required=True)
+    parser.add_argument('-l', '--language',
+                        choices=list(map(lambda x: x[:-12], filter(lambda x: x.endswith('.traineddata'), os.listdir('/usr/local/share/tessdata')))),  # noqa
+                        help='Language of the input '
+                             '(3-character ISO 639-2 language codes)',
+                        required=True)
+    parser.add_argument('--binarize',
+                        action='store_true',
+                        help='Add binarization as a preprocessing step')
+    parser.add_argument('--log-dir',
+                        help='Logging directory')
+    parser.add_argument('--mem-mb',
+                        help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa
+                        type=int)
+    parser.add_argument('--n-cores',
+                        default=min(4, multiprocessing.cpu_count()),
+                        help='Number of CPU threads to be used (Default: min(4, number of CPUs))',  # noqa
+                        type=int)
+    parser.add_argument('--zip',
+                        help='Create one zip file per filetype')
+    parser.add_argument('-v', '--version',
+                        action='version',
+                        help='Returns the current version of the OCR pipeline',
+                        version='%(prog)s {}'.format(__version__))
+    args = parser.parse_args()
+
+    # Set some tricky default values and check for insufficient input
+    if args.log_dir is None:
+        args.log_dir = args.output_dir
+    if args.n_cores < 1:
+        raise Exception('--n-cores must be greater or equal 1')
+    if args.mem_mb is None:
+        max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
+        args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
+    if args.mem_mb < 2048:
+        raise Exception('--mem-mb must be greater or equal 2048')
+    if args.zip is not None and args.zip.lower().endswith('.zip'):
+        # Remove .zip file extension if provided
+        args.zip = args.zip[:-4]
+        args.zip = args.zip if args.zip else 'output'
+    return args

 def main():
-    args = parse_arguments()
-
-    wflow = OCRWorkflow(args)
-
-    retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores)
-
+    args = parse_args()
+    ocr_pipeline = OCRPipeline(args.input_dir, args.language, args.output_dir, args.binarize, args.zip)  # noqa
+    retval = ocr_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa
     sys.exit(retval)
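The key structural change in this rewrite is that the old global `waitForTasks()` barriers are gone: every task now names its own predecessor via `dependencies`, so pyFlow can overlap stages of different jobs. A stripped-down sketch of that pattern, under the assumption of pyFlow's documented `WorkflowRunner` API (Python 2.7, as pyFlow requires; the `echo` commands are placeholders):

```python
#!/usr/bin/env python2.7
from pyflow import WorkflowRunner


class TwoStagePipeline(WorkflowRunner):
    """Toy workflow: one 'split' task per job, each with a dependent 'ocr' task."""

    def __init__(self, jobs):
        self.jobs = jobs

    def workflow(self):
        for i, job in enumerate(self.jobs):
            split = self.addTask('split_-_{}'.format(i),
                                 command='echo split "{}"'.format(job))
            # The second stage only waits for *its own* predecessor,
            # not for every split task in the workflow.
            self.addTask('ocr_-_{}'.format(i),
                         command='echo ocr "{}"'.format(job),
                         dependencies=split)


if __name__ == '__main__':
    import sys
    wflow = TwoStagePipeline(['a.pdf', 'b.pdf'])
    sys.exit(wflow.run(mode='local', nCores=2))
```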

wrapper/ocr

@@ -1,39 +1,38 @@
 #!/usr/bin/env python3
 # coding=utf-8

-import argparse
+from argparse import ArgumentParser
 import os
 import subprocess
+import sys

-container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest'
-container_input_dir = '/input'
-container_output_dir = '/output'
-uid = str(os.getuid())
-gid = str(os.getgid())
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0'
+CONTAINER_INPUT_DIR = '/input'
+CONTAINER_OUTPUT_DIR = '/output'
+CONTAINER_LOG_DIR = '/logs'
+UID = str(os.getuid())
+GID = str(os.getgid())

-parser = argparse.ArgumentParser(add_help=False)
-parser.add_argument(
-    '-i',
-    dest='input_dir',
-    required=False
-)
-parser.add_argument(
-    '-o',
-    dest='output_dir',
-    required=False
-)
+parser = ArgumentParser(add_help=False)
+parser.add_argument('-i', '--input-dir')
+parser.add_argument('-o', '--output-dir')
+parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()

-cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid]
+cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
 if args.input_dir is not None:
-    host_input_dir = os.path.abspath(args.input_dir)
-    cmd += ['-v', host_input_dir + ':' + container_input_dir]
-    remaining_args += ['-i', container_input_dir]
+    mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR
+    cmd += ['-v', mapping]
+    remaining_args += ['-i', CONTAINER_INPUT_DIR]
 if args.output_dir is not None:
-    host_output_dir = os.path.abspath(args.output_dir)
-    cmd += ['-v', host_output_dir + ':' + container_output_dir]
-    remaining_args += ['-o', container_output_dir]
-cmd.append(container_image)
+    mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR
+    cmd += ['-v', mapping]
+    remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
+if args.log_dir is not None:
+    mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR
+    cmd += ['-v', mapping]
+    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
+cmd.append(CONTAINER_IMAGE)
 cmd += remaining_args

-subprocess.run(cmd)
+sys.exit(subprocess.run(cmd).returncode)
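To make the wrapper's behaviour concrete: invoked as `ocr -i input -o output -l eng` from `/data`, it assembles roughly the following argument list before handing off to Docker. The `-l eng` option is not consumed by the wrapper's parser, so it stays in `remaining_args` and is passed through to the pipeline inside the container (UID/GID values here are illustrative):

```python
# Hypothetical result of the wrapper's cmd assembly for
# `ocr -i input -o output -l eng` run from /data as uid/gid 1000:
cmd = ['docker', 'run', '--rm', '-it', '-u', '1000:1000',
       '-v', '/data/input:/input',
       '-v', '/data/output:/output',
       'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0',
       '-l', 'eng', '-i', '/input', '-o', '/output']
```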