Compare commits

24 Commits

| SHA1 | Message | Date |
| --- | --- | --- |
| 613bceb4ff | Add new models | 2021-02-23 11:11:50 +01:00 |
| ca7df6d0ed | First work on version 1.0.0 | 2021-02-19 13:04:03 +01:00 |
| 07635dcdfa | Use "buster" instead of "10" in FROM | 2020-10-08 23:17:48 +02:00 |
| c0069d5453 | Use new Dockerfile structure | 2020-10-08 23:09:10 +02:00 |
| e941f64ee4 | test new ci config | 2020-10-07 16:44:38 +02:00 |
| cb68d6de2d | One thread per page ocr patch | 2020-10-07 13:46:22 +02:00 |
| 4b84488fe6 | fix gitlab ci | 2020-09-23 16:58:07 +02:00 |
| 7d52ad9f68 | Update | 2020-09-23 15:52:24 +02:00 |
| ac4b5c2fd8 | Add possibility to use an intermediate dir | 2020-09-22 17:44:32 +02:00 |
| 6d90d43699 | fix cleanup attempt | 2020-09-21 15:36:03 +02:00 |
| 4bd0d3bb01 | Use commit_sha for intermediate image | 2020-09-21 15:02:04 +02:00 |
| 15061bfaaf | add tag to clean stage | 2020-09-21 15:00:09 +02:00 |
| 7cc8ebd666 | compile tesseract in container | 2020-09-21 14:46:03 +02:00 |
| 82285a8e6c | better multithreading | 2020-07-02 11:49:35 +02:00 |
| 7322a5bc7c | More GhostScript, less dependencies! | 2020-07-02 11:47:43 +02:00 |
| 2b63ba9e59 | Remove unused dependencies and use ghostscript for image split | 2020-07-01 11:03:34 +02:00 |
| aee9628e5e | fix pipeline | 2020-06-23 15:19:27 +02:00 |
| ec5b4eb521 | Add PDF compression | 2020-06-16 09:31:34 +02:00 |
| b77ca5914f | Set relative file paths in hocr | 2020-06-10 11:48:58 +02:00 |
| 018939ae55 | Add PoCo zips part 1 | 2020-06-09 16:58:22 +02:00 |
| 64fe706126 | Keep uncompressed output files after zip jobs. | 2020-05-13 09:11:01 +02:00 |
| a75b32ca1d | Bump versions | 2020-04-06 09:21:52 +02:00 |
| 364e3d626d | Fix zip creation | 2020-04-04 15:37:21 +02:00 |
| 36a86887b0 | Update OCR Pipeline | 2020-04-03 17:35:30 +02:00 |
6 changed files with 575 additions and 627 deletions

.gitlab-ci.yml

```diff
@@ -1,44 +1,71 @@
-image: docker:stable
-
-services:
-  - docker:stable-dind
+image: docker:19.03.13

 variables:
-  DOCKER_DRIVER: overlay2
+  DOCKER_TLS_CERTDIR: "/certs"
+
+services:
+  - docker:19.03.13-dind

 stages:
   - build
   - push

-before_script:
-  - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+.reg_setup:
+  before_script:
+    - apk add --no-cache curl
+    - curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg
+    - echo "$REG_SHA256  /usr/local/bin/reg" | sha256sum -c -
+    - chmod a+x /usr/local/bin/reg
+  variables:
+    REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228
+    REG_VERSION: 0.16.1

-Build:
+build_image:
   script:
-    - docker build --pull -t $CI_REGISTRY_IMAGE:tmp .
-    - docker push $CI_REGISTRY_IMAGE:tmp
+    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+    - docker build -t $INTERMEDIATE_IMAGE_TAG .
+    - docker push $INTERMEDIATE_IMAGE_TAG
   stage: build
   tags:
     - docker
+  variables:
+    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA

-Push latest:
+push_master:
+  extends:
+    - .reg_setup
   only:
     - master
   script:
-    - docker pull $CI_REGISTRY_IMAGE:tmp
-    - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest
-    - docker push $CI_REGISTRY_IMAGE:latest
+    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+    - docker pull $INTERMEDIATE_IMAGE_TAG
+    - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
+    - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
+    - docker push $IMAGE_TAG
   stage: push
   tags:
     - docker
+  variables:
+    IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
+    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA

-Push tag:
+push_other:
+  extends:
+    - .reg_setup
+  except:
+    - master
   only:
+    - branches
     - tags
   script:
-    - docker pull $CI_REGISTRY_IMAGE:tmp
-    - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
-    - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
+    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+    - docker pull $INTERMEDIATE_IMAGE_TAG
+    - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
+    - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
+    - docker push $IMAGE_TAG
   stage: push
   tags:
     - docker
+  variables:
+    IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
+    INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
```
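For reference, the promotion step that both push jobs share can be reproduced by hand. A minimal sketch, assuming placeholder registry host, credentials, and commit SHA (none of these literals appear in the repository):

```bash
# Pull the commit-tagged intermediate image built by build_image.
docker pull registry.example.com/group/ocr:$COMMIT_SHA
# Delete the intermediate tag from the registry with genuinetools/reg.
/usr/local/bin/reg rm -d --auth-url registry.example.com \
    -u "$REGISTRY_USER" -p "$REGISTRY_PASSWORD" \
    registry.example.com/group/ocr:$COMMIT_SHA
# Re-tag the local copy under its final name and push it.
docker tag registry.example.com/group/ocr:$COMMIT_SHA registry.example.com/group/ocr:latest
docker push registry.example.com/group/ocr:latest
```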

Dockerfile

```diff
@@ -1,73 +1,101 @@
-FROM debian:9-slim
+FROM debian:buster-slim

-# Define image metadata
-LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
+LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"

 ENV LANG=C.UTF-8

-# Install prerequisites
-RUN apt-get update \
- && apt-get install -y --no-install-recommends \
-    apt-transport-https \
-    ca-certificates \
-    gnupg2 \
-    imagemagick \
-    poppler-utils \
-    python2.7 \
-    python3.5 \
-    wget \
-    zip \
- && rm -rf /var/lib/apt/lists/*
+RUN apt-get update

-ENV OCROPY_VERSION 1.3.3
-ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" .
-RUN tar -xzf "v${OCROPY_VERSION}.tar.gz" \
- && cd "ocropy-${OCROPY_VERSION}" \
- && apt-get update \
- && apt-get install -y --no-install-recommends \
+# Install pipeline dependencies #
+## Install pyFlow ##
+ENV PYFLOW_RELEASE=1.1.20
+ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
+RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
+ && cd "pyflow-${PYFLOW_RELEASE}" \
+ && apt-get install --no-install-recommends --yes \
+    python2.7 \
+ && python2.7 setup.py build install \
+ && cd .. \
+ && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
+
+## Install ocropy ##
+ENV OCROPY_RELEASE=1.3.3
+ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
+RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
+ && cd "ocropy-${OCROPY_RELEASE}" \
+ && apt-get install --no-install-recommends --yes \
     python-pil \
     python-tk \
     $(cat PACKAGES) \
- && rm -rf /var/lib/apt/lists/* \
  && python2.7 setup.py install \
  && cd .. \
- && rm -rf \
-    "ocropy-${OCROPY_VERSION}" \
-    "v${OCROPY_VERSION}.tar.gz"
+ && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"

-ENV PYFLOW_VERSION=1.1.20
-ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" .
-RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
- && cd "pyflow-${PYFLOW_VERSION}" \
- && python2.7 setup.py build install \
- && cd .. \
- && rm -rf \
-    "pyflow-${PYFLOW_VERSION}" \
-    "pyflow-${PYFLOW_VERSION}.tar.gz"
+## Install Tesseract OCR ##
+ENV TESSERACT_RELEASE=4.1.1
+ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
+RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
+ && cd "tesseract-${TESSERACT_RELEASE}" \
+ && apt-get install --no-install-recommends --yes \
+    autoconf \
+    automake \
+    g++ \
+    libjpeg62-turbo-dev \
+    libleptonica-dev \
+    libtiff5-dev \
+    libtool \
+    libpng-dev \
+    make \
+    pkg-config \
+    zlib1g-dev \
+ && ./autogen.sh \
+ && ./configure \
+ && make \
+ && make install \
+ && ldconfig \
+ && cd - > /dev/null \
+ && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"

-RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list \
- && wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \
- && apt-get update \
- && apt-get install -y --no-install-recommends \
-    tesseract-ocr \
-    tesseract-ocr-deu \
-    tesseract-ocr-eng \
-    tesseract-ocr-enm \
-    tesseract-ocr-fra \
-    tesseract-ocr-frk \
-    tesseract-ocr-frm \
-    tesseract-ocr-ita \
-    tesseract-ocr-por \
-    tesseract-ocr-spa \
- && rm -rf /var/lib/apt/lists/*
+ENV TESSDATA_BEST_RELEASE=4.1.0
+ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" .
+RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ara.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/chi_tra.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/dan.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/deu.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ell.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/eng.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/enm.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/fra.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frk.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frm.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ita.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/por.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/rus.traineddata" "/usr/local/share/tessdata/" \
+ && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/spa.traineddata" "/usr/local/share/tessdata/" \
+ && rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz"
+
+## Further dependencies ##
+RUN apt-get install --no-install-recommends --yes \
+    ghostscript \
+    python-pip \
+    python3.7 \
+    zip \
+ && pip install natsort
+
+RUN rm -r /var/lib/apt/lists/*

-# Install OCR pipeline
-COPY hocrtotei /usr/local/bin
-COPY ocr /usr/local/bin
+## Install Pipeline ##
+COPY hocrtotei ocr /usr/local/bin/

 ENTRYPOINT ["ocr"]
```
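One way to sanity-check the rewritten Dockerfile is to build it locally and probe the installed toolchain. The image tag below is arbitrary and the probe commands are just an example:

```bash
# Build from the repository root; "ocr:dev" is a local throwaway tag.
docker build -t ocr:dev .
# The image's ENTRYPOINT is the pipeline itself, so override it to inspect.
docker run --rm --entrypoint /bin/bash ocr:dev -c \
    'tesseract --version && ls /usr/local/share/tessdata && which ocropus-nlbin hocrtotei ocr'
```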

README.md

````diff
@@ -1,66 +1,53 @@
-# OCR
+# OCR - Optical Character Recognition

-## Build image
+This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided.

-1. Clone this repository and navigate into it:
-```
-git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr
-```
+## Software used in this pipeline implementation

-2. Build image:
-```
-docker build -t sfb1288inf/ocr:latest .
-```
+- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
+- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
+- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
+- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
+- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0

-Alternatively build from the GitLab repository without cloning:
-1. Build image:
-```
-docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
-```
+## Use this image

-## Download prebuilt image
-
-The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
-1. Download image:
-```
-docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest
-```
-
-## Run
-
-1. Create input and output directories for the OCR software:
-```
-mkdir -p /<mydatalocation>/files_for_ocr /<mydatalocation>/files_from_ocr
+1. Create input and output directories for the pipeline.
+``` bash
+mkdir -p /<my_data_location>/input /<my_data_location>/output
 ```

-2. Place your files inside the `/<mydatalocation>/files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. Files should all contain text of the same language.
+2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.

-3. Start the OCR process.
+3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
 ```
+# Option one: Use the wrapper script
+## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/1.0.0/wrapper/ocr, make it executeable and add it to your ${PATH}
+cd /<my_data_location>
+ocr -i input -l <language_code> -o output <optional_pipeline_arguments>
+
+# Option two: Classic Docker style
 docker run \
     --rm \
     -it \
     -u $(id -u $USER):$(id -g $USER) \
-    -v /<mydatalocation>/files_for_ocr:/input \
-    -v /<mydatalocation>/files_from_ocr:/output \
-    sfb1288inf/ocr:latest \
+    -v /<my_data_location>/input:/input \
+    -v /<my_data_location>/output:/output \
+    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
         -i /input \
-        -l <languagecode> \
-        -o /output
+        -l <language_code> \
+        -o /output \
+        <optional_pipeline_arguments>
 ```
-The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part.
-If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`.

-4. Check your results in the `/<mydatalocation>/files_from_ocr` directory.
+4. Check your results in the `/<my_data_location>/output` directory.

-### OCR arguments
+### Pipeline arguments

 `-l languagecode`
 * Tells tesseract which language will be used.
-* options = deu (German), eng (English), enm (Middle englisch), fra (French), frk (German Fraktur), frm (Middle french), ita (Italian), por (Portuguese), spa (Spanish)
+* options = ara (Arabic), chi_tra (Chinese - Traditional), dan (Danish), deu (German), ell (Greek, Modern (1453-)), eng (English), enm (Middle englisch), fra (French), frk (German Fraktur), frm (Middle french), ita (Italian), por (Portuguese), rus (Russian), spa (Spanish)
 * required = True

 `--keep-intermediates`
@@ -78,15 +65,15 @@ kept.
 * Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
 * default = False

-Example with all arguments used:
-```
+``` bash
+# Example with all arguments used
 docker run \
     --rm \
     -it \
     -u $(id -u $USER):$(id -g $USER) \
-    -v "$HOME"/ocr/files_for_ocr:/input \
-    -v "$HOME"/ocr/files_from_ocr:/output \
-    sfb1288inf/ocr:latest \
+    -v "$HOME"/ocr/input:/input \
+    -v "$HOME"/ocr/output:/output \
+    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
         -i /input \
         -l eng \
         -o /output \
````
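The wrapper installation mentioned in the new README's step 3 is a one-time download. A possible sequence, assuming the install location is any directory on your `${PATH}`:

```bash
# Fetch the wrapper script referenced in the README and make it executable.
wget https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/1.0.0/wrapper/ocr
chmod +x ocr
sudo mv ocr /usr/local/bin/  # or any other directory on your ${PATH}
```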

hocrtotei

```diff
@@ -1,26 +1,18 @@
-#!/usr/bin/env python3.5
+#!/usr/bin/env python3.7
 # coding=utf-8

+""""Merges hOCR files into a TEI file."""
+
 from xml.sax.saxutils import escape
-import argparse
+from argparse import ArgumentParser
 import xml.etree.ElementTree as ET

-parser = argparse.ArgumentParser(
-    description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
-)
-parser.add_argument(
-    'i',
-    metavar='hOCR-sourcefile',
-    nargs='+'
-)
-parser.add_argument(
-    'o',
-    metavar='TEI-destfile',
-)
+parser = ArgumentParser(description='Merges hOCR files into a TEI file.')
+parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+')
+parser.add_argument('o', metavar='TEI-destfile',)
 args = parser.parse_args()

 output_file = open(args.o, 'w')
 output_file.write(
     '<?xml version="1.0" encoding="UTF-8"?>\n'
     + '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
@@ -54,5 +46,4 @@
     ' </body>\n'
     + ' </text>\n'
     + '</TEI>')
-
 output_file.close()
```
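The script's argparse interface takes any number of hOCR input files followed by a single output path, so a typical call looks like the sketch below (file names are illustrative):

```bash
# Merge per-page hOCR output, in the given order, into one TEI document.
hocrtotei page-1.hocr page-2.hocr page-3.hocr book.xml
```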

ocr

```diff
@@ -1,556 +1,467 @@
 #!/usr/bin/env python2.7
 # coding=utf-8

-"""
-ocr
+"""An OCR pipeline for PDF file processing."""

-Usage: For usage instructions run with option --help
-Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
-"""
+__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>'
+__version__ = '1.0.0'

-import argparse
+from argparse import ArgumentParser
+from natsort import natsorted
+from pyflow import WorkflowRunner
 import multiprocessing
 import os
-import re
 import sys
-from pyflow import WorkflowRunner
+import tempfile

+TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa']  # noqa

-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description='''Performs OCR of (historical) documents utilizing OCRopus
-                    for preprocessing and Tesseract OCR for OCR. The results
-                    are served as hOCR, PDF, raw text and TEI compliant XML
-                    files.\n
-                    Software requirements: imagemagick, ocropus, pdftoppm,
-                    pdfunite, poppler-utils, pyflow, python2.7, python3.5,
-                    tesseract'''
-    )
-    parser.add_argument(
-        '-i',
-        dest='input_dir',
-        required=True
-    )
-    parser.add_argument(
-        '-l',
-        choices=[
-            'deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa'
-        ],
-        dest='lang',
-        required=True
-    )
-    parser.add_argument(
-        '-o',
-        dest='output_dir',
-        required=True
-    )
-    parser.add_argument(
-        '--skip-binarisation',
-        action='store_true',
-        default=False,
-        dest='skip_binarisation',
-        help='skip ocropy binarisation',
-        required=False
-    )
-    parser.add_argument(
-        '--keep-intermediates',
-        action='store_true',
-        default=False,
-        dest='keep_intermediates',
-        help='keep intermediate files',
-        required=False
-    )
-    parser.add_argument(
-        '--nCores',
-        default=min(4, multiprocessing.cpu_count()),
-        dest='n_cores',
-        help='total number of cores available',
-        required=False,
-        type=int
-    )
-    parser.add_argument(
-        '--zip',
-        default='ocr-result-files',
-        dest='zip',
-        type=str,
-        help='package result files in zip bundles and asign an filename prefix',
-        required=False
-    )
+
+def parse_args():
+    parser = ArgumentParser(
+        description='An OCR pipeline for PDF file processing.',
+        prog='OCR pipeline'
+    )
+    parser.add_argument('-i', '--input-directory',
+                        help='Input directory (only PDF files get processed)',
+                        required=True)
+    parser.add_argument('-o', '--output-directory',
+                        help='Output directory',
+                        required=True)
+    parser.add_argument('-l', '--language',
+                        choices=TESSERACT_MODELS,
+                        required=True)
+    parser.add_argument('--binarize',
+                        action='store_true',
+                        help='Use ocropy binarisation as preprocessing step.')
+    parser.add_argument('--log-dir')
+    parser.add_argument('--n-cores',
+                        default=min(4, multiprocessing.cpu_count()),
+                        help='Total number of cores available.',
+                        type=int)
+    parser.add_argument('--intermediate-directory')
+    parser.add_argument('--zip',
+                        help='Zips all results in different archives depending'
+                             ' on result types. Also zips everything into one '
+                             'archive.')
+    parser.add_argument('-v', '--version',
+                        action='version',
+                        version='%(prog)s {}'.format(__version__))
     return parser.parse_args()


-class OCRWorkflow(WorkflowRunner):
-    def __init__(self, args):
-        self.jobs = analyze_jobs(args.input_dir, args.output_dir)
-        self.skip_binarisation = args.skip_binarisation
-        self.keep_intermediates = args.keep_intermediates
-        self.lang = args.lang
-        self.n_cores = args.n_cores
-        self.output_dir = args.output_dir
-        self.zip = args.zip
+class OCRPipelineJob:
+    """An OCR pipeline job class
+
+    Each input file of the pipeline is represented as an OCR pipeline job,
+    which holds all necessary information for the pipeline to process it.
+
+    Arguments:
+    file -- Path to the file
+    output_dir -- Path to a directory, where job results a stored
+    intermediate_dir -- Path to a directory, where intermediate files are
+                        stored.
+    """
+
+    def __init__(self, file, output_dir, intermediate_dir):
+        self.file = file
+        self.intermediate_dir = intermediate_dir
+        self.name = os.path.basename(file).rsplit('.', 1)[0]
+        self.output_dir = output_dir
+
+
+class OCRPipeline(WorkflowRunner):
+    def __init__(self, input_dir, lang, output_dir, binarize, intermediate_dir,
+                 n_cores, zip):
+        self.input_dir = input_dir
+        self.lang = lang
+        self.output_dir = output_dir
+        self.binarize = binarize
+        if intermediate_dir is None:
+            self.intermediate_dir = os.path.join(output_dir, 'tmp')
+        else:
+            self.intermediate_dir = tempfile.mkdtemp(dir=intermediate_dir)
+        self.n_cores = n_cores
+        if zip is None:
+            self.zip = zip
+        else:
+            if zip.lower().endswith('.zip'):
+                # Remove .zip file extension if provided
+                self.zip = zip[:-4]
+                self.zip = self.zip if self.zip else 'output'
+            else:
+                self.zip = zip
+        self.jobs = collect_jobs(self.input_dir,
+                                 self.output_dir,
+                                 self.intermediate_dir)

     def workflow(self):
-        if len(self.jobs) == 0:
+        if not self.jobs:
             return

         '''
         ' ##################################################
-        ' # Create output directories                      #
+        ' # setup output directory                         #
         ' ##################################################
         '''
-        create_output_directories_jobs = []
-        for index, job in enumerate(self.jobs):
-            cmd = 'mkdir -p "%s"' % (
-                os.path.join(job['output_dir'], 'tmp')
-            )
-            if self.keep_intermediates:
-                cmd += ' "%s" "%s" "%s" "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp', 'hocr'),
-                    os.path.join(job['output_dir'], 'tmp', 'pdf'),
-                    os.path.join(job['output_dir'], 'tmp', 'tiff'),
-                    os.path.join(job['output_dir'], 'tmp', 'txt')
-                )
-            if not self.skip_binarisation:
-                cmd += ' "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp', 'bin.png')
-                )
-            create_output_directories_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    label='create_output_directories_job_-_%i' % (index)
-                )
-            )
+        setup_output_directory_tasks = []
+        for i, job in enumerate(self.jobs):
+            cmd = 'mkdir'
+            cmd += ' -p'
+            cmd += ' "{}"'.format(job.intermediate_dir)
+            cmd += ' "{}"'.format(os.path.join(job.output_dir, 'poco'))
+            lbl = 'setup_output_directory_-_{}'.format(i)
+            task = self.addTask(command=cmd, label=lbl)
+            setup_output_directory_tasks.append(task)

         '''
         ' ##################################################
-        ' # Split                                          #
+        ' # split input                                    #
         ' ##################################################
         '''
-        split_jobs = []
-        split_job_n_cores = min(
-            self.n_cores,
-            max(1, int(self.n_cores / len(self.jobs)))
-        )
-        for index, job in enumerate(self.jobs):
-            if job['filename'].endswith(('.tif', '.tiff')):
-                '''
-                ' This command also works for PDF input but ocropus-nlbin
-                ' is not able to handle the TIFF output of it.
-                '''
-                cmd = 'convert -density 300 "%s" -compress LZW -scene 1 "%s/page-%%d.tif"' % (
-                    job['path'],
-                    os.path.join(job['output_dir'], 'tmp')
-                )
-            else:
-                cmd = 'pdftoppm -r 300 -tiff -tiffcompression lzw "%s" "%s"' % (
-                    job['path'],
-                    os.path.join(job['output_dir'], 'tmp', 'page')
-                )
-            split_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies='create_output_directories_job_-_%i' % (index),
-                    label='split_job_-_%i' % (index),
-                    nCores=split_job_n_cores
-                )
-            )
+        split_input_tasks = []
+        n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
+        for i, job in enumerate(self.jobs):
+            input_file = job.file
+            output_file = '{}/page-%d.tif'.format(job.intermediate_dir)
+            cmd = 'gs'
+            cmd += ' -dBATCH'
+            cmd += ' -dNOPAUSE'
+            cmd += ' -dNumRenderingThreads={}'.format(n_cores)
+            cmd += ' -dQUIET'
+            cmd += ' -r300'
+            cmd += ' -sDEVICE=tiff24nc'
+            cmd += ' -sCompression=lzw'
+            cmd += ' "-sOutputFile={}"'.format(output_file)
+            cmd += ' "{}"'.format(input_file)
+            deps = 'setup_output_directory_-_{}'.format(i)
+            lbl = 'split_input_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+            split_input_tasks.append(task)

-        if not self.skip_binarisation:
+        if self.binarize:
             '''
-            ' The binarisation_jobs are created based of the output files of
-            ' the split_jobs. So wait until they are finished.
+            ' The binarization_tasks list is created based on the output files
+            ' of the split_tasks. So wait until they are finished.
             '''
             self.waitForTasks()

             '''
             ' ##################################################
-            ' # Binarise                                       #
+            ' # binarization                                    #
             ' ##################################################
             '''
-            binarisation_jobs = []
+            binarization_tasks = []
             '''
             ' We run ocropus-nlbin with either four or, if there are less then
             ' four cores available for this workflow, the available core
             ' number.
             '''
-            binarisation_job_n_cores = min(4, self.n_cores)
-            for index, job in enumerate(self.jobs):
-                files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-                files = filter(lambda x: x.endswith('.tif'), files)
-                files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-                files = map(
-                    lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                    files
-                )
-                cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    binarisation_job_n_cores,
-                    ' '.join(files)
-                )
-                binarisation_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies='split_job_-_%i' % (index),
-                        label='binarisation_job_-_%i' % (index),
-                        nCores=binarisation_job_n_cores
-                    )
-                )
+            n_cores = min(4, self.n_cores)
+            for i, job in enumerate(self.jobs):
+                input_dir = job.intermediate_dir
+                output_dir = job.intermediate_dir
+                files = filter(lambda x: x.endswith('.tif'), os.listdir(input_dir))  # noqa
+                files = natsorted(files)
+                files = map(lambda x: os.path.join(input_dir, x), files)
+                cmd = 'ocropus-nlbin "{}"'.format('" "'.join(files))
+                cmd += ' --nocheck'
+                cmd += ' --output "{}"'.format(output_dir)
+                cmd += ' --parallel "{}"'.format(n_cores)
+                print(cmd)
+                deps = 'split_input_-_{}'.format(i)
+                lbl = 'binarization_-_{}'.format(i)
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+                binarization_tasks.append(task)

-            '''
-            ' The post_binarisation_jobs are created based of the output files
-            ' of the binarisation_jobs. So wait until they are finished.
-            '''
             self.waitForTasks()

             '''
             ' ##################################################
-            ' # Normalise file names from binarisation         #
+            ' # Renaming of binarization output files          #
             ' ##################################################
             '''
-            post_binarisation_jobs = []
-            for index, job in enumerate(self.jobs):
-                number = 0
-                files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-                files = filter(lambda x: x.endswith('.bin.png'), files)
-                files.sort()
+            for i, job in enumerate(self.jobs):
+                input_dir = job.intermediate_dir
+                output_dir = job.intermediate_dir
+                files = filter(lambda x: x.endswith('.bin.png'), os.listdir(input_dir))  # noqa
                 for file in files:
-                    cmd = 'mv "%s" "%s"' % (
-                        os.path.join(job['output_dir'], 'tmp', file),
-                        os.path.join(
-                            job['output_dir'],
-                            'tmp',
-                            'page-%i.bin.png' % (int(file.split('.', 1)[0]))
-                        )
-                    )
-                    post_binarisation_jobs.append(
-                        self.addTask(
-                            command=cmd,
-                            dependencies='binarisation_job_-_%i' % (index),
-                            label='post_binarisation_job_-_%i-%i' % (
-                                index,
-                                number
-                            )
-                        )
-                    )
-                    number += 1
+                    # int conversion is done in order to trim leading zeros
+                    page_number = int(file.split('.', 1)[0])
+                    output_file = 'page-{}.bin.png'.format(page_number)
+                    os.rename(os.path.join(output_dir, file),
+                              os.path.join(output_dir, output_file))

         '''
-        ' The ocr_jobs are created based of the output files of either the
-        ' split_jobs or post_binarisation_jobs. So wait until they are
+        ' The ocr_tasks are created based of the output files of either the
+        ' split_tasks or binarization_tasks. So wait until they are
         ' finished.
         '''
         self.waitForTasks()

         '''
         ' ##################################################
-        ' # Optical Character Recognition                  #
+        ' # ocr                                            #
         ' ##################################################
         '''
-        ocr_jobs = []
-        '''
-        ' Tesseract runs fastest with four cores. So we run it with either four
-        ' or, if there are less then four cores available for this workflow,
-        ' the available core number.
-        '''
-        ocr_job_n_cores = min(4, self.n_cores)
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            if self.skip_binarisation:
-                files = filter(lambda x: x.endswith('.tif'), files)
-            else:
-                files = filter(lambda x: x.endswith('.bin.png'), files)
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: os.path.join(job['output_dir'], 'tmp', x),
-                files
-            )
-            number = 0
-            for file in files:
-                cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
-                    file,
-                    os.path.join(
-                        job['output_dir'],
-                        'tmp',
-                        file.rsplit('.', 1 if self.skip_binarisation else 2)[0]
-                    ),
-                    self.lang
-                )
-                if self.skip_binarisation:
-                    ocr_job_dependencies = 'split_job_-_%i' % (index)
-                else:
-                    ocr_job_dependencies = filter(
-                        lambda x: x == 'post_binarisation_job_-_%i-%i' % (
-                            index,
-                            number
-                        ),
-                        post_binarisation_jobs
-                    )
-                ocr_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies=ocr_job_dependencies,
-                        label='ocr_job_-_%i-%i' % (index, number),
-                        nCores=ocr_job_n_cores
-                    )
-                )
-                number += 1
+        ocr_tasks = []
+        for i, job in enumerate(self.jobs):
+            input_dir = job.intermediate_dir
+            output_dir = job.intermediate_dir
+            files = os.listdir(input_dir)
+            if self.binarize:
+                deps = 'binarization_-_{}'.format(i)
+                files = filter(lambda x: x.endswith('.bin.png'), files)
+            else:
+                deps = 'split_input_-_{}'.format(i)
+                files = filter(lambda x: x.endswith('.tif'), files)
+            files = natsorted(files)
+            files = map(lambda x: os.path.join(input_dir, x), files)
+            for j, file in enumerate(files):
+                if self.binarize:
+                    output_file_base = os.path.join(output_dir, file.rsplit('.', 2)[0])  # noqa
+                else:
+                    output_file_base = os.path.join(output_dir, file.rsplit('.', 1)[0])  # noqa
+                cmd = 'tesseract "{}" "{}"'.format(file, output_file_base)
+                cmd += ' -l "{}"'.format(self.lang)
+                cmd += ' hocr pdf txt'
+                cmd += ' && '
+                cmd += 'sed -i \'s+{}/++g\' "{}".hocr'.format(input_dir, output_file_base)  # noqa
+                lbl = 'ocr_-_{}-{}'.format(i, j)
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl, env={"OMP_THREAD_LIMIT": "1"})  # noqa
+                ocr_tasks.append(task)

         '''
         ' The following jobs are created based of the output files of the
-        ' ocr_jobs. So wait until they are finished.
+        ' ocr_tasks. So wait until they are finished.
         '''
         self.waitForTasks()

         '''
         ' ##################################################
-        ' # Create TEI P5 files                            #
+        ' # combined pdf creation                          #
         ' ##################################################
         '''
-        hocr_to_tei_jobs = []
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            files = filter(lambda x: x.endswith('.hocr'), files)
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                files
-            )
-            cmd = 'hocrtotei %s "%s"' % (
-                ' '.join(files),
-                os.path.join(
-                    job['output_dir'],
-                    os.path.join(job['output_dir'], job['name'] + '.xml')
-                )
-            )
-            hocr_to_tei_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=filter(
-                        lambda x: x.startswith('ocr_job_-_%i' % (index)),
-                        ocr_jobs
-                    ),
-                    label='hocr_to_tei_job_-_%i' % (index)
-                )
-            )
+        combined_pdf_creation_tasks = []
+        n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
+        for i, job in enumerate(self.jobs):
+            input_dir = job.intermediate_dir
+            output_file = os.path.join(job.output_dir, '{}.pdf'.format(job.name))  # noqa
+            files = filter(lambda x: x.endswith('.pdf'), os.listdir(input_dir))
+            files = natsorted(files)
+            files = map(lambda x: os.path.join(input_dir, x), files)
+            cmd = 'gs'
+            cmd += ' -dBATCH'
+            cmd += ' -dNOPAUSE'
+            cmd += ' -dNumRenderingThreads={}'.format(n_cores)
+            cmd += ' -dPDFSETTINGS=/ebook'
+            cmd += ' -dQUIET'
+            cmd += ' -sDEVICE=pdfwrite'
+            cmd += ' "-sOutputFile={}"'.format(output_file)
+            cmd += ' "{}"'.format('" "'.join(files))
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
+            lbl = 'combined_pdf_creation_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+            combined_pdf_creation_tasks.append(task)

         '''
         ' ##################################################
-        ' # Merge PDF files                                #
+        ' # combined txt creation                          #
         ' ##################################################
         '''
-        pdf_merge_jobs = []
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            files = filter(lambda x: x.endswith('.pdf'), files)
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                files
-            )
-            cmd = 'pdfunite %s "%s"' % (
-                ' '.join(files),
-                os.path.join(
-                    job['output_dir'],
-                    os.path.join(job['output_dir'], job['name'] + '.pdf')
-                )
-            )
-            pdf_merge_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=filter(
-                        lambda x: x.startswith('ocr_job_-_%i' % (index)),
-                        ocr_jobs
-                    ),
-                    label='pdf_merge_job_-_%i' % (index)
-                )
-            )
+        combined_txt_creation_tasks = []
+        for i, job in enumerate(self.jobs):
+            input_dir = job.intermediate_dir
+            output_file = os.path.join(job.output_dir, '{}.txt'.format(job.name))  # noqa
+            files = filter(lambda x: x.endswith('.txt'), os.listdir(input_dir))
+            files = natsorted(files)
+            files = map(lambda x: os.path.join(input_dir, x), files)
+            cmd = 'cat "{}" > "{}"'.format('" "'.join(files), output_file)
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
+            lbl = 'combined_txt_creation_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            combined_txt_creation_tasks.append(task)

         '''
         ' ##################################################
-        ' # Merge text files                               #
+        ' # tei p5 creation                                #
         ' ##################################################
         '''
-        txt_merge_jobs = []
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            files = filter(lambda x: x.endswith('.txt'), files)
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                files
-            )
-            cmd = 'cat %s > "%s"' % (
-                ' '.join(files),
-                os.path.join(
-                    job['output_dir'],
-                    os.path.join(job['output_dir'], job['name'] + '.txt')
-                )
-            )
-            txt_merge_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=filter(
-                        lambda x: x.startswith('ocr_job_-_%i' % (index)),
-                        ocr_jobs
-                    ),
-                    label='txt_merge_job_-_%i' % (index)
-                )
-            )
-
-        if self.zip:
-            all_zip_jobs = []
-            all_zip_job_dependencies = (hocr_to_tei_jobs
-                                        + pdf_merge_jobs
-                                        + txt_merge_jobs)
-            cmd = 'cd "%s" && zip "%s"-all-ocr-files.zip */*.{pdf,txt,xml} -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            all_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=all_zip_job_dependencies,
-                    label='all_zip_job'
-                )
-            )
-            pdf_zip_jobs = []
-            pdf_zip_job_dependencies = all_zip_jobs
-            cmd = 'cd "%s" && zip -m "%s"-ocr-pdf.zip */*.pdf -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            pdf_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=pdf_zip_job_dependencies,
-                    label='pdf_zip_job'
-                )
-            )
-            txt_zip_jobs = []
-            txt_zip_job_dependencies = all_zip_jobs
-            cmd = 'cd "%s" && zip -m "%s"-ocr-txt.zip */*.txt -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            txt_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=txt_zip_job_dependencies,
-                    label='txt_zip_job'
-                )
-            )
-            xml_zip_jobs = []
-            xml_zip_job_dependencies = all_zip_jobs
-            cmd = 'cd "%s" && zip -m "%s"-ocr-xml.zip */*.xml -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            xml_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=xml_zip_job_dependencies,
-                    label='xml_zip_job'
-                )
-            )
+        tei_p5_creation_tasks = []
+        for i, job in enumerate(self.jobs):
+            input_dir = job.intermediate_dir
+            output_file = os.path.join(job.output_dir, '{}.xml'.format(job.name))  # noqa
+            files = filter(lambda x: x.endswith('.hocr'),
+                           os.listdir(input_dir))
+            files = natsorted(files)
+            files = map(lambda x: os.path.join(input_dir, x), files)
+            cmd = 'hocrtotei "{}" "{}"'.format('" "'.join(files),
+                                               output_file)
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
+            lbl = 'tei_p5_creation_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            tei_p5_creation_tasks.append(task)

         '''
         ' ##################################################
-        ' # Cleanup                                        #
+        ' # poco bundle creation                           #
         ' ##################################################
         '''
-        cleanup_jobs = []
-        if self.keep_intermediates:
-            for index, job in enumerate(self.jobs):
-                cleanup_job_dependencies = [
-                    'hocr_to_tei_job_-_%i' % (index),
-                    'pdf_merge_job_-_%i' % (index),
-                    'txt_merge_job_-_%i' % (index)
-                ]
-                cmd = 'mv "%s"/*.hocr "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'hocr'),
-                )
-                cmd += ' && mv "%s"/*.pdf "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'pdf'),
-                )
-                cmd += ' && mv "%s"/*.tif "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'tiff'),
-                )
-                cmd += ' && mv "%s"/*.txt "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'txt'),
-                )
-                if not self.skip_binarisation:
-                    cmd += ' && mv "%s"/*.bin.png "%s"' % (
-                        os.path.join(job['output_dir'], 'tmp'),
-                        os.path.join(job['output_dir'], 'tmp', 'bin.png'),
-                    )
-                    cmd += ' && rm "%s"/*.nrm.png' % (
-                        os.path.join(job['output_dir'], 'tmp')
-                    )
-                cleanup_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies=cleanup_job_dependencies,
-                        label='cleanup_job_-_%i' % (index)
-                    )
-                )
-        else:
-            for index, job in enumerate(self.jobs):
-                cleanup_job_dependencies = [
-                    'hocr_to_tei_job_-_%i' % (index),
-                    'pdf_merge_job_-_%i' % (index),
-                    'txt_merge_job_-_%i' % (index)
-                ]
-                cmd = 'rm -r "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp')
-                )
-                cleanup_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies=cleanup_job_dependencies,
-                        label='cleanup_job_-_%i' % (index)
-                    )
-                )
+        poco_bundle_creation_tasks = []
+        for i, job in enumerate(self.jobs):
+            input_dir = job.intermediate_dir
+            output_dir = os.path.join(job.output_dir, 'poco')
+            files = os.listdir(input_dir)
+            if self.binarize:
+                files = filter(lambda x: x.endswith(('.bin.png', '.hocr')), files)  # noqa
+            else:
+                files = filter(lambda x: x.endswith(('.tif', '.hocr')), files)
+            files = natsorted(files)
+            files = map(lambda x: os.path.join(input_dir, x), files)
+            cmd = 'mv "{}" "{}"'.format('" "'.join(files), output_dir)
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
+            deps.append('tei_p5_creation_-_{}'.format(i))
+            lbl = 'poco_bundle_creation_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            poco_bundle_creation_tasks.append(task)
+
+        '''
+        ' The following jobs are created based of the output files of the
+        ' combined_pdf_creation_tasks. So wait until they are finished.
+        '''
+        self.waitForTasks()
+
+        '''
+        ' ##################################################
+        ' # cleanup                                        #
+        ' ##################################################
+        '''
+        cleanup_tasks = []
+        for i, job in enumerate(self.jobs):
+            input_dir = job.intermediate_dir
+            cmd = 'rm -r "{}"'.format(input_dir)
+            deps = ['combined_pdf_creation_-_{}'.format(i),
+                    'combined_txt_creation_-_{}'.format(i),
+                    'poco_bundle_creation_-_{}'.format(i),
+                    'tei_p5_creation_-_{}'.format(i)]
+            lbl = 'job_cleanup_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            cleanup_tasks.append(task)
+        input_dir = self.intermediate_dir
+        cmd = 'rm -r "{}"'.format(input_dir)
+        deps = cleanup_tasks
+        lbl = 'pipeline_cleanup'
+        task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+        cleanup_tasks.append(task)
+
+        self.waitForTasks()
+
+        '''
+        ' ##################################################
+        ' # zip creation                                   #
+        ' ##################################################
+        '''
+        zip_creation_tasks = []
+        if self.zip is not None:
+            # zip all files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.all.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = combined_pdf_creation_tasks + combined_txt_creation_tasks + poco_bundle_creation_tasks  # noqa
+            lbl = 'zip_creation_-_all'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip PDF files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.pdf.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.pdf"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = combined_pdf_creation_tasks
+            lbl = 'zip_creation_-_pdf'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip TXT files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.txt.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.txt"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = combined_txt_creation_tasks
+            lbl = 'zip_creation_-_txt'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip XML files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.xml.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.xml"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = tei_p5_creation_tasks
+            lbl = 'zip_creation_-_xml'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip PoCo bundles
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.poco.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = poco_bundle_creation_tasks
+            lbl = 'zip_creation_-_poco'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)


-def analyze_jobs(input_dir, output_dir):
+def collect_jobs(input_dir, output_dir, intermediate_dir):
     jobs = []
     for file in os.listdir(input_dir):
         if os.path.isdir(os.path.join(input_dir, file)):
-            jobs += analyze_jobs(
-                os.path.join(input_dir, file),
-                os.path.join(output_dir, file)
-            )
-        elif file.endswith(('.pdf', '.tif', '.tiff')):
-            jobs.append(
-                {
-                    'filename': file,
-                    'name': file.rsplit('.', 1)[0],
-                    'output_dir': os.path.join(output_dir, file),
-                    'path': os.path.join(input_dir, file)
-                }
-            )
+            jobs += collect_jobs(os.path.join(input_dir, file),
+                                 os.path.join(output_dir, file),
+                                 os.path.join(intermediate_dir, file))
+        elif file.lower().endswith('.pdf'):
+            job = OCRPipelineJob(os.path.join(input_dir, file),
+                                 os.path.join(output_dir, file),
+                                 os.path.join(intermediate_dir, file))
+            jobs.append(job)
     return jobs


 def main():
-    args = parse_arguments()
-
-    wflow = OCRWorkflow(args)
-
-    retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores)
-
+    args = parse_args()
+    ocr_pipeline = OCRPipeline(args.input_directory, args.language,
+                               args.output_directory, args.binarize,
+                               args.intermediate_directory, args.n_cores,
+                               args.zip)
+    retval = ocr_pipeline.run(
+        dataDirRoot=(args.log_dir or args.output_directory),
+        nCores=args.n_cores
+    )
     sys.exit(retval)
```
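Inside the container the pipeline is just a CLI driving this pyFlow task graph, so the new argument names can be exercised directly. A sketch of a full invocation, with illustrative paths and a hypothetical archive prefix:

```bash
# /input and /output are the container mount points from the README examples;
# "my-results" becomes the prefix of the .all/.pdf/.txt/.xml/.poco zip files.
ocr -i /input -o /output -l eng --binarize --zip my-results --n-cores 4
```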

wrapper/ocr

```diff
@@ -1,39 +1,43 @@
 #!/usr/bin/env python3
 # coding=utf-8

-import argparse
+"""A wrapper to execute the OCR pipeline in a Docker container"""
+
+from argparse import ArgumentParser
 import os
 import subprocess

-container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest'
-container_input_dir = '/input'
-container_output_dir = '/output'
-uid = str(os.getuid())
-gid = str(os.getgid())
+CONTAINER_IMAGE_TAG = '1.0.0'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:{}'.format(CONTAINER_IMAGE_TAG)  # noqa
+CONTAINER_INPUT_DIR = '/input'
+CONTAINER_INTERMEDIATE_DIR = '/intermediate'
+CONTAINER_OUTPUT_DIR = '/output'
+UID = str(os.getuid())
+GID = str(os.getgid())

-parser = argparse.ArgumentParser(add_help=False)
-parser.add_argument(
-    '-i',
-    dest='input_dir',
-    required=False
-)
-parser.add_argument(
-    '-o',
-    dest='output_dir',
-    required=False
-)
+parser = ArgumentParser(add_help=False)
+parser.add_argument('-i', '--input-directory')
+parser.add_argument('-o', '--output-directory')
+parser.add_argument('--intermediate-directory')
 args, remaining_args = parser.parse_known_args()

-cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid]
-if args.input_dir is not None:
-    host_input_dir = os.path.abspath(args.input_dir)
-    cmd += ['-v', host_input_dir + ':' + container_input_dir]
-    remaining_args += ['-i', container_input_dir]
-if args.output_dir is not None:
-    host_output_dir = os.path.abspath(args.output_dir)
-    cmd += ['-v', host_output_dir + ':' + container_output_dir]
-    remaining_args += ['-o', container_output_dir]
-cmd.append(container_image)
+cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
+if args.intermediate_directory is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.intermediate_directory),
+                                 CONTAINER_INTERMEDIATE_DIR)]
+    remaining_args.insert(0, CONTAINER_INTERMEDIATE_DIR)
+    remaining_args.insert(0, '--intermediate-directory')
+if args.output_directory is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
+                                 CONTAINER_OUTPUT_DIR)]
+    remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
+    remaining_args.insert(0, '-o')
+if args.input_directory is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory),
+                                 CONTAINER_INPUT_DIR)]
+    remaining_args.insert(0, CONTAINER_INPUT_DIR)
+    remaining_args.insert(0, '-i')
+cmd.append(CONTAINER_IMAGE)
 cmd += remaining_args

 subprocess.run(cmd)
```
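With the wrapper on `${PATH}`, a typical call mounts the host directories it recognizes and forwards every other flag to the pipeline inside the container. Directory names below are illustrative:

```bash
# -i/-o/--intermediate-directory become bind mounts; -l and --binarize are
# passed through unchanged to the ocr entrypoint in the container.
ocr -i ./input -o ./output --intermediate-directory ./tmp -l deu --binarize
```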