Add the option to use an intermediate directory

Patrick Jentsch 2020-09-22 17:44:32 +02:00
parent 6d90d43699
commit ac4b5c2fd8
3 changed files with 184 additions and 156 deletions

Dockerfile

@@ -76,8 +76,10 @@ RUN chmod 644 /usr/local/share/tessdata/*.traineddata
 ## Install Pipeline ##
 RUN apt-get install -y --no-install-recommends \
     ghostscript \
+    python-pip \
     python3.7 \
-    zip
+    zip \
+ && pip install natsort
 COPY "hocrtotei" "ocr" "/usr/local/bin/"

ocr

@@ -12,11 +12,12 @@ Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
 from argparse import ArgumentParser
+from natsort import natsorted
 from pyflow import WorkflowRunner
 import multiprocessing
 import os
-import re
 import sys
+import tempfile

 TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por',
@@ -42,6 +43,7 @@ def parse_args():
                         default=min(4, multiprocessing.cpu_count()),
                         help='Total number of cores available.',
                         type=int)
+    parser.add_argument('--intermediate-directory')
     parser.add_argument('--zip',
                         help='Zips all results in different archives depending'
                              ' on result types. Also zips everything into one '
@@ -50,20 +52,49 @@ def parse_args():
 class OCRPipelineJob:
-    def __init__(self, file, output_dir):
+    """An OCR pipeline job class
+
+    Each input file of the pipeline is represented as an OCR pipeline job,
+    which holds all necessary information for the pipeline to process it.
+
+    Arguments:
+    file -- Path to the file
+    output_dir -- Path to a directory where job results are stored
+    intermediate_dir -- Path to a directory where intermediate files are
+                        stored
+    """
+
+    def __init__(self, file, output_dir, intermediate_dir):
         self.file = file
+        self.intermediate_dir = intermediate_dir
         self.name = os.path.basename(file).rsplit('.', 1)[0]
         self.output_dir = output_dir


 class OCRPipeline(WorkflowRunner):
-    def __init__(self, binarize, jobs, lang, n_cores, output_dir, zip):
-        self.binarize = binarize
-        self.jobs = jobs
+    def __init__(self, input_dir, lang, output_dir, binarize, intermediate_dir,
+                 n_cores, zip):
+        self.input_dir = input_dir
         self.lang = lang
-        self.n_cores = n_cores
         self.output_dir = output_dir
-        self.zip = zip
+        self.binarize = binarize
+        if intermediate_dir is None:
+            self.intermediate_dir = os.path.join(output_dir, 'tmp')
+        else:
+            self.intermediate_dir = tempfile.mkdtemp(dir=intermediate_dir)
+        self.n_cores = n_cores
+        if zip is None:
+            self.zip = zip
+        else:
+            if zip.lower().endswith('.zip'):
+                # Remove .zip file extension if provided
+                self.zip = zip[:-4]
+                self.zip = self.zip if self.zip else 'output'
+            else:
+                self.zip = zip
+        self.jobs = collect_jobs(self.input_dir,
+                                 self.output_dir,
+                                 self.intermediate_dir)

     def workflow(self):
         if not self.jobs:
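
Two pieces of the new __init__ logic are worth spelling out: the choice of scratch directory and the zip base-name normalization that used to live in workflow(). A minimal sketch with hypothetical paths (the system temp dir stands in for --intermediate-directory so the snippet actually runs):

    import os
    import tempfile

    output_dir = '/output'  # hypothetical job output directory

    # Without --intermediate-directory, scratch files land in <output_dir>/tmp:
    print(os.path.join(output_dir, 'tmp'))  # -> /output/tmp

    # With it, mkdtemp creates a uniquely named subdirectory inside the given
    # parent, so two pipeline runs sharing one intermediate dir cannot collide.
    # (Using the system temp dir here so the sketch is runnable.)
    scratch = tempfile.mkdtemp(dir=tempfile.gettempdir())
    print(scratch)  # e.g. /tmp/tmpa1b2c3d4

    # The zip base name is normalized once in __init__: strip a trailing
    # '.zip' and fall back to 'output' if nothing remains.
    for zip_arg in ('results.zip', '.zip', 'results'):
        name = zip_arg[:-4] if zip_arg.lower().endswith('.zip') else zip_arg
        print(name or 'output')  # -> results, output, results
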
@@ -74,26 +105,26 @@ class OCRPipeline(WorkflowRunner):
         ' # setup output directory #
         ' ##################################################
         '''
-        setup_output_directory_jobs = []
+        setup_output_directory_tasks = []
         for i, job in enumerate(self.jobs):
-            intermediate_dir = os.path.join(job.output_dir, 'tmp')
             cmd = 'mkdir'
             cmd += ' -p'
-            cmd += ' "{}"'.format(intermediate_dir)
+            cmd += ' "{}"'.format(job.intermediate_dir)
             cmd += ' "{}"'.format(os.path.join(job.output_dir, 'poco'))
             lbl = 'setup_output_directory_-_{}'.format(i)
-            setup_output_directory_jobs.append(self.addTask(command=cmd,
-                                                            label=lbl))
+            task = self.addTask(command=cmd, label=lbl)
+            setup_output_directory_tasks.append(task)

         '''
         ' ##################################################
         ' # split input #
         ' ##################################################
         '''
-        split_input_jobs = []
+        split_input_tasks = []
         n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
         for i, job in enumerate(self.jobs):
-            output_dir = os.path.join(job.output_dir, 'tmp')
+            input_file = job.file
+            output_file = '{}/page-%d.tif'.format(job.intermediate_dir)
             cmd = 'gs'
             cmd += ' -dBATCH'
             cmd += ' -dNOPAUSE'
@@ -102,19 +133,17 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' -r300'
             cmd += ' -sDEVICE=tiff24nc'
             cmd += ' -sCompression=lzw'
-            cmd += ' "-sOutputFile={}/page-%d.tif"'.format(output_dir)
-            cmd += ' "{}"'.format(job.file)
+            cmd += ' "-sOutputFile={}"'.format(output_file)
+            cmd += ' "{}"'.format(input_file)
             deps = 'setup_output_directory_-_{}'.format(i)
             lbl = 'split_input_-_{}'.format(i)
-            split_input_jobs.append(self.addTask(command=cmd,
-                                                 dependencies=deps,
-                                                 label=lbl,
-                                                 nCores=n_cores))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+            split_input_tasks.append(task)

         if self.binarize:
             '''
-            ' The binarization_jobs list is created based on the output files
-            ' of the split_jobs. So wait until they are finished.
+            ' The binarization_tasks list is created based on the output files
+            ' of the split_tasks. So wait until they are finished.
             '''
             self.waitForTasks()
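
For one hypothetical job, this loop assembles a single Ghostscript call that bursts the input PDF into per-page TIFFs. A sketch of the resulting command string (paths invented; any gs flags elided by the hunk boundary above are omitted here too):

    intermediate_dir = '/intermediate/tmpa1b2c3d4/doc.pdf'  # hypothetical job.intermediate_dir
    output_file = '{}/page-%d.tif'.format(intermediate_dir)
    cmd = 'gs'
    cmd += ' -dBATCH'
    cmd += ' -dNOPAUSE'
    cmd += ' -r300'              # rasterize at 300 dpi
    cmd += ' -sDEVICE=tiff24nc'  # 24-bit color TIFF, one file per page
    cmd += ' -sCompression=lzw'
    cmd += ' "-sOutputFile={}"'.format(output_file)  # %d -> page-1.tif, page-2.tif, ...
    cmd += ' "{}"'.format('/input/doc.pdf')
    print(cmd)
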
@@ -123,7 +152,7 @@ class OCRPipeline(WorkflowRunner):
             ' # binarization #
             ' ##################################################
             '''
-            binarization_jobs = []
+            binarization_tasks = []
             '''
             ' We run ocropus-nlbin with either four or, if there are less than
             ' four cores available for this workflow, the available core
@@ -131,27 +160,21 @@ class OCRPipeline(WorkflowRunner):
             '''
             n_cores = min(4, self.n_cores)
             for i, job in enumerate(self.jobs):
-                input_dir = os.path.join(job.output_dir, 'tmp')
-                output_dir = input_dir
-                files = filter(lambda x: x.endswith('.tif'),
-                               os.listdir(input_dir))
-                files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
+                input_dir = job.intermediate_dir
+                output_dir = job.intermediate_dir
+                files = filter(lambda x: x.endswith('.tif'), os.listdir(input_dir))  # noqa
+                files = natsorted(files)
                 files = map(lambda x: os.path.join(input_dir, x), files)
                 cmd = 'ocropus-nlbin "{}"'.format('" "'.join(files))
                 cmd += ' --nocheck'
                 cmd += ' --output "{}"'.format(output_dir)
                 cmd += ' --parallel "{}"'.format(n_cores)
-                print(cmd)
                 deps = 'split_input_-_{}'.format(i)
                 lbl = 'binarization_-_{}'.format(i)
-                binarization_jobs.append(self.addTask(command=cmd,
-                                                      dependencies=deps,
-                                                      label=lbl,
-                                                      nCores=n_cores))
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+                binarization_tasks.append(task)

-            '''
-            ' The post_binarization_jobs are created based on the output files
-            ' of the binarization_jobs. So wait until they are finished.
-            '''
             self.waitForTasks()
             '''
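
Replacing the ad-hoc regex sort with natsorted keeps pages in reading order and also accepts filter objects directly, which have no .sort() method under Python 3. A quick illustration:

    from natsort import natsorted  # installed via 'pip install natsort' above

    pages = ['page-10.tif', 'page-2.tif', 'page-1.tif']
    print(sorted(pages))     # ['page-1.tif', 'page-10.tif', 'page-2.tif'] - lexicographic
    print(natsorted(pages))  # ['page-1.tif', 'page-2.tif', 'page-10.tif'] - numeric

    # natsorted consumes any iterable and returns a new list, whereas the
    # removed files.sort(...) required files to already be a list.
    files = filter(lambda x: x.endswith('.tif'), pages)
    print(natsorted(files))  # ['page-1.tif', 'page-2.tif', 'page-10.tif']
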
@@ -160,10 +183,9 @@ class OCRPipeline(WorkflowRunner):
             ' ##################################################
             '''
             for i, job in enumerate(self.jobs):
-                input_dir = os.path.join(job.output_dir, 'tmp')
-                output_dir = input_dir
-                files = filter(lambda x: x.endswith('.bin.png'),
-                               os.listdir(input_dir))
+                input_dir = job.intermediate_dir
+                output_dir = job.intermediate_dir
+                files = filter(lambda x: x.endswith('.bin.png'), os.listdir(input_dir))  # noqa
                 for file in files:
                     # int conversion is done in order to trim leading zeros
                     page_number = int(file.split('.', 1)[0])
@@ -172,8 +194,8 @@ class OCRPipeline(WorkflowRunner):
                               os.path.join(output_dir, output_file))

         '''
-        ' The ocr_jobs are created based on the output files of either the
-        ' split_jobs or post_binarization_jobs. So wait until they are
+        ' The ocr_tasks are created based on the output files of either the
+        ' split_tasks or binarization_tasks. So wait until they are
         ' finished.
         '''
         self.waitForTasks()
@@ -183,7 +205,7 @@ class OCRPipeline(WorkflowRunner):
         ' # ocr #
         ' ##################################################
         '''
-        ocr_jobs = []
+        ocr_tasks = []
         '''
         ' Tesseract runs fastest with four cores. So we run it with either four
         ' or, if there are less than four cores available for this workflow,
@@ -191,33 +213,34 @@ class OCRPipeline(WorkflowRunner):
         '''
         n_cores = min(4, self.n_cores)
         for i, job in enumerate(self.jobs):
-            input_dir = os.path.join(job.output_dir, 'tmp')
-            output_dir = input_dir
-            files = filter(lambda x: x.endswith('.bin.png' if self.binarize else '.tif'), os.listdir(input_dir))  # noqa
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
+            input_dir = job.intermediate_dir
+            output_dir = job.intermediate_dir
+            files = os.listdir(input_dir)
+            if self.binarize:
+                deps = 'binarization_-_{}'.format(i)
+                files = filter(lambda x: x.endswith('.bin.png'), files)
+            else:
+                deps = 'split_input_-_{}'.format(i)
+                files = filter(lambda x: x.endswith('.tif'), files)
+            files = natsorted(files)
             files = map(lambda x: os.path.join(input_dir, x), files)
-            number = 0
-            for file in files:
-                output_file_base = os.path.join(output_dir, file.rsplit('.', 2 if self.binarize else 1)[0])  # noqa
+            for j, file in enumerate(files):
+                if self.binarize:
+                    output_file_base = os.path.join(output_dir, file.rsplit('.', 2)[0])  # noqa
+                else:
+                    output_file_base = os.path.join(output_dir, file.rsplit('.', 1)[0])  # noqa
                 cmd = 'tesseract "{}" "{}"'.format(file, output_file_base)
                 cmd += ' -l "{}"'.format(self.lang)
                 cmd += ' hocr pdf txt'
                 cmd += ' && '
                 cmd += 'sed -i \'s+{}/++g\' "{}".hocr'.format(input_dir, output_file_base)  # noqa
-                if self.binarize:
-                    deps = 'binarization_-_{}'.format(i)
-                else:
-                    deps = 'split_input_-_{}'.format(i)
-                label = 'ocr_-_{}-{}'.format(i, number)
-                ocr_jobs.append(self.addTask(command=cmd,
-                                             dependencies=deps,
-                                             label=label,
-                                             nCores=n_cores))
-                number += 1
+                lbl = 'ocr_-_{}-{}'.format(i, j)
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+                ocr_tasks.append(task)

         '''
         ' The following jobs are created based on the output files of the
-        ' ocr_jobs. So wait until they are finished.
+        ' ocr_tasks. So wait until they are finished.
         '''
         self.waitForTasks()
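
The split into two rsplit branches is about the double extension that binarized pages carry, so that tesseract writes page-1.hocr/.pdf/.txt rather than page-1.bin.hocr. A quick check with invented file names:

    # Binarized pages have a two-part extension, so two suffixes are stripped:
    print('page-1.bin.png'.rsplit('.', 2)[0])  # page-1
    # Plain TIFF pages only need one:
    print('page-1.tif'.rsplit('.', 1)[0])      # page-1
    # A single-level rsplit on a binarized name would leave '.bin' behind:
    print('page-1.bin.png'.rsplit('.', 1)[0])  # page-1.bin
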
@@ -226,16 +249,14 @@ class OCRPipeline(WorkflowRunner):
         ' # combined pdf creation #
         ' ##################################################
         '''
-        combined_pdf_creation_jobs = []
+        combined_pdf_creation_tasks = []
         n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
         for i, job in enumerate(self.jobs):
-            input_dir = os.path.join(job.output_dir, 'tmp')
-            output_dir = job.output_dir
-            files = filter(lambda x: x.endswith('.pdf'),
-                           os.listdir(input_dir))
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
+            input_dir = job.intermediate_dir
+            output_file = os.path.join(job.output_dir, '{}.pdf'.format(job.name))  # noqa
+            files = filter(lambda x: x.endswith('.pdf'), os.listdir(input_dir))
+            files = natsorted(files)
             files = map(lambda x: os.path.join(input_dir, x), files)
-            output_file = os.path.join(output_dir, '{}.pdf'.format(job.name))
             cmd = 'gs'
             cmd += ' -dBATCH'
             cmd += ' -dNOPAUSE'
@@ -245,77 +266,75 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' -sDEVICE=pdfwrite'
             cmd += ' "-sOutputFile={}"'.format(output_file)
             cmd += ' "{}"'.format('" "'.join(files))
-            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)),
-                          ocr_jobs)
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
             lbl = 'combined_pdf_creation_-_{}'.format(i)
-            combined_pdf_creation_jobs.append(self.addTask(command=cmd,
-                                                           dependencies=deps,
-                                                           label=lbl,
-                                                           nCores=n_cores))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+            combined_pdf_creation_tasks.append(task)
         '''
         ' ##################################################
         ' # combined txt creation #
         ' ##################################################
         '''
-        combined_txt_creation_jobs = []
+        combined_txt_creation_tasks = []
         for i, job in enumerate(self.jobs):
-            input_dir = os.path.join(job.output_dir, 'tmp')
-            files = filter(lambda x: x.endswith('.txt'), os.listdir(input_dir))
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(lambda x: os.path.join(input_dir, x), files)
+            input_dir = job.intermediate_dir
             output_file = os.path.join(job.output_dir, '{}.txt'.format(job.name))  # noqa
+            files = filter(lambda x: x.endswith('.txt'), os.listdir(input_dir))
+            files = natsorted(files)
+            files = map(lambda x: os.path.join(input_dir, x), files)
             cmd = 'cat "{}" > "{}"'.format('" "'.join(files), output_file)
-            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)),
-                          ocr_jobs)
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
             lbl = 'combined_txt_creation_-_{}'.format(i)
-            combined_txt_creation_jobs.append(self.addTask(command=cmd,
-                                                           dependencies=deps,
-                                                           label=lbl))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            combined_txt_creation_tasks.append(task)
         '''
         ' ##################################################
         ' # tei p5 creation #
         ' ##################################################
         '''
-        tei_p5_creation_jobs = []
+        tei_p5_creation_tasks = []
         for i, job in enumerate(self.jobs):
-            input_dir = os.path.join(job.output_dir, 'tmp')
-            files = filter(lambda x: x.endswith('.hocr'), os.listdir(input_dir))  # noqa
-            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(lambda x: os.path.join(input_dir, x), files)
+            input_dir = job.intermediate_dir
             output_file = os.path.join(job.output_dir, '{}.xml'.format(job.name))  # noqa
-            cmd = 'hocrtotei "{}" "{}"'.format('" "'.join(files), output_file)
-            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)),
-                          ocr_jobs)
+            files = filter(lambda x: x.endswith('.hocr'),
+                           os.listdir(input_dir))
+            files = natsorted(files)
+            files = map(lambda x: os.path.join(input_dir, x), files)
+            cmd = 'hocrtotei "{}" "{}"'.format('" "'.join(files),
+                                               output_file)
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
             lbl = 'tei_p5_creation_-_{}'.format(i)
-            tei_p5_creation_jobs.append(self.addTask(command=cmd,
-                                                     dependencies=deps,
-                                                     label=lbl))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            tei_p5_creation_tasks.append(task)
         '''
         ' ##################################################
         ' # poco bundle creation #
         ' ##################################################
         '''
-        poco_bundle_creation_jobs = []
+        poco_bundle_creation_tasks = []
         for i, job in enumerate(self.jobs):
-            input_dir = os.path.join(job.output_dir, 'tmp')
+            input_dir = job.intermediate_dir
             output_dir = os.path.join(job.output_dir, 'poco')
-            cmd = 'mv "{}"/*.hocr "{}"'.format(input_dir, output_dir)
-            cmd += ' && '
-            cmd += 'mv "{}"/*.{} "{}"'.format(input_dir, 'bin.png' if self.binarize else 'tif', output_dir)  # noqa
-            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)),
-                          ocr_jobs)
+            files = os.listdir(input_dir)
+            if self.binarize:
+                files = filter(lambda x: x.endswith(('.bin.png', '.hocr')), files)  # noqa
+            else:
+                files = filter(lambda x: x.endswith(('.tif', '.hocr')), files)
+            files = natsorted(files)
+            files = map(lambda x: os.path.join(input_dir, x), files)
+            cmd = 'mv "{}" "{}"'.format('" "'.join(files), output_dir)
+            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
             deps.append('tei_p5_creation_-_{}'.format(i))
             lbl = 'poco_bundle_creation_-_{}'.format(i)
-            poco_bundle_creation_jobs.append(self.addTask(command=cmd,
-                                                          dependencies=deps,
-                                                          label=lbl))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            poco_bundle_creation_tasks.append(task)
         '''
         ' The following jobs are created based on the output files of the
-        ' combined_pdf_creation_jobs. So wait until they are finished.
+        ' combined_pdf_creation_tasks. So wait until they are finished.
         '''
         self.waitForTasks()
@@ -324,126 +343,126 @@ class OCRPipeline(WorkflowRunner):
         ' # cleanup #
         ' ##################################################
         '''
-        cleanup_jobs = []
+        cleanup_tasks = []
         for i, job in enumerate(self.jobs):
-            input_dir = os.path.join(job.output_dir, 'tmp')
+            input_dir = job.intermediate_dir
             cmd = 'rm -r "{}"'.format(input_dir)
-            deps = ['combined_pdf_creation_-_{}'.format(i)]
-            deps.append('combined_txt_creation_-_{}'.format(i))
-            deps.append('poco_bundle_creation_-_{}'.format(i))
-            deps.append('tei_p5_creation_-_{}'.format(i))
-            lbl = 'cleanup_-_{}'.format(i)
-            cleanup_jobs.append(self.addTask(command=cmd,
-                                             dependencies=deps,
-                                             label=lbl))
+            deps = ['combined_pdf_creation_-_{}'.format(i),
+                    'combined_txt_creation_-_{}'.format(i),
+                    'poco_bundle_creation_-_{}'.format(i),
+                    'tei_p5_creation_-_{}'.format(i)]
+            lbl = 'job_cleanup_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            cleanup_tasks.append(task)
+
+        input_dir = self.intermediate_dir
+        cmd = 'rm -r "{}"'.format(input_dir)
+        deps = cleanup_tasks
+        lbl = 'pipeline_cleanup'
+        task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+        cleanup_tasks.append(task)
+
+        self.waitForTasks()
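
Cleanup now happens in two stages: each job removes its own scratch directory, and a final pipeline_cleanup task, which depends on all of the per-job cleanups, removes the shared parent created by mkdtemp (or <output_dir>/tmp). With a hypothetical run, the directories being deleted look like:

    /intermediate/tmpa1b2c3d4/    <- removed last by 'pipeline_cleanup'
        a.pdf/                    <- removed by 'job_cleanup_-_0'
        b.pdf/                    <- removed by 'job_cleanup_-_1'
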
         '''
         ' ##################################################
         ' # zip creation #
         ' ##################################################
         '''
-        zip_creation_jobs = []
+        zip_creation_tasks = []
         if self.zip is not None:
-            # Remove .zip file extension if provided
-            if self.zip.endswith('.zip'):
-                self.zip = self.zip[:-4]
-                self.zip = self.zip if self.zip else 'output'
             # zip all files
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
             cmd += ' -r'
-            cmd += ' "{}".all.zip .'.format(self.zip)
+            cmd += ' "{}.all.zip" .'.format(self.zip)
             cmd += ' -x "pyflow.data*" "*tmp*"'
             cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
             cmd += ' && '
             cmd += 'cd -'
-            deps = combined_pdf_creation_jobs
-            deps += combined_txt_creation_jobs
-            deps += poco_bundle_creation_jobs
+            deps = combined_pdf_creation_tasks + combined_txt_creation_tasks + poco_bundle_creation_tasks  # noqa
             lbl = 'zip_creation_-_all'
-            zip_creation_jobs.append(self.addTask(command=cmd,
-                                                  dependencies=deps,
-                                                  label=lbl))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
             # zip PDF files
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
             cmd += ' -r'
-            cmd += ' "{}".pdf.zip .'.format(self.zip)
+            cmd += ' "{}.pdf.zip" .'.format(self.zip)
             cmd += ' -x "pyflow.data*" "*tmp*"'
             cmd += ' -i "*.pdf"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = combined_pdf_creation_jobs
+            deps = combined_pdf_creation_tasks
             lbl = 'zip_creation_-_pdf'
-            zip_creation_jobs.append(self.addTask(command=cmd,
-                                                  dependencies=deps,
-                                                  label=lbl))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
             # zip TXT files
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
             cmd += ' -r'
-            cmd += ' "{}".txt.zip .'.format(self.zip)
+            cmd += ' "{}.txt.zip" .'.format(self.zip)
             cmd += ' -x "pyflow.data*" "*tmp*"'
             cmd += ' -i "*.txt"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = combined_txt_creation_jobs
+            deps = combined_txt_creation_tasks
             lbl = 'zip_creation_-_txt'
-            zip_creation_jobs.append(self.addTask(command=cmd,
-                                                  dependencies=deps,
-                                                  label=lbl))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
             # zip XML files
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
             cmd += ' -r'
-            cmd += ' "{}".xml.zip .'.format(self.zip)
+            cmd += ' "{}.xml.zip" .'.format(self.zip)
             cmd += ' -x "pyflow.data*" "*tmp*"'
             cmd += ' -i "*.xml"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = tei_p5_creation_jobs
+            deps = tei_p5_creation_tasks
             lbl = 'zip_creation_-_xml'
-            zip_creation_jobs.append(self.addTask(command=cmd,
-                                                  dependencies=deps,
-                                                  label=lbl))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
             # zip PoCo bundles
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
             cmd += ' -r'
-            cmd += ' "{}".poco.zip .'.format(self.zip)
+            cmd += ' "{}.poco.zip" .'.format(self.zip)
             cmd += ' -x "pyflow.data*" "*tmp*"'
             cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
             cmd += ' && '
             cmd += 'cd -'
-            deps = poco_bundle_creation_jobs
+            deps = poco_bundle_creation_tasks
             lbl = 'zip_creation_-_poco'
-            zip_creation_jobs.append(self.addTask(command=cmd,
-                                                  dependencies=deps,
-                                                  label=lbl))
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
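
Each archive task runs a single shell pipeline of the same shape; fully assembled for a hypothetical run with self.zip == 'results' and binarization enabled, the 'all' variant would look roughly like this (-x keeps pyflow's bookkeeping and leftover tmp directories out of the archive, -i restricts it to the listed result types):

    cd "/output" && zip -r "results.all.zip" . -x "pyflow.data*" "*tmp*" -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.bin.png" && cd -
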
-def collect_jobs(input_dir, output_dir):
+def collect_jobs(input_dir, output_dir, intermediate_dir):
     jobs = []
     for file in os.listdir(input_dir):
         if os.path.isdir(os.path.join(input_dir, file)):
             jobs += collect_jobs(os.path.join(input_dir, file),
                                  os.path.join(output_dir, file))
-        elif file.endswith('.pdf'):
-            jobs.append(OCRPipelineJob(os.path.join(input_dir, file),
-                                       os.path.join(output_dir, file)))
+        elif file.lower().endswith('.pdf'):
+            job = OCRPipelineJob(os.path.join(input_dir, file),
+                                 os.path.join(output_dir, file),
+                                 os.path.join(intermediate_dir, file))
+            jobs.append(job)
     return jobs
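
One caveat: the recursive call for subdirectories still passes two arguments to the now three-parameter collect_jobs, so a nested input directory would raise a TypeError at runtime. The presumably intended call, mirroring the input tree inside the intermediate directory, would be (a sketch, not part of this commit):

    jobs += collect_jobs(os.path.join(input_dir, file),
                         os.path.join(output_dir, file),
                         os.path.join(intermediate_dir, file))
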
 def main():
     args = parse_args()
-    jobs = collect_jobs(args.input_directory, args.output_directory)
-    ocr_pipeline = OCRPipeline(args.binarize, jobs, args.language,
-                               args.n_cores, args.output_directory, args.zip)
+    ocr_pipeline = OCRPipeline(args.input_directory, args.language,
+                               args.output_directory, args.binarize,
+                               args.intermediate_directory, args.n_cores,
+                               args.zip)
     retval = ocr_pipeline.run(
         dataDirRoot=(args.log_dir or args.output_directory),
         nCores=args.n_cores


@@ -7,6 +7,7 @@ import subprocess

 CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest'
 CONTAINER_INPUT_DIR = '/input'
+CONTAINER_INTERMEDIATE_DIR = '/intermediate'
 CONTAINER_OUTPUT_DIR = '/output'
 UID = str(os.getuid())
 GID = str(os.getgid())
@@ -14,9 +15,15 @@ GID = str(os.getgid())
 parser = ArgumentParser(add_help=False)
 parser.add_argument('-i', '--input-directory')
 parser.add_argument('-o', '--output-directory')
+parser.add_argument('--intermediate-directory')
 args, remaining_args = parser.parse_known_args()

 cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
+if args.intermediate_directory is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.intermediate_directory),
+                                 CONTAINER_INTERMEDIATE_DIR)]
+    remaining_args.insert(0, CONTAINER_INTERMEDIATE_DIR)
+    remaining_args.insert(0, '--intermediate-directory')
 if args.output_directory is not None:
     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
                                  CONTAINER_OUTPUT_DIR)]
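
Taken together, the new branch adds one extra bind mount and rewrites the pipeline arguments so that, inside the container, the pipeline sees the container-side path. A sketch with invented host paths and ids (the input mount and the final 'docker run' assembly live in parts of the wrapper outside this diff):

    # Hypothetical call: ./ocr -i ./in -o ./out --intermediate-directory ./scratch
    cmd = ['docker', 'run', '--rm', '-it', '-u', '1000:1000']
    cmd += ['-v', '/home/user/scratch:/intermediate']  # added by this commit
    cmd += ['-v', '/home/user/out:/output']
    # remaining_args handed to the container now begins with:
    #   ['--intermediate-directory', '/intermediate']
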