Update OCR Pipeline

Patrick Jentsch 2020-04-03 17:35:30 +02:00
parent eb5ccf4e21
commit 36a86887b0
3 changed files with 305 additions and 472 deletions

@@ -2,25 +2,15 @@
 # coding=utf-8
 from xml.sax.saxutils import escape
-import argparse
+from argparse import ArgumentParser
 import xml.etree.ElementTree as ET

-parser = argparse.ArgumentParser(
-    description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
-)
-parser.add_argument(
-    'i',
-    metavar='hOCR-sourcefile',
-    nargs='+'
-)
-parser.add_argument(
-    'o',
-    metavar='TEI-destfile',
-)
+parser = ArgumentParser(description='Merges hOCR files to one P5 file.')
+parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+')
+parser.add_argument('o', metavar='TEI-destfile',)
 args = parser.parse_args()

 output_file = open(args.o, 'w')
 output_file.write(
     '<?xml version="1.0" encoding="UTF-8"?>\n'
     + '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
@@ -54,5 +44,4 @@ output_file.write(
     ' </body>\n'
     + ' </text>\n'
     + '</TEI>')
 output_file.close()
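The mechanics behind this script stay the same after the refactoring: each hOCR page file is parsed and its paragraphs are serialized into a single TEI P5 document. Below is a minimal sketch of that idea, not the repository's hocrtotei code; it assumes XML-clean hOCR input, the class name ocr_par comes from the hOCR spec, and the file names are invented for illustration.

    # Sketch: merge hOCR pages into one TEI P5 file (illustrative only).
    from xml.sax.saxutils import escape
    import xml.etree.ElementTree as ET

    XHTML = '{http://www.w3.org/1999/xhtml}'

    def paragraphs(hocr_file):
        # hOCR is XHTML; paragraphs carry the class "ocr_par".
        tree = ET.parse(hocr_file)
        for par in tree.iter(XHTML + 'p'):
            if par.get('class') == 'ocr_par':
                # Collapse the per-word markup into plain paragraph text.
                yield ' '.join(''.join(par.itertext()).split())

    with open('out.xml', 'w') as tei:  # hypothetical output name
        tei.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        tei.write('<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
                  '  <text>\n    <body>\n')
        for hocr in ['page-1.hocr', 'page-2.hocr']:  # hypothetical inputs
            tei.write('      <pb/>\n')
            for par in paragraphs(hocr):
                tei.write('        <p>{}</p>\n'.format(escape(par)))
        tei.write('    </body>\n  </text>\n</TEI>\n')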

ocr

@@ -10,292 +10,206 @@ Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
 """
-import argparse
-from pyflow import WorkflowRunner
+from argparse import ArgumentParser
 import multiprocessing
 import os
 import re
 import sys
+from pyflow import WorkflowRunner

+TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por',
+                    'spa']

-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description='''Performs OCR of (historical) documents utilizing OCRopus
-                       for preprocessing and Tesseract OCR for OCR. The results
-                       are served as hOCR, PDF, raw text and TEI compliant XML
-                       files.\n
-                       Software requirements: imagemagick, ocropus, pdftoppm,
-                       pdfunite, poppler-utils, pyflow, python2.7, python3.5,
-                       tesseract'''
-    )
-    parser.add_argument(
-        '-i',
-        dest='input_dir',
-        required=True
-    )
-    parser.add_argument(
-        '-l',
-        choices=[
-            'deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa'
-        ],
-        dest='lang',
-        required=True
-    )
-    parser.add_argument(
-        '-o',
-        dest='output_dir',
-        required=True
-    )
-    parser.add_argument(
-        '--skip-binarisation',
-        action='store_true',
-        default=False,
-        dest='skip_binarisation',
-        help='skip ocropy binarisation',
-        required=False
-    )
-    parser.add_argument(
-        '--keep-intermediates',
-        action='store_true',
-        default=False,
-        dest='keep_intermediates',
-        help='keep intermediate files',
-        required=False
-    )
-    parser.add_argument(
-        '--nCores',
-        default=min(4, multiprocessing.cpu_count()),
-        dest='n_cores',
-        help='total number of cores available',
-        required=False,
-        type=int
-    )
-    parser.add_argument(
-        '--zip',
-        default='ocr-result-files',
-        dest='zip',
-        type=str,
-        help='package result files in zip bundles and asign an filename prefix',
-        required=False
-    )
+def parse_args():
+    parser = ArgumentParser(description='OCR Pipeline utilizing tesseract.')
+    parser.add_argument('i')
+    parser.add_argument('o')
+    parser.add_argument('-l', '--language', choices=TESSERACT_MODELS,
+                        required=True)
+    parser.add_argument('--binarize', action='store_true',
+                        help='use ocropy binarisation')
+    parser.add_argument('--keep-intermediates', action='store_true',
+                        help='keep intermediate files')
+    parser.add_argument('--n-cores',
+                        default=min(4, multiprocessing.cpu_count()),
+                        help='total number of cores available', type=int)
+    parser.add_argument('--log-dir')
+    parser.add_argument('--zip')
     return parser.parse_args()

-class OCRWorkflow(WorkflowRunner):
-    def __init__(self, args):
-        self.jobs = analyze_jobs(args.input_dir, args.output_dir)
-        self.skip_binarisation = args.skip_binarisation
-        self.keep_intermediates = args.keep_intermediates
-        self.lang = args.lang
-        self.n_cores = args.n_cores
-        self.output_dir = args.output_dir
-        self.zip = args.zip
+class OCRPipelineJob:
+    def __init__(self, file, output_dir):
+        self.file = file
+        self.name = os.path.basename(file).rsplit('.', 1)[0]
+        self.output_dir = output_dir

+class OCRPipeline(WorkflowRunner):
+    def __init__(self, binarize, jobs, keep_intermediates, lang, n_cores,
+                 output_dir, zip):
+        self.binarize = binarize
+        self.jobs = jobs
+        self.keep_intermediates = keep_intermediates
+        self.lang = lang
+        self.n_cores = n_cores
+        self.output_dir = output_dir
+        self.zip = zip

     def workflow(self):
-        if len(self.jobs) == 0:
+        if not self.jobs:
             return

         '''
         ' ##################################################
-        ' # Create output directories                      #
+        ' # mkdir_jobs                                     #
         ' ##################################################
         '''
-        create_output_directories_jobs = []
-        for index, job in enumerate(self.jobs):
-            cmd = 'mkdir -p "%s"' % (
-                os.path.join(job['output_dir'], 'tmp')
-            )
+        mkdir_jobs = []
+        for i, job in enumerate(self.jobs):
+            output_dir = os.path.join(job.output_dir, 'tmp')
+            cmd = 'mkdir'
+            cmd += ' -p'
+            cmd += ' "{}"'.format(output_dir)
             if self.keep_intermediates:
-                cmd += ' "%s" "%s" "%s" "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp', 'hocr'),
-                    os.path.join(job['output_dir'], 'tmp', 'pdf'),
-                    os.path.join(job['output_dir'], 'tmp', 'tiff'),
-                    os.path.join(job['output_dir'], 'tmp', 'txt')
-                )
-                if not self.skip_binarisation:
-                    cmd += ' "%s"' % (
-                        os.path.join(job['output_dir'], 'tmp', 'bin.png')
-                    )
-            create_output_directories_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    label='create_output_directories_job_-_%i' % (index)
-                )
-            )
+                cmd += ' "{}"'.format(os.path.join(output_dir, 'hocr'))
+                cmd += ' "{}"'.format(os.path.join(output_dir, 'pdf'))
+                cmd += ' "{}"'.format(os.path.join(output_dir, 'tiff'))
+                cmd += ' "{}"'.format(os.path.join(output_dir, 'txt'))
+                if self.binarize:
+                    cmd += ' "{}"'.format(os.path.join(output_dir, 'bin.png'))
+                    cmd += ' "{}"'.format(os.path.join(output_dir, 'nrm.png'))
+            lbl = 'mkdir_job_-_{}'.format(i)
+            mkdir_jobs.append(self.addTask(command=cmd, label=lbl))

         '''
         ' ##################################################
-        ' # Split                                          #
+        ' # pdftoppm_jobs                                  #
         ' ##################################################
         '''
-        split_jobs = []
-        split_job_n_cores = min(
-            self.n_cores,
-            max(1, int(self.n_cores / len(self.jobs)))
-        )
-        for index, job in enumerate(self.jobs):
-            if job['filename'].endswith(('.tif', '.tiff')):
-                '''
-                ' This command also works for PDF input but ocropus-nlbin
-                ' is not able to handle the TIFF output of it.
-                '''
-                cmd = 'convert -density 300 "%s" -compress LZW -scene 1 "%s/page-%%d.tif"' % (
-                    job['path'],
-                    os.path.join(job['output_dir'], 'tmp')
-                )
-            else:
-                cmd = 'pdftoppm -r 300 -tiff -tiffcompression lzw "%s" "%s"' % (
-                    job['path'],
-                    os.path.join(job['output_dir'], 'tmp', 'page')
-                )
-            split_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies='create_output_directories_job_-_%i' % (index),
-                    label='split_job_-_%i' % (index),
-                    nCores=split_job_n_cores
-                )
-            )
+        pdftoppm_jobs = []
+        n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
+        for i, job in enumerate(self.jobs):
+            output_dir = os.path.join(job.output_dir, 'tmp')
+            output_file_base = os.path.join(output_dir, 'page')
+            cmd = 'pdftoppm'
+            cmd += ' -r 300'
+            cmd += ' -tiff'
+            cmd += ' -tiffcompression lzw'
+            cmd += ' "{}" "{}"'.format(job.file, output_file_base)
+            deps = 'mkdir_job_-_{}'.format(i)
+            lbl = 'pdftoppm_job_-_{}'.format(i)
+            pdftoppm_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                              label=lbl, nCores=n_cores))

-        if not self.skip_binarisation:
+        if self.binarize:
             '''
-            ' The binarisation_jobs are created based of the output files of
-            ' the split_jobs. So wait until they are finished.
+            ' The ocropus_nlbin_jobs list is created based on the output files
+            ' of the pdftoppm_jobs. So wait until they are finished.
             '''
             self.waitForTasks()

             '''
             ' ##################################################
-            ' # Binarise                                       #
+            ' # ocropus_nlbin_jobs                             #
             ' ##################################################
             '''
-            binarisation_jobs = []
+            ocropus_nlbin_jobs = []
             '''
             ' We run ocropus-nlbin with either four or, if there are less then
             ' four cores available for this workflow, the available core
             ' number.
             '''
-            binarisation_job_n_cores = min(4, self.n_cores)
-            for index, job in enumerate(self.jobs):
-                files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-                files = filter(lambda x: x.endswith('.tif'), files)
+            n_cores = min(4, self.n_cores)
+            for i, job in enumerate(self.jobs):
+                input_dir = os.path.join(job.output_dir, 'tmp')
+                output_dir = input_dir
+                files = filter(lambda x: x.endswith('.tif'),
+                               os.listdir(input_dir))
                 files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-                files = map(
-                    lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                    files
-                )
-                cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    binarisation_job_n_cores,
-                    ' '.join(files)
-                )
-                binarisation_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies='split_job_-_%i' % (index),
-                        label='binarisation_job_-_%i' % (index),
-                        nCores=binarisation_job_n_cores
-                    )
-                )
+                files = map(lambda x: os.path.join(input_dir, x), files)
+                cmd = 'ocropus-nlbin "{}"'.format('" "'.join(files))
+                cmd += ' -o "{}"'.format(output_dir)
+                cmd += ' -Q "{}"'.format(n_cores)
+                deps = 'pdftoppm_job_-_{}'.format(i)
+                lbl = 'ocropus_nlbin_job_-_{}'.format(i)
+                ocropus_nlbin_jobs.append(
+                    self.addTask(command=cmd, dependencies=deps, label=lbl,
+                                 nCores=n_cores))

             '''
-            ' The post_binarisation_jobs are created based of the output files
-            ' of the binarisation_jobs. So wait until they are finished.
+            ' The post_ocropus_nlbin_jobs are created based on the output files
+            ' of the ocropus_nlbin_jobs. So wait until they are finished.
             '''
             self.waitForTasks()

             '''
             ' ##################################################
-            ' # Normalise file names from binarisation        #
+            ' # post_ocropus_nlbin_jobs                        #
             ' ##################################################
             '''
-            post_binarisation_jobs = []
-            for index, job in enumerate(self.jobs):
+            post_ocropus_nlbin_jobs = []
+            for i, job in enumerate(self.jobs):
+                input_dir = os.path.join(job.output_dir, 'tmp')
+                output_dir = input_dir
                 number = 0
-                files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-                files = filter(lambda x: x.endswith('.bin.png'), files)
+                files = filter(lambda x: x.endswith('.bin.png'),
+                               os.listdir(input_dir))
                 files.sort()
                 for file in files:
-                    cmd = 'mv "%s" "%s"' % (
-                        os.path.join(job['output_dir'], 'tmp', file),
-                        os.path.join(
-                            job['output_dir'],
-                            'tmp',
-                            'page-%i.bin.png' % (int(file.split('.', 1)[0]))
-                        )
-                    )
-                    post_binarisation_jobs.append(
-                        self.addTask(
-                            command=cmd,
-                            dependencies='binarisation_job_-_%i' % (index),
-                            label='post_binarisation_job_-_%i-%i' % (
-                                index,
-                                number
-                            )
-                        )
-                    )
+                    # int conversion is done in order to trim leading zeros
+                    output_file = os.path.join(output_dir, 'page-{}.bin.png'.format(int(file.split('.', 1)[0])))  # noqa
+                    cmd = 'mv "{}" "{}"'.format(os.path.join(output_dir, file),
+                                                output_file)
+                    deps = 'ocropus_nlbin_job_-_{}'.format(i)
+                    lbl = 'post_ocropus_nlbin_job_-_{}-{}'.format(i, number)
+                    post_ocropus_nlbin_jobs.append(
+                        self.addTask(command=cmd, dependencies=deps,
+                                     label=lbl))
                     number += 1

         '''
-        ' The ocr_jobs are created based of the output files of either the
-        ' split_jobs or post_binarisation_jobs. So wait until they are
-        ' finished.
+        ' The tesseract_jobs are created based of the output files of either
+        ' the pdftoppm_jobs or post_ocropus_nlbin_jobs. So wait until they are
+        ' finished.
         '''
         self.waitForTasks()

         '''
         ' ##################################################
-        ' # Optical Character Recognition                  #
+        ' # tesseract_jobs                                 #
         ' ##################################################
         '''
-        ocr_jobs = []
+        tesseract_jobs = []
         '''
         ' Tesseract runs fastest with four cores. So we run it with either four
         ' or, if there are less then four cores available for this workflow,
         ' the available core number.
         '''
-        ocr_job_n_cores = min(4, self.n_cores)
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            if self.skip_binarisation:
-                files = filter(lambda x: x.endswith('.tif'), files)
-            else:
-                files = filter(lambda x: x.endswith('.bin.png'), files)
+        n_cores = min(4, self.n_cores)
+        for i, job in enumerate(self.jobs):
+            input_dir = os.path.join(job.output_dir, 'tmp')
+            output_dir = input_dir
+            files = filter(lambda x: x.endswith('.bin.png' if self.binarize else '.tif'),  # noqa
+                           os.listdir(input_dir))
             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: os.path.join(job['output_dir'], 'tmp', x),
-                files
-            )
+            files = map(lambda x: os.path.join(input_dir, x), files)
             number = 0
             for file in files:
-                cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
-                    file,
-                    os.path.join(
-                        job['output_dir'],
-                        'tmp',
-                        file.rsplit('.', 1 if self.skip_binarisation else 2)[0]
-                    ),
-                    self.lang
-                )
-                if self.skip_binarisation:
-                    ocr_job_dependencies = 'split_job_-_%i' % (index)
-                else:
-                    ocr_job_dependencies = filter(
-                        lambda x: x == 'post_binarisation_job_-_%i-%i' % (
-                            index,
-                            number
-                        ),
-                        post_binarisation_jobs
-                    )
-                ocr_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies=ocr_job_dependencies,
-                        label='ocr_job_-_%i-%i' % (index, number),
-                        nCores=ocr_job_n_cores
-                    )
-                )
+                output_file_base = os.path.join(output_dir, file.rsplit('.', 2 if self.binarize else 1)[0])  # noqa
+                cmd = 'tesseract "{}" "{}"'.format(file, output_file_base)
+                cmd += ' -l "{}"'.format(self.lang)
+                cmd += ' hocr pdf txt'
+                if self.binarize:
+                    deps = 'post_ocropus_nlbin_job_-_{}-{}'.format(i, number)
+                else:
+                    deps = 'pdftoppm_job_-_{}'.format(i)
+                label = 'tesseract_jobs_-_{}-{}'.format(i, number)
+                tesseract_jobs.append(
+                    self.addTask(command=cmd, dependencies=deps, label=label,
+                                 nCores=n_cores))
                 number += 1

         '''
@@ -306,251 +220,191 @@ class OCRWorkflow(WorkflowRunner):
         '''
         ' ##################################################
-        ' # Create TEI P5 files                            #
+        ' # hocrtotei_jobs                                 #
         ' ##################################################
         '''
-        hocr_to_tei_jobs = []
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            files = filter(lambda x: x.endswith('.hocr'), files)
+        hocrtotei_jobs = []
+        for i, job in enumerate(self.jobs):
+            input_dir = os.path.join(job.output_dir, 'tmp')
+            files = filter(lambda x: x.endswith('.hocr'),
+                           os.listdir(input_dir))
             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                files
-            )
-            cmd = 'hocrtotei %s "%s"' % (
-                ' '.join(files),
-                os.path.join(
-                    job['output_dir'],
-                    os.path.join(job['output_dir'], job['name'] + '.xml')
-                )
-            )
-            hocr_to_tei_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=filter(
-                        lambda x: x.startswith('ocr_job_-_%i' % (index)),
-                        ocr_jobs
-                    ),
-                    label='hocr_to_tei_job_-_%i' % (index)
-                )
-            )
+            files = map(lambda x: os.path.join(input_dir, x), files)
+            output_file = os.path.join(job.output_dir,
+                                       '{}.xml'.format(job.name))
+            cmd = 'hocrtotei "{}" "{}"'.format('" "'.join(files), output_file)
+            deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)),
+                          tesseract_jobs)
+            lbl = 'hocrtotei_job_-_{}'.format(i)
+            hocrtotei_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                               label=lbl))

         '''
         ' ##################################################
-        ' # Merge PDF files                                #
+        ' # pdfunite_jobs                                  #
         ' ##################################################
         '''
-        pdf_merge_jobs = []
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            files = filter(lambda x: x.endswith('.pdf'), files)
+        pdfunite_jobs = []
+        for i, job in enumerate(self.jobs):
+            input_dir = os.path.join(job.output_dir, 'tmp')
+            files = filter(lambda x: x.endswith('.pdf'), os.listdir(input_dir))
             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                files
-            )
-            cmd = 'pdfunite %s "%s"' % (
-                ' '.join(files),
-                os.path.join(
-                    job['output_dir'],
-                    os.path.join(job['output_dir'], job['name'] + '.pdf')
-                )
-            )
-            pdf_merge_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=filter(
-                        lambda x: x.startswith('ocr_job_-_%i' % (index)),
-                        ocr_jobs
-                    ),
-                    label='pdf_merge_job_-_%i' % (index)
-                )
-            )
+            files = map(lambda x: os.path.join(input_dir, x), files)
+            output_file = os.path.join(job.output_dir,
+                                       '{}.pdf'.format(job.name))
+            cmd = 'pdfunite "{}" "{}"'.format('" "'.join(files), output_file)
+            deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)),
+                          tesseract_jobs)
+            lbl = 'pdfunite_job_-_{}'.format(i)
+            pdfunite_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                              label=lbl))

         '''
         ' ##################################################
-        ' # Merge text files                               #
+        ' # cat_jobs                                       #
         ' ##################################################
         '''
-        txt_merge_jobs = []
-        for index, job in enumerate(self.jobs):
-            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
-            files = filter(lambda x: x.endswith('.txt'), files)
+        cat_jobs = []
+        for i, job in enumerate(self.jobs):
+            input_dir = os.path.join(job.output_dir, 'tmp')
+            files = filter(lambda x: x.endswith('.txt'), os.listdir(input_dir))
             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
-            files = map(
-                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
-                files
-            )
-            cmd = 'cat %s > "%s"' % (
-                ' '.join(files),
-                os.path.join(
-                    job['output_dir'],
-                    os.path.join(job['output_dir'], job['name'] + '.txt')
-                )
-            )
-            txt_merge_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=filter(
-                        lambda x: x.startswith('ocr_job_-_%i' % (index)),
-                        ocr_jobs
-                    ),
-                    label='txt_merge_job_-_%i' % (index)
-                )
-            )
+            files = map(lambda x: os.path.join(input_dir, x), files)
+            output_file = os.path.join(job.output_dir,
+                                       '{}.txt'.format(job.name))
+            cmd = 'cat "{}" > "{}"'.format('" "'.join(files), output_file)
+            deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)),
+                          tesseract_jobs)
+            lbl = 'cat_job_-_{}'.format(i)
+            cat_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                         label=lbl))

-        if self.zip:
-            all_zip_jobs = []
-            all_zip_job_dependencies = (hocr_to_tei_jobs
-                                        + pdf_merge_jobs
-                                        + txt_merge_jobs)
-            cmd = 'cd "%s" && zip "%s"-all-ocr-files.zip */*.{pdf,txt,xml} -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            all_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=all_zip_job_dependencies,
-                    label='all_zip_job'
-                )
-            )
-            pdf_zip_jobs = []
-            pdf_zip_job_dependencies = all_zip_jobs
-            cmd = 'cd "%s" && zip -m "%s"-ocr-pdf.zip */*.pdf -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            pdf_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=pdf_zip_job_dependencies,
-                    label='pdf_zip_job'
-                )
-            )
-            txt_zip_jobs = []
-            txt_zip_job_dependencies = all_zip_jobs
-            cmd = 'cd "%s" && zip -m "%s"-ocr-txt.zip */*.txt -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            txt_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=txt_zip_job_dependencies,
-                    label='txt_zip_job'
-                )
-            )
-            xml_zip_jobs = []
-            xml_zip_job_dependencies = all_zip_jobs
-            cmd = 'cd "%s" && zip -m "%s"-ocr-xml.zip */*.xml -x "pyflow.data*" && cd -' % (
-                self.output_dir,
-                self.zip
-            )
-            xml_zip_jobs.append(
-                self.addTask(
-                    command=cmd,
-                    dependencies=xml_zip_job_dependencies,
-                    label='xml_zip_job'
-                )
-            )
-
         '''
         ' ##################################################
-        ' # Cleanup                                        #
+        ' # zip_jobs                                       #
         ' ##################################################
         '''
-        cleanup_jobs = []
+        zip_jobs = []
+        if self.zip is not None:
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}_-_all" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.pdf" "*.txt" "*.xml"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = hocrtotei_jobs + pdfunite_jobs + cat_jobs
+            lbl = 'zip_job_-_all'
+            zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                         label=lbl))
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -m'
+            cmd += ' -r'
+            cmd += ' "{}_-_pdf" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.pdf"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = 'zip_job_-_all'
+            lbl = 'zip_job_-_pdf'
+            zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                         label=lbl))
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -m'
+            cmd += ' -r'
+            cmd += ' "{}_-_txt" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.txt"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = 'zip_job_-_all'
+            lbl = 'zip_job_-_txt'
+            zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                         label=lbl))
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -m'
+            cmd += ' -r'
+            cmd += ' "{}_-_xml" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.xml"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = 'zip_job_-_all'
+            lbl = 'zip_job_-_xml'
+            zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                         label=lbl))
+
+        '''
+        ' ##################################################
+        ' # mv_jobs                                        #
+        ' ##################################################
+        '''
+        mv_jobs = []
         if self.keep_intermediates:
-            for index, job in enumerate(self.jobs):
-                cleanup_job_dependencies = [
-                    'hocr_to_tei_job_-_%i' % (index),
-                    'pdf_merge_job_-_%i' % (index),
-                    'txt_merge_job_-_%i' % (index)
-                ]
-                cmd = 'mv "%s"/*.hocr "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'hocr'),
-                )
-                cmd += ' && mv "%s"/*.pdf "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'pdf'),
-                )
-                cmd += ' && mv "%s"/*.tif "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'tiff'),
-                )
-                cmd += ' && mv "%s"/*.txt "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp'),
-                    os.path.join(job['output_dir'], 'tmp', 'txt'),
-                )
-                if not self.skip_binarisation:
-                    cmd += ' && mv "%s"/*.bin.png "%s"' % (
-                        os.path.join(job['output_dir'], 'tmp'),
-                        os.path.join(job['output_dir'], 'tmp', 'bin.png'),
-                    )
-                    cmd += ' && rm "%s"/*.nrm.png' % (
-                        os.path.join(job['output_dir'], 'tmp')
-                    )
-                cleanup_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies=cleanup_job_dependencies,
-                        label='cleanup_job_-_%i' % (index)
-                    )
-                )
+            for i, job in enumerate(self.jobs):
+                input_dir = os.path.join(job.output_dir, 'tmp')
+                output_dir = input_dir
+                cmd = 'mv "{}"/*.hocr "{}"'.format(
+                    input_dir, os.path.join(output_dir, 'hocr'))
+                cmd += ' && '
+                cmd += 'mv "{}"/*.pdf "{}"'.format(input_dir, os.path.join(output_dir, 'pdf'))  # noqa
+                cmd += ' && '
+                cmd += 'mv "{}"/*.tif "{}"'.format(input_dir, os.path.join(output_dir, 'tiff'))  # noqa
+                cmd += ' && '
+                cmd += 'mv "{}"/*.txt "{}"'.format(input_dir, os.path.join(output_dir, 'txt'))  # noqa
+                if self.binarize:
+                    cmd += ' && '
+                    cmd += 'mv "{}"/*.bin.png "{}"'.format(input_dir, os.path.join(output_dir, 'bin.png'))  # noqa
+                    cmd += ' && '
+                    cmd += 'mv "{}"/*.nrm.png "{}"'.format(input_dir, os.path.join(output_dir, 'nrm.png'))  # noqa
+                deps = ['hocrtotei_job_-_{}'.format(i),
+                        'pdfunite_job_-_{}'.format(i),
+                        'cat_job_-_{}'.format(i)]
+                lbl = 'mv_job_-_{}'.format(i)
+                mv_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                            label=lbl))
         else:
-            for index, job in enumerate(self.jobs):
-                cleanup_job_dependencies = [
-                    'hocr_to_tei_job_-_%i' % (index),
-                    'pdf_merge_job_-_%i' % (index),
-                    'txt_merge_job_-_%i' % (index)
-                ]
-                cmd = 'rm -r "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp')
-                )
-                cleanup_jobs.append(
-                    self.addTask(
-                        command=cmd,
-                        dependencies=cleanup_job_dependencies,
-                        label='cleanup_job_-_%i' % (index)
-                    )
-                )
+            for i, job in enumerate(self.jobs):
+                input_dir = os.path.join(job.output_dir, 'tmp')
+                cmd = 'rm -r "{}"'.format(input_dir)
+                deps = ['hocrtotei_job_-_{}'.format(i),
+                        'pdfunite_job_-_{}'.format(i),
+                        'cat_job_-_{}'.format(i)]
+                lbl = 'mv_job_-_{}'.format(i)
+                mv_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                            label=lbl))

-def analyze_jobs(input_dir, output_dir):
+def collect_jobs(input_dir, output_dir):
     jobs = []
     for file in os.listdir(input_dir):
         if os.path.isdir(os.path.join(input_dir, file)):
-            jobs += analyze_jobs(
-                os.path.join(input_dir, file),
-                os.path.join(output_dir, file)
-            )
-        elif file.endswith(('.pdf', '.tif', '.tiff')):
-            jobs.append(
-                {
-                    'filename': file,
-                    'name': file.rsplit('.', 1)[0],
-                    'output_dir': os.path.join(output_dir, file),
-                    'path': os.path.join(input_dir, file)
-                }
-            )
+            jobs += collect_jobs(os.path.join(input_dir, file),
+                                 os.path.join(output_dir, file))
+        elif file.endswith('.pdf'):
+            jobs.append(OCRPipelineJob(os.path.join(input_dir, file),
+                                       os.path.join(output_dir, file)))
     return jobs

 def main():
-    args = parse_arguments()
-    wflow = OCRWorkflow(args)
-    retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores)
+    args = parse_args()
+    jobs = collect_jobs(args.i, args.o)
+    ocr_pipeline = OCRPipeline(args.binarize, jobs, args.keep_intermediates,
+                               args.language, args.n_cores, args.o, args.zip)
+    retval = ocr_pipeline.run(dataDirRoot=(args.log_dir or args.o),
+                              nCores=args.n_cores)
     sys.exit(retval)
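The whole pipeline rests on pyflow's WorkflowRunner: every step is a shell command registered with addTask(), the dependencies argument wires the commands into a DAG, and waitForTasks() blocks wherever a later step has to list files produced by an earlier one. Below is a minimal sketch of that pattern, assuming a working pyflow installation; the task labels and commands are invented, not taken from the pipeline.

    # Sketch of the pyflow task-DAG pattern (illustrative only).
    import sys
    from pyflow import WorkflowRunner

    class DemoWorkflow(WorkflowRunner):
        def workflow(self):
            # One task per "page"; the labels double as dependency handles.
            page_tasks = []
            for i in range(3):
                lbl = 'touch_job_-_{}'.format(i)
                self.addTask(lbl, command='touch page-{}.txt'.format(i))
                page_tasks.append(lbl)
            # The merge step starts only after every page task finished.
            self.addTask('merge_job', command='cat page-*.txt > merged.txt',
                         dependencies=page_tasks)

    # run() returns 0 on success; dataDirRoot is where pyflow.data lands.
    retval = DemoWorkflow().run(dataDirRoot='.', nCores=2)
    sys.exit(retval)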


@@ -1,39 +1,29 @@
 #!/usr/bin/env python3
 # coding=utf-8
-import argparse
+from argparse import ArgumentParser
 import os
 import subprocess

-container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest'
-container_input_dir = '/input'
-container_output_dir = '/output'
-uid = str(os.getuid())
-gid = str(os.getgid())
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest'
+CONTAINER_INPUT_DIR = '/input'
+CONTAINER_OUTPUT_DIR = '/output'
+UID = str(os.getuid())
+GID = str(os.getgid())

-parser = argparse.ArgumentParser(add_help=False)
-parser.add_argument(
-    '-i',
-    dest='input_dir',
-    required=False
-)
-parser.add_argument(
-    '-o',
-    dest='output_dir',
-    required=False
-)
+parser = ArgumentParser(add_help=False)
+parser.add_argument('-i')
+parser.add_argument('-o')
 args, remaining_args = parser.parse_known_args()

-cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid]
-if args.input_dir is not None:
-    host_input_dir = os.path.abspath(args.input_dir)
-    cmd += ['-v', host_input_dir + ':' + container_input_dir]
-    remaining_args += ['-i', container_input_dir]
-if args.output_dir is not None:
-    host_output_dir = os.path.abspath(args.output_dir)
-    cmd += ['-v', host_output_dir + ':' + container_output_dir]
-    remaining_args += ['-o', container_output_dir]
-cmd.append(container_image)
+cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
+if args.o is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.o), CONTAINER_OUTPUT_DIR)]
+    remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
+if args.i is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.i), CONTAINER_INPUT_DIR)]
+    remaining_args.insert(0, CONTAINER_INPUT_DIR)
+cmd.append(CONTAINER_IMAGE)
 cmd += remaining_args
 subprocess.run(cmd)
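The wrapper only rewrites -i and -o: each host directory is bind-mounted into the container and replaced by the fixed container path, which is inserted at the front of the pass-through arguments as the pipeline's positional i and o. Because -o is handled first and both use insert(0, ...), the final order is /input, /output, then everything else. Below is a dry-run sketch that builds the same command and prints it instead of running docker; the host paths, UID/GID value, and OCR options are invented for illustration.

    # Dry-run sketch: print the docker command the wrapper would assemble.
    import os

    CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest'
    cmd = ['docker', 'run', '--rm', '-it', '-u', '1000:1000',
           '-v', '{}:/output'.format(os.path.abspath('./results')),
           '-v', '{}:/input'.format(os.path.abspath('./scans')),
           CONTAINER_IMAGE,
           # positional i and o, rewritten to the container paths
           '/input', '/output',
           # everything else is forwarded untouched
           '-l', 'deu', '--binarize']
    print(' '.join(cmd))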