update for better graph

This commit is contained in:
Patrick Jentsch 2019-05-15 13:54:08 +02:00
parent e5c0d53a03
commit b9dba80d7f

125
ocr
View File

@ -57,11 +57,11 @@ def parse_arguments():
required=False required=False
) )
parser.add_argument( parser.add_argument(
'--skip-binarization', '--skip-binarisation',
action='store_true', action='store_true',
default=False, default=False,
dest='skipBinarization', dest='skipBinarisation',
help='Skip binarization.', help='Skip binarisation.',
required=False required=False
) )
parser.add_argument( parser.add_argument(
@ -86,21 +86,21 @@ def parse_arguments():
class OCRWorkflow(WorkflowRunner): class OCRWorkflow(WorkflowRunner):
def __init__(self, args): def __init__(self, args):
self.jobs = analyze_jobs(args.inputDirectory, args.outputDirectory) self.jobs = analyze_jobs(args.inputDirectory, args.outputDirectory)
self.skipBinarization = args.skipBinarization self.skipBinarisation = args.skipBinarisation
self.keepIntermediates = args.keepIntermediates self.keepIntermediates = args.keepIntermediates
self.lang = args.lang self.lang = args.lang
self.nCores = args.nCores self.nCores = args.nCores
def workflow(self): def workflow(self):
print('##########################################################') '''
print('# Starting workflow... #') ' Starting workflow...
print('##########################################################') '''
for index, job in enumerate(self.jobs): for index, job in enumerate(self.jobs):
print('%i: %s' % (index, job)) print('%i: %s' % (index, job))
print('##########################################################') '''
print('# Creating output directories... #') ' Creating output directories...
print('##########################################################') '''
create_output_directories_jobs = [] create_output_directories_jobs = []
for index, job in enumerate(self.jobs): for index, job in enumerate(self.jobs):
cmd = 'mkdir -p "%s"' % ( cmd = 'mkdir -p "%s"' % (
@ -113,7 +113,7 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(job['output_dir'], 'tmp', 'tiff'), os.path.join(job['output_dir'], 'tmp', 'tiff'),
os.path.join(job['output_dir'], 'tmp', 'txt') os.path.join(job['output_dir'], 'tmp', 'txt')
) )
if not self.skipBinarization: if not self.skipBinarisation:
cmd += ' "%s" "%s"' % ( cmd += ' "%s" "%s"' % (
os.path.join(job['output_dir'], 'tmp', 'bin.png'), os.path.join(job['output_dir'], 'tmp', 'bin.png'),
os.path.join(job['output_dir'], 'tmp', 'nrm.png'), os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
@ -124,11 +124,10 @@ class OCRWorkflow(WorkflowRunner):
label='create_output_directories_job_-_%i' % (index) label='create_output_directories_job_-_%i' % (index)
) )
) )
self.waitForTasks()
print('##########################################################') '''
print('# Splitting... #') ' Splitting...
print('##########################################################') '''
split_jobs = [] split_jobs = []
split_job_nCores = min( split_job_nCores = min(
self.nCores, self.nCores,
@ -148,16 +147,16 @@ class OCRWorkflow(WorkflowRunner):
split_jobs.append( split_jobs.append(
self.addTask( self.addTask(
command=cmd, command=cmd,
dependencies='create_output_directories_job_-_%i' % (index),
label='split_job_-_%i' % (index), label='split_job_-_%i' % (index),
nCores=split_job_nCores nCores=split_job_nCores
) )
) )
self.waitForTasks()
if not self.skipBinarization: if not self.skipBinarisation:
print('##########################################################') '''
print('# Binarising... #') ' Binarising...
print('##########################################################') '''
binarisation_jobs = [] binarisation_jobs = []
''' '''
' We run ocropus-nlbin with either four or, if there are fewer than ' We run ocropus-nlbin with either four or, if there are fewer than
@ -174,15 +173,16 @@ class OCRWorkflow(WorkflowRunner):
binarisation_jobs.append( binarisation_jobs.append(
self.addTask( self.addTask(
command=cmd, command=cmd,
dependencies='split_job_-_%i' % (index),
label='binarisation_job_-_%i' % (index), label='binarisation_job_-_%i' % (index),
nCores=binarisation_job_nCores nCores=binarisation_job_nCores
) )
) )
self.waitForTasks()
print('##########################################################') '''
print('# Normalising file names from binarisation... #') ' Normalising file names from binarisation...
print('##########################################################') '''
self.waitForTasks()
post_binarisation_jobs = [] post_binarisation_jobs = []
for index, job in enumerate(self.jobs): for index, job in enumerate(self.jobs):
number = 0 number = 0
@ -197,6 +197,7 @@ class OCRWorkflow(WorkflowRunner):
post_binarisation_jobs.append( post_binarisation_jobs.append(
self.addTask( self.addTask(
command=cmd, command=cmd,
dependencies='binarisation_job_-_%i' % (index),
label='post_binarisation_job_-_%i-%i' % ( label='post_binarisation_job_-_%i-%i' % (
index, index,
number number
@ -204,11 +205,12 @@ class OCRWorkflow(WorkflowRunner):
) )
) )
number += 1 number += 1
self.waitForTasks()
print('##########################################################') '''
print('# Performing OCR... #') ' Performing OCR...
print('##########################################################') '''
self.waitForTasks()
print(self)
ocr_jobs = [] ocr_jobs = []
''' '''
' Tesseract runs fastest with four cores. So we run it with either four ' Tesseract runs fastest with four cores. So we run it with either four
@ -224,29 +226,39 @@ class OCRWorkflow(WorkflowRunner):
ocr_job_nCores = 1 ocr_job_nCores = 1
for index, job in enumerate(self.jobs): for index, job in enumerate(self.jobs):
number = 0 number = 0
for file in filter(lambda x: x.endswith('.tif') if self.skipBinarization else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))): for file in filter(lambda x: x.endswith('.tif') if self.skipBinarisation else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % ( cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
os.path.join(job['output_dir'], 'tmp', file), os.path.join(job['output_dir'], 'tmp', file),
os.path.join( os.path.join(
job['output_dir'], job['output_dir'],
'tmp', 'tmp',
file.rsplit('.', 1 if self.skipBinarization else 2)[0] file.rsplit('.', 1 if self.skipBinarisation else 2)[0]
), ),
self.lang self.lang
) )
if self.skipBinarisation:
ocr_job_dependencies = 'split_job_-_%i' % (index)
else:
ocr_job_dependencies = filter(
lambda x: x.startswith(
'post_binarisation_job_-_%i' % (index)
),
post_binarisation_jobs
)
print(ocr_job_dependencies)
ocr_jobs.append( ocr_jobs.append(
self.addTask( self.addTask(
command=cmd, command=cmd,
dependencies=ocr_job_dependencies,
label='ocr_job_-_%i-%i' % (index, number), label='ocr_job_-_%i-%i' % (index, number),
nCores=ocr_job_nCores nCores=ocr_job_nCores
) )
) )
number += 1 number += 1
self.waitForTasks()
print('##########################################################') '''
print('# Creating TEI P5 files... #') ' Creating TEI P5 files...
print('##########################################################') '''
hocr_to_tei_jobs = [] hocr_to_tei_jobs = []
for index, job in enumerate(self.jobs): for index, job in enumerate(self.jobs):
cmd = 'hocrtotei "%s" "%s"' % ( cmd = 'hocrtotei "%s" "%s"' % (
@ -259,13 +271,17 @@ class OCRWorkflow(WorkflowRunner):
hocr_to_tei_jobs.append( hocr_to_tei_jobs.append(
self.addTask( self.addTask(
command=cmd, command=cmd,
dependencies=filter(
lambda x: x.startswith('ocr_job_-_%i' % (index)),
ocr_jobs
),
label='hocr_to_tei_job_-_%i' % (index) label='hocr_to_tei_job_-_%i' % (index)
) )
) )
print('##########################################################') '''
print('# Merging PDF files... #') ' Merging PDF files...
print('##########################################################') '''
pdf_merge_jobs = [] pdf_merge_jobs = []
for index, job in enumerate(self.jobs): for index, job in enumerate(self.jobs):
cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % ( cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % (
@ -278,13 +294,17 @@ class OCRWorkflow(WorkflowRunner):
pdf_merge_jobs.append( pdf_merge_jobs.append(
self.addTask( self.addTask(
command=cmd, command=cmd,
dependencies=filter(
lambda x: x.startswith('ocr_job_-_%i' % (index)),
ocr_jobs
),
label='pdf_merge_job_-_%i' % (index) label='pdf_merge_job_-_%i' % (index)
) )
) )
print('##########################################################') '''
print('# Merging text files... #') ' Merging text files...
print('##########################################################') '''
txt_merge_jobs = [] txt_merge_jobs = []
for index, job in enumerate(self.jobs): for index, job in enumerate(self.jobs):
cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % ( cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % (
@ -297,17 +317,25 @@ class OCRWorkflow(WorkflowRunner):
txt_merge_jobs.append( txt_merge_jobs.append(
self.addTask( self.addTask(
command=cmd, command=cmd,
dependencies=filter(
lambda x: x.startswith('ocr_job_-_%i' % (index)),
ocr_jobs
),
label='txt_merge_job_-_%i' % (index) label='txt_merge_job_-_%i' % (index)
) )
) )
self.waitForTasks()
print('##########################################################') '''
print('# Cleanup... #') ' Cleanup...
print('##########################################################') '''
cleanup_jobs = [] cleanup_jobs = []
if self.keepIntermediates: if self.keepIntermediates:
for index, job in enumerate(self.jobs): for index, job in enumerate(self.jobs):
cleanup_job_dependencies = [
'hocr_to_tei_job_-_%i' % (index),
'pdf_merge_job_-_%i' % (index),
'txt_merge_job_-_%i' % (index)
]
cmd = 'mv "%s"/*.hocr "%s"' % ( cmd = 'mv "%s"/*.hocr "%s"' % (
os.path.join(job['output_dir'], 'tmp'), os.path.join(job['output_dir'], 'tmp'),
os.path.join(job['output_dir'], 'tmp', 'hocr'), os.path.join(job['output_dir'], 'tmp', 'hocr'),
@ -324,7 +352,7 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(job['output_dir'], 'tmp'), os.path.join(job['output_dir'], 'tmp'),
os.path.join(job['output_dir'], 'tmp', 'txt'), os.path.join(job['output_dir'], 'tmp', 'txt'),
) )
if not self.skipBinarization: if not self.skipBinarisation:
cmd += ' && mv "%s"/*.bin.png "%s"' % ( cmd += ' && mv "%s"/*.bin.png "%s"' % (
os.path.join(job['output_dir'], 'tmp'), os.path.join(job['output_dir'], 'tmp'),
os.path.join(job['output_dir'], 'tmp', 'bin.png'), os.path.join(job['output_dir'], 'tmp', 'bin.png'),
@ -336,17 +364,24 @@ class OCRWorkflow(WorkflowRunner):
cleanup_jobs.append( cleanup_jobs.append(
self.addTask( self.addTask(
command=cmd, command=cmd,
dependencies=cleanup_job_dependencies,
label='cleanup_job_-_%i' % (index) label='cleanup_job_-_%i' % (index)
) )
) )
else: else:
for index, job in enumerate(self.jobs): for index, job in enumerate(self.jobs):
cleanup_job_dependencies = [
'hocr_to_tei_job_-_%i' % (index),
'pdf_merge_job_-_%i' % (index),
'txt_merge_job_-_%i' % (index)
]
cmd = 'rm -r "%s"' % ( cmd = 'rm -r "%s"' % (
os.path.join(job['output_dir'], 'tmp') os.path.join(job['output_dir'], 'tmp')
) )
cleanup_jobs.append( cleanup_jobs.append(
self.addTask( self.addTask(
command=cmd, command=cmd,
dependencies=cleanup_job_dependencies,
label='cleanup_job_-_%i' % (index) label='cleanup_job_-_%i' % (index)
) )
) )