mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-28 05:10:34 +00:00
update for better graph
This commit is contained in:
parent
e5c0d53a03
commit
b9dba80d7f
125
ocr
125
ocr
@ -57,11 +57,11 @@ def parse_arguments():
|
|||||||
required=False
|
required=False
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--skip-binarization',
|
'--skip-binarisation',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
default=False,
|
default=False,
|
||||||
dest='skipBinarization',
|
dest='skipBinarisation',
|
||||||
help='Skip binarization.',
|
help='Skip binarisation.',
|
||||||
required=False
|
required=False
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -86,21 +86,21 @@ def parse_arguments():
|
|||||||
class OCRWorkflow(WorkflowRunner):
|
class OCRWorkflow(WorkflowRunner):
|
||||||
def __init__(self, args):
|
def __init__(self, args):
|
||||||
self.jobs = analyze_jobs(args.inputDirectory, args.outputDirectory)
|
self.jobs = analyze_jobs(args.inputDirectory, args.outputDirectory)
|
||||||
self.skipBinarization = args.skipBinarization
|
self.skipBinarisation = args.skipBinarisation
|
||||||
self.keepIntermediates = args.keepIntermediates
|
self.keepIntermediates = args.keepIntermediates
|
||||||
self.lang = args.lang
|
self.lang = args.lang
|
||||||
self.nCores = args.nCores
|
self.nCores = args.nCores
|
||||||
|
|
||||||
def workflow(self):
|
def workflow(self):
|
||||||
print('##########################################################')
|
'''
|
||||||
print('# Starting workflow... #')
|
' Starting workflow...
|
||||||
print('##########################################################')
|
'''
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
print('%i: %s' % (index, job))
|
print('%i: %s' % (index, job))
|
||||||
|
|
||||||
print('##########################################################')
|
'''
|
||||||
print('# Creating output directories... #')
|
' Creating output directories...
|
||||||
print('##########################################################')
|
'''
|
||||||
create_output_directories_jobs = []
|
create_output_directories_jobs = []
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
cmd = 'mkdir -p "%s"' % (
|
cmd = 'mkdir -p "%s"' % (
|
||||||
@ -113,7 +113,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
os.path.join(job['output_dir'], 'tmp', 'tiff'),
|
os.path.join(job['output_dir'], 'tmp', 'tiff'),
|
||||||
os.path.join(job['output_dir'], 'tmp', 'txt')
|
os.path.join(job['output_dir'], 'tmp', 'txt')
|
||||||
)
|
)
|
||||||
if not self.skipBinarization:
|
if not self.skipBinarisation:
|
||||||
cmd += ' "%s" "%s"' % (
|
cmd += ' "%s" "%s"' % (
|
||||||
os.path.join(job['output_dir'], 'tmp', 'bin.png'),
|
os.path.join(job['output_dir'], 'tmp', 'bin.png'),
|
||||||
os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
|
os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
|
||||||
@ -124,11 +124,10 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
label='create_output_directories_job_-_%i' % (index)
|
label='create_output_directories_job_-_%i' % (index)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
self.waitForTasks()
|
|
||||||
|
|
||||||
print('##########################################################')
|
'''
|
||||||
print('# Splitting... #')
|
' Splitting...
|
||||||
print('##########################################################')
|
'''
|
||||||
split_jobs = []
|
split_jobs = []
|
||||||
split_job_nCores = min(
|
split_job_nCores = min(
|
||||||
self.nCores,
|
self.nCores,
|
||||||
@ -148,16 +147,16 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
split_jobs.append(
|
split_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
command=cmd,
|
command=cmd,
|
||||||
|
dependencies='create_output_directories_job_-_%i' % (index),
|
||||||
label='split_job_-_%i' % (index),
|
label='split_job_-_%i' % (index),
|
||||||
nCores=split_job_nCores
|
nCores=split_job_nCores
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
self.waitForTasks()
|
|
||||||
|
|
||||||
if not self.skipBinarization:
|
if not self.skipBinarisation:
|
||||||
print('##########################################################')
|
'''
|
||||||
print('# Binarising... #')
|
' Binarising...
|
||||||
print('##########################################################')
|
'''
|
||||||
binarisation_jobs = []
|
binarisation_jobs = []
|
||||||
'''
|
'''
|
||||||
' We run ocropus-nlbin with either four or, if there are fewer than
|
' We run ocropus-nlbin with either four or, if there are fewer than
|
||||||
@ -174,15 +173,16 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
binarisation_jobs.append(
|
binarisation_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
command=cmd,
|
command=cmd,
|
||||||
|
dependencies='split_job_-_%i' % (index),
|
||||||
label='binarisation_job_-_%i' % (index),
|
label='binarisation_job_-_%i' % (index),
|
||||||
nCores=binarisation_job_nCores
|
nCores=binarisation_job_nCores
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
self.waitForTasks()
|
|
||||||
|
|
||||||
print('##########################################################')
|
'''
|
||||||
print('# Normalising file names from binarisation... #')
|
' Normalising file names from binarisation...
|
||||||
print('##########################################################')
|
'''
|
||||||
|
self.waitForTasks()
|
||||||
post_binarisation_jobs = []
|
post_binarisation_jobs = []
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
number = 0
|
number = 0
|
||||||
@ -197,6 +197,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
post_binarisation_jobs.append(
|
post_binarisation_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
command=cmd,
|
command=cmd,
|
||||||
|
dependencies='binarisation_job_-_%i' % (index),
|
||||||
label='post_binarisation_job_-_%i-%i' % (
|
label='post_binarisation_job_-_%i-%i' % (
|
||||||
index,
|
index,
|
||||||
number
|
number
|
||||||
@ -204,11 +205,12 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
number += 1
|
number += 1
|
||||||
self.waitForTasks()
|
|
||||||
|
|
||||||
print('##########################################################')
|
'''
|
||||||
print('# Performing OCR... #')
|
' Performing OCR...
|
||||||
print('##########################################################')
|
'''
|
||||||
|
self.waitForTasks()
|
||||||
|
print(self)
|
||||||
ocr_jobs = []
|
ocr_jobs = []
|
||||||
'''
|
'''
|
||||||
' Tesseract runs fastest with four cores. So we run it with either four
|
' Tesseract runs fastest with four cores. So we run it with either four
|
||||||
@ -224,29 +226,39 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
ocr_job_nCores = 1
|
ocr_job_nCores = 1
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
number = 0
|
number = 0
|
||||||
for file in filter(lambda x: x.endswith('.tif') if self.skipBinarization else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
|
for file in filter(lambda x: x.endswith('.tif') if self.skipBinarisation else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
|
||||||
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
|
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
|
||||||
os.path.join(job['output_dir'], 'tmp', file),
|
os.path.join(job['output_dir'], 'tmp', file),
|
||||||
os.path.join(
|
os.path.join(
|
||||||
job['output_dir'],
|
job['output_dir'],
|
||||||
'tmp',
|
'tmp',
|
||||||
file.rsplit('.', 1 if self.skipBinarization else 2)[0]
|
file.rsplit('.', 1 if self.skipBinarisation else 2)[0]
|
||||||
),
|
),
|
||||||
self.lang
|
self.lang
|
||||||
)
|
)
|
||||||
|
if self.skipBinarisation:
|
||||||
|
ocr_job_dependencies = 'split_job_-_%i' % (index)
|
||||||
|
else:
|
||||||
|
ocr_job_dependencies = filter(
|
||||||
|
lambda x: x.startswith(
|
||||||
|
'post_binarisation_job_-_%i' % (index)
|
||||||
|
),
|
||||||
|
post_binarisation_jobs
|
||||||
|
)
|
||||||
|
print(ocr_job_dependencies)
|
||||||
ocr_jobs.append(
|
ocr_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
command=cmd,
|
command=cmd,
|
||||||
|
dependencies=ocr_job_dependencies,
|
||||||
label='ocr_job_-_%i-%i' % (index, number),
|
label='ocr_job_-_%i-%i' % (index, number),
|
||||||
nCores=ocr_job_nCores
|
nCores=ocr_job_nCores
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
number += 1
|
number += 1
|
||||||
self.waitForTasks()
|
|
||||||
|
|
||||||
print('##########################################################')
|
'''
|
||||||
print('# Creating TEI P5 files... #')
|
' Creating TEI P5 files...
|
||||||
print('##########################################################')
|
'''
|
||||||
hocr_to_tei_jobs = []
|
hocr_to_tei_jobs = []
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
cmd = 'hocrtotei "%s" "%s"' % (
|
cmd = 'hocrtotei "%s" "%s"' % (
|
||||||
@ -259,13 +271,17 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
hocr_to_tei_jobs.append(
|
hocr_to_tei_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
command=cmd,
|
command=cmd,
|
||||||
|
dependencies=filter(
|
||||||
|
lambda x: x.startswith('ocr_job_-_%i' % (index)),
|
||||||
|
ocr_jobs
|
||||||
|
),
|
||||||
label='hocr_to_tei_job_-_%i' % (index)
|
label='hocr_to_tei_job_-_%i' % (index)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
print('##########################################################')
|
'''
|
||||||
print('# Merging PDF files... #')
|
' Merging PDF files...
|
||||||
print('##########################################################')
|
'''
|
||||||
pdf_merge_jobs = []
|
pdf_merge_jobs = []
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % (
|
cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % (
|
||||||
@ -278,13 +294,17 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
pdf_merge_jobs.append(
|
pdf_merge_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
command=cmd,
|
command=cmd,
|
||||||
|
dependencies=filter(
|
||||||
|
lambda x: x.startswith('ocr_job_-_%i' % (index)),
|
||||||
|
ocr_jobs
|
||||||
|
),
|
||||||
label='pdf_merge_job_-_%i' % (index)
|
label='pdf_merge_job_-_%i' % (index)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
print('##########################################################')
|
'''
|
||||||
print('# Merging text files... #')
|
' Merging text files...
|
||||||
print('##########################################################')
|
'''
|
||||||
txt_merge_jobs = []
|
txt_merge_jobs = []
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % (
|
cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % (
|
||||||
@ -297,17 +317,25 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
txt_merge_jobs.append(
|
txt_merge_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
command=cmd,
|
command=cmd,
|
||||||
|
dependencies=filter(
|
||||||
|
lambda x: x.startswith('ocr_job_-_%i' % (index)),
|
||||||
|
ocr_jobs
|
||||||
|
),
|
||||||
label='txt_merge_job_-_%i' % (index)
|
label='txt_merge_job_-_%i' % (index)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
self.waitForTasks()
|
|
||||||
|
|
||||||
print('##########################################################')
|
'''
|
||||||
print('# Cleanup... #')
|
' Cleanup...
|
||||||
print('##########################################################')
|
'''
|
||||||
cleanup_jobs = []
|
cleanup_jobs = []
|
||||||
if self.keepIntermediates:
|
if self.keepIntermediates:
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
|
cleanup_job_dependencies = [
|
||||||
|
'hocr_to_tei_job_-_%i' % (index),
|
||||||
|
'pdf_merge_job_-_%i' % (index),
|
||||||
|
'txt_merge_job_-_%i' % (index)
|
||||||
|
]
|
||||||
cmd = 'mv "%s"/*.hocr "%s"' % (
|
cmd = 'mv "%s"/*.hocr "%s"' % (
|
||||||
os.path.join(job['output_dir'], 'tmp'),
|
os.path.join(job['output_dir'], 'tmp'),
|
||||||
os.path.join(job['output_dir'], 'tmp', 'hocr'),
|
os.path.join(job['output_dir'], 'tmp', 'hocr'),
|
||||||
@ -324,7 +352,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
os.path.join(job['output_dir'], 'tmp'),
|
os.path.join(job['output_dir'], 'tmp'),
|
||||||
os.path.join(job['output_dir'], 'tmp', 'txt'),
|
os.path.join(job['output_dir'], 'tmp', 'txt'),
|
||||||
)
|
)
|
||||||
if not self.skipBinarization:
|
if not self.skipBinarisation:
|
||||||
cmd += ' && mv "%s"/*.bin.png "%s"' % (
|
cmd += ' && mv "%s"/*.bin.png "%s"' % (
|
||||||
os.path.join(job['output_dir'], 'tmp'),
|
os.path.join(job['output_dir'], 'tmp'),
|
||||||
os.path.join(job['output_dir'], 'tmp', 'bin.png'),
|
os.path.join(job['output_dir'], 'tmp', 'bin.png'),
|
||||||
@ -336,17 +364,24 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
cleanup_jobs.append(
|
cleanup_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
command=cmd,
|
command=cmd,
|
||||||
|
dependencies=cleanup_job_dependencies,
|
||||||
label='cleanup_job_-_%i' % (index)
|
label='cleanup_job_-_%i' % (index)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
|
cleanup_job_dependencies = [
|
||||||
|
'hocr_to_tei_job_-_%i' % (index),
|
||||||
|
'pdf_merge_job_-_%i' % (index),
|
||||||
|
'txt_merge_job_-_%i' % (index)
|
||||||
|
]
|
||||||
cmd = 'rm -r "%s"' % (
|
cmd = 'rm -r "%s"' % (
|
||||||
os.path.join(job['output_dir'], 'tmp')
|
os.path.join(job['output_dir'], 'tmp')
|
||||||
)
|
)
|
||||||
cleanup_jobs.append(
|
cleanup_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
command=cmd,
|
command=cmd,
|
||||||
|
dependencies=cleanup_job_dependencies,
|
||||||
label='cleanup_job_-_%i' % (index)
|
label='cleanup_job_-_%i' % (index)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user