Add --skip-binarization option

This commit is contained in:
Patrick Jentsch 2019-04-12 15:28:24 +02:00
parent 0a25afbd51
commit ac9b25271f

37
ocr
View File

@ -42,6 +42,12 @@ def parse_arguments():
dest="outputDir",
help="Output directory.",
required=True)
parser.add_argument("--skip-binarization",
action='store_true',
default=False,
dest="skipBinarization",
help="Skip binarization.",
required=False)
parser.add_argument("--keep-intermediates",
action='store_true',
default=False,
@ -58,8 +64,9 @@ def parse_arguments():
class OCRWorkflow(WorkflowRunner):
def __init__(self, jobs, keepIntermediates, lang, nCores):
def __init__(self, jobs, skipBinarization, keepIntermediates, lang, nCores):
self.jobs = jobs
self.skipBinarization = skipBinarization
self.keepIntermediates = keepIntermediates
self.lang = lang
self.nCores = nCores
@ -116,17 +123,18 @@ class OCRWorkflow(WorkflowRunner):
# Task "ocropus_nlbin_job": binarize tiff files from previous split
# Dependencies: split_jobs
###
self.waitForTasks()
ocropusnlbin_jobs = []
ocropusnlbin_job_number = 0
for job in self.jobs:
ocropusnlbin_job_number += 1
cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % (
max(1, int(self.nCores / len(self.jobs))),
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
os.path.join(job["output_dir"], "tmp", "tiff_files")
)
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs)))))
if (not self.skipBinarization):
self.waitForTasks()
ocropusnlbin_job_number = 0
for job in self.jobs:
ocropusnlbin_job_number += 1
cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % (
max(1, int(self.nCores / len(self.jobs))),
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
os.path.join(job["output_dir"], "tmp", "tiff_files")
)
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs)))))
###
# Task "tesseract_job": perform OCR on binarized images
@ -137,11 +145,11 @@ class OCRWorkflow(WorkflowRunner):
tesseract_job_number = 0
for job in self.jobs:
# This list is empty if you don't wait for ocropus_nlbin_jobs to complete
for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))):
for file in filter(lambda x: self.skipBinarization or x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin")):
tesseract_job_number += 1
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file),
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]),
os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin", file),
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 1 if self.skipBinarization else 2)[0]),
self.lang
)
tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores)))
@ -240,6 +248,7 @@ def main():
wflow = OCRWorkflow(
analyze_jobs(args.inputDir, args.outputDir),
args.skipBinarization,
args.keepIntermediates,
args.lang,
args.nCores