From ac9b25271f0287812f51b3073be1e9b9d7bc7191 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Fri, 12 Apr 2019 15:28:24 +0200
Subject: [PATCH] Add option to skip binarization
---
ocr | 37 +++++++++++++++++++++++--------------
1 file changed, 23 insertions(+), 14 deletions(-)
diff --git a/ocr b/ocr
index 1ad626f..b54d8d9 100755
--- a/ocr
+++ b/ocr
@@ -42,6 +42,12 @@ def parse_arguments():
dest="outputDir",
help="Output directory.",
required=True)
+ parser.add_argument("--skip-binarization",
+ action='store_true',
+ default=False,
+ dest="skipBinarization",
+ help="Skip binarization.",
+ required=False)
parser.add_argument("--keep-intermediates",
action='store_true',
default=False,
@@ -58,8 +64,9 @@ def parse_arguments():
class OCRWorkflow(WorkflowRunner):
- def __init__(self, jobs, keepIntermediates, lang, nCores):
+ def __init__(self, jobs, skipBinarization, keepIntermediates, lang, nCores):
self.jobs = jobs
+ self.skipBinarization = skipBinarization
self.keepIntermediates = keepIntermediates
self.lang = lang
self.nCores = nCores
@@ -116,17 +123,18 @@ class OCRWorkflow(WorkflowRunner):
# Task "ocropus_nlbin_job": binarize tiff files from previous split
# Dependencies: split_jobs
###
- self.waitForTasks()
ocropusnlbin_jobs = []
- ocropusnlbin_job_number = 0
- for job in self.jobs:
- ocropusnlbin_job_number += 1
- cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % (
- max(1, int(self.nCores / len(self.jobs))),
- os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
- os.path.join(job["output_dir"], "tmp", "tiff_files")
- )
- ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs)))))
+ if (not self.skipBinarization):
+ self.waitForTasks()
+ ocropusnlbin_job_number = 0
+ for job in self.jobs:
+ ocropusnlbin_job_number += 1
+ cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % (
+ max(1, int(self.nCores / len(self.jobs))),
+ os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
+ os.path.join(job["output_dir"], "tmp", "tiff_files")
+ )
+ ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs)))))
###
# Task "tesseract_job": perform OCR on binarized images
@@ -137,11 +145,11 @@ class OCRWorkflow(WorkflowRunner):
tesseract_job_number = 0
for job in self.jobs:
# This list is empty if you don't wait for ocropus_nlbin_jobs to complete
- for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))):
+ for file in filter(lambda x: self.skipBinarization or x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin")):
tesseract_job_number += 1
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
- os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file),
- os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]),
+ os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin", file),
+ os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 1 if self.skipBinarization else 2)[0]),
self.lang
)
tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores)))
@@ -240,6 +248,7 @@ def main():
wflow = OCRWorkflow(
analyze_jobs(args.inputDir, args.outputDir),
+ args.skipBinarization,
args.keepIntermediates,
args.lang,
args.nCores