Add skip binarization

This commit is contained in:
Patrick Jentsch 2019-04-12 15:28:24 +02:00
parent 0a25afbd51
commit ac9b25271f

37
ocr
View File

@ -42,6 +42,12 @@ def parse_arguments():
dest="outputDir", dest="outputDir",
help="Output directory.", help="Output directory.",
required=True) required=True)
parser.add_argument("--skip-binarization",
action='store_true',
default=False,
dest="skipBinarization",
help="Skip binarization.",
required=False)
parser.add_argument("--keep-intermediates", parser.add_argument("--keep-intermediates",
action='store_true', action='store_true',
default=False, default=False,
@ -58,8 +64,9 @@ def parse_arguments():
class OCRWorkflow(WorkflowRunner): class OCRWorkflow(WorkflowRunner):
def __init__(self, jobs, keepIntermediates, lang, nCores): def __init__(self, jobs, skipBinarization, keepIntermediates, lang, nCores):
self.jobs = jobs self.jobs = jobs
self.skipBinarization = skipBinarization
self.keepIntermediates = keepIntermediates self.keepIntermediates = keepIntermediates
self.lang = lang self.lang = lang
self.nCores = nCores self.nCores = nCores
@ -116,17 +123,18 @@ class OCRWorkflow(WorkflowRunner):
# Task "ocropus_nlbin_job": binarize tiff files from previous split # Task "ocropus_nlbin_job": binarize tiff files from previous split
# Dependencies: split_jobs # Dependencies: split_jobs
### ###
self.waitForTasks()
ocropusnlbin_jobs = [] ocropusnlbin_jobs = []
ocropusnlbin_job_number = 0 if (not self.skipBinarization):
for job in self.jobs: self.waitForTasks()
ocropusnlbin_job_number += 1 ocropusnlbin_job_number = 0
cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % ( for job in self.jobs:
max(1, int(self.nCores / len(self.jobs))), ocropusnlbin_job_number += 1
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % (
os.path.join(job["output_dir"], "tmp", "tiff_files") max(1, int(self.nCores / len(self.jobs))),
) os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs))))) os.path.join(job["output_dir"], "tmp", "tiff_files")
)
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs)))))
### ###
# Task "tesseract_job": perform OCR on binarized images # Task "tesseract_job": perform OCR on binarized images
@ -137,11 +145,11 @@ class OCRWorkflow(WorkflowRunner):
tesseract_job_number = 0 tesseract_job_number = 0
for job in self.jobs: for job in self.jobs:
# This list is empty if you don't wait for ocropus_nlbin_jobs to complete # This list is empty if you don't wait for ocropus_nlbin_jobs to complete
for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))): for file in filter(lambda x: self.skipBinarization or x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin")):
tesseract_job_number += 1 tesseract_job_number += 1
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % ( cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file), os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin", file),
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]), os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 1 if self.skipBinarization else 2)[0]),
self.lang self.lang
) )
tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores))) tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores)))
@ -240,6 +248,7 @@ def main():
wflow = OCRWorkflow( wflow = OCRWorkflow(
analyze_jobs(args.inputDir, args.outputDir), analyze_jobs(args.inputDir, args.outputDir),
args.skipBinarization,
args.keepIntermediates, args.keepIntermediates,
args.lang, args.lang,
args.nCores args.nCores