diff --git a/ocr b/ocr index 1ad626f..b54d8d9 100755 --- a/ocr +++ b/ocr @@ -42,6 +42,12 @@ def parse_arguments(): dest="outputDir", help="Output directory.", required=True) + parser.add_argument("--skip-binarization", + action='store_true', + default=False, + dest="skipBinarization", + help="Skip binarization.", + required=False) parser.add_argument("--keep-intermediates", action='store_true', default=False, @@ -58,8 +64,9 @@ def parse_arguments(): class OCRWorkflow(WorkflowRunner): - def __init__(self, jobs, keepIntermediates, lang, nCores): + def __init__(self, jobs, skipBinarization, keepIntermediates, lang, nCores): self.jobs = jobs + self.skipBinarization = skipBinarization self.keepIntermediates = keepIntermediates self.lang = lang self.nCores = nCores @@ -116,17 +123,18 @@ class OCRWorkflow(WorkflowRunner): # Task "ocropus_nlbin_job": binarize tiff files from previous split # Dependencies: split_jobs ### - self.waitForTasks() ocropusnlbin_jobs = [] - ocropusnlbin_job_number = 0 - for job in self.jobs: - ocropusnlbin_job_number += 1 - cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % ( - max(1, int(self.nCores / len(self.jobs))), - os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), - os.path.join(job["output_dir"], "tmp", "tiff_files") - ) - ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs))))) + if (not self.skipBinarization): + self.waitForTasks() + ocropusnlbin_job_number = 0 + for job in self.jobs: + ocropusnlbin_job_number += 1 + cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % ( + max(1, int(self.nCores / len(self.jobs))), + os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), + os.path.join(job["output_dir"], "tmp", "tiff_files") + ) + ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs))))) ### # Task "tesseract_job": perform OCR on binarized images @@ -137,11 +145,11 @@ class OCRWorkflow(WorkflowRunner): tesseract_job_number = 0 for job in self.jobs: # This list is empty if you don't wait for ocropus_nlbin_jobs to complete - for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))): + for file in filter(lambda x: self.skipBinarization or x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin")): tesseract_job_number += 1 cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % ( - os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file), - os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]), + os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin", file), + os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 1 if self.skipBinarization else 2)[0]), self.lang ) tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores))) @@ -240,6 +248,7 @@ def main(): wflow = OCRWorkflow( analyze_jobs(args.inputDir, args.outputDir), + args.skipBinarization, args.keepIntermediates, args.lang, args.nCores