From a947e369976fd9236c5648a53a2191b389582308 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Thu, 11 Apr 2019 11:50:09 +0200 Subject: [PATCH] Start one ocropus-nlbin job per page instead of one per document --- ocr | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/ocr b/ocr index c7b50ff..c80a3ac 100755 --- a/ocr +++ b/ocr @@ -116,15 +116,18 @@ class OCRWorkflow(WorkflowRunner): # Task "ocropus_nlbin_job": binarize tiff files from previous split # Dependencies: split_jobs ### + self.waitForTasks() ocropusnlbin_jobs = [] ocropusnlbin_job_number = 0 for job in self.jobs: - ocropusnlbin_job_number += 1 - cmd = 'ocropus-nlbin -o "%s" "%s"/*' % ( - os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), - os.path.join(job["output_dir"], "tmp", "tiff_files") - ) - ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs)) + # This list is empty if you don't wait for split_jobs to complete + for file in os.listdir(os.path.join(job["output_dir"], "tmp", "tiff_files")): + ocropusnlbin_job_number += 1 + cmd = 'ocropus-nlbin -o "%s" "%s"/*' % ( + os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), + os.path.join(job["output_dir"], "tmp", "tiff_files", file) + ) + ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs)) ### # Task "tesseract_job": perform OCR on binarized images