Start one ocropus-nlbin job per page instead of one per document

This commit is contained in:
Patrick Jentsch 2019-04-11 11:50:09 +02:00
parent 2648f40b92
commit a947e36997

15
ocr
View File

@@ -116,15 +116,18 @@ class OCRWorkflow(WorkflowRunner):
# Task "ocropus_nlbin_job": binarize tiff files from previous split # Task "ocropus_nlbin_job": binarize tiff files from previous split
# Dependencies: split_jobs # Dependencies: split_jobs
### ###
self.waitForTasks()
ocropusnlbin_jobs = [] ocropusnlbin_jobs = []
ocropusnlbin_job_number = 0 ocropusnlbin_job_number = 0
for job in self.jobs: for job in self.jobs:
ocropusnlbin_job_number += 1 # This list is empty if you don't wait for the split jobs to complete
cmd = 'ocropus-nlbin -o "%s" "%s"/*' % ( for file in os.listdir(os.path.join(job["output_dir"], "tmp", "tiff_files")):
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), ocropusnlbin_job_number += 1
os.path.join(job["output_dir"], "tmp", "tiff_files") cmd = 'ocropus-nlbin -o "%s" "%s"/*' % (
) os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs)) os.path.join(job["output_dir"], "tmp", "tiff_files", file)
)
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs))
### ###
# Task "tesseract_job": perform OCR on binarized images # Task "tesseract_job": perform OCR on binarized images