This commit is contained in:
Patrick Jentsch 2019-04-15 10:25:57 +02:00
parent 5e11fcae01
commit 5e43e09beb

22
ocr
View File

@@ -84,14 +84,17 @@ class OCRWorkflow(WorkflowRunner):
create_output_directories_job_number = 0 create_output_directories_job_number = 0
for job in self.jobs: for job in self.jobs:
create_output_directories_job_number += 1 create_output_directories_job_number += 1
cmd = 'mkdir -p "%s" "%s" "%s" "%s" "%s" "%s"' % ( cmd = 'mkdir -p "%s" "%s" "%s" "%s"' % (
os.path.join(job["output_dir"], "tmp", "binarized_png"),
os.path.join(job["output_dir"], "tmp", "hocr"), os.path.join(job["output_dir"], "tmp", "hocr"),
os.path.join(job["output_dir"], "tmp", "normalized_png"),
os.path.join(job["output_dir"], "tmp", "pdf"), os.path.join(job["output_dir"], "tmp", "pdf"),
os.path.join(job["output_dir"], "tmp", "tiff"), os.path.join(job["output_dir"], "tmp", "tiff"),
os.path.join(job["output_dir"], "tmp", "txt") os.path.join(job["output_dir"], "tmp", "txt")
) )
if not self.skipBinarization:
cmd += ' "%s" "%s"' % (
os.path.join(job["output_dir"], "tmp", "binarized_png"),
os.path.join(job["output_dir"], "tmp", "normalized_png"),
)
create_output_directories_jobs.append(self.addTask(label="create_output_directories_job_-_%i" % (create_output_directories_job_number), command=cmd)) create_output_directories_jobs.append(self.addTask(label="create_output_directories_job_-_%i" % (create_output_directories_job_number), command=cmd))
### ###
@@ -237,20 +240,23 @@ class OCRWorkflow(WorkflowRunner):
if self.keepIntermediates: if self.keepIntermediates:
for job in self.jobs: for job in self.jobs:
cleanup_job_counter += 1 cleanup_job_counter += 1
cmd = 'mv "%s"/*.bin.png "%s" && mv "%s"/*.hocr "%s" && mv "%s"/*.nrm.png "%s" && mv "%s"/*.pdf "%s" && mv "%s"/*.tif "%s" && mv "%s"/*.txt "%s"' % ( cmd = 'mv "%s"/*.hocr "%s" && mv "%s"/*.pdf "%s" && mv "%s"/*.tif "%s" && mv "%s"/*.txt "%s"' % (
os.path.join(job["output_dir"], "tmp"),
os.path.join(job["output_dir"], "tmp", "binarized_png"),
os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], "tmp"),
os.path.join(job["output_dir"], "tmp", "hocr"), os.path.join(job["output_dir"], "tmp", "hocr"),
os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], "tmp"),
os.path.join(job["output_dir"], "tmp", "normalized_png"),
os.path.join(job["output_dir"], "tmp"),
os.path.join(job["output_dir"], "tmp", "pdf"), os.path.join(job["output_dir"], "tmp", "pdf"),
os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], "tmp"),
os.path.join(job["output_dir"], "tmp", "tiff"), os.path.join(job["output_dir"], "tmp", "tiff"),
os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], "tmp"),
os.path.join(job["output_dir"], "tmp", "txt") os.path.join(job["output_dir"], "tmp", "txt")
) )
if not self.skipBinarization:
cmd += ' && mv "%s"/*.bin.png "%s" && mv "%s"/*.nrm.png "%s"' % (
os.path.join(job["output_dir"], "tmp"),
os.path.join(job["output_dir"], "tmp", "binarized_png"),
os.path.join(job["output_dir"], "tmp"),
os.path.join(job["output_dir"], "tmp", "normalized_png"),
)
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs)) cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs))
else: else:
for job in self.jobs: for job in self.jobs: