diff --git a/ocr b/ocr
index 9f25402..bc96101 100755
--- a/ocr
+++ b/ocr
@@ -84,7 +84,15 @@ class OCRWorkflow(WorkflowRunner):
         create_output_directories_job_number = 0
         for job in self.jobs:
             create_output_directories_job_number += 1
-            cmd = 'mkdir -p "%s"' % (os.path.join(job["output_dir"], "tmp"))
+            # One quoted placeholder per directory: a single "%s" with six arguments raises TypeError.
+            cmd = 'mkdir -p "%s" "%s" "%s" "%s" "%s" "%s"' % (
+                os.path.join(job["output_dir"], "tmp", "binarized"),
+                os.path.join(job["output_dir"], "tmp", "hocr"),
+                os.path.join(job["output_dir"], "tmp", "normalized"),
+                os.path.join(job["output_dir"], "tmp", "pdf"),
+                os.path.join(job["output_dir"], "tmp", "tiff"),
+                os.path.join(job["output_dir"], "tmp", "txt")
+            )
             create_output_directories_jobs.append(self.addTask(label="create_output_directories_job_-_%i" % (create_output_directories_job_number), command=cmd))
 
         ###
@@ -226,7 +234,26 @@ class OCRWorkflow(WorkflowRunner):
         ###
         cleanup_jobs = []
         cleanup_job_counter = 0
-        if not self.keepIntermediates:
+        if self.keepIntermediates:
+            # Sort kept intermediates into per-type subdirectories of tmp. NOTE(review): an unmatched glob makes mv fail and aborts the && chain - confirm every file type is always produced.
+            for job in self.jobs:
+                cleanup_job_counter += 1
+                cmd = 'mv "%s"/*.bin.png "%s" && mv "%s"/*.hocr "%s" && mv "%s"/*.nrm.png "%s" && mv "%s"/*.pdf "%s" && mv "%s"/*.tif "%s" && mv "%s"/*.txt "%s"' % (
+                    os.path.join(job["output_dir"], "tmp"),
+                    os.path.join(job["output_dir"], "tmp", "binarized"),
+                    os.path.join(job["output_dir"], "tmp"),
+                    os.path.join(job["output_dir"], "tmp", "hocr"),
+                    os.path.join(job["output_dir"], "tmp"),
+                    os.path.join(job["output_dir"], "tmp", "normalized"),
+                    os.path.join(job["output_dir"], "tmp"),
+                    os.path.join(job["output_dir"], "tmp", "pdf"),
+                    os.path.join(job["output_dir"], "tmp"),
+                    os.path.join(job["output_dir"], "tmp", "tiff"),
+                    os.path.join(job["output_dir"], "tmp"),
+                    os.path.join(job["output_dir"], "tmp", "txt")
+                )
+                cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs))
+        else:
             for job in self.jobs:
                 cleanup_job_counter += 1
                 cmd = 'rm -r "%s"' % (