diff --git a/ocr_pyflow b/ocr_pyflow
index 3bb6bde..4ad3fc8 100755
--- a/ocr_pyflow
+++ b/ocr_pyflow
@@ -249,6 +249,20 @@ class OCRWorkflow(WorkflowRunner):
                 pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs, nCores=1, memMb=4096))
 
 
+        ###
+        # Task "pdf_to_txt_jobs": run "pdftotext -raw" on the pdf in each job's output_dir
+        # Dependencies: pdf_merge_jobs
+        ###
+        pdf_to_txt_jobs = []
+        pdf_to_txt_job_number = 0
+        if self.pdf:
+            for job in self.pdfImageJobs["images"] + self.pdfImageJobs["pdf"]:
+                pdf_to_txt_job_number += 1
+                cmd = "pdftotext -raw %s" % (
+                    os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
+                pdf_to_txt_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs, nCores=1, memMb=4096))
+
+
         ###
         # Task "move_hocr_job": move hocr files from /tmp/tesseract to /hocr_files
         # Dependencies: tesseract_jobs
@@ -340,4 +354,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()