Added raw text output.

This commit is contained in:
Stephan Porada 2018-11-28 16:49:55 +01:00
parent ae7bd0c51e
commit 923dbe2179

View File

@@ -249,6 +249,20 @@ class OCRWorkflow(WorkflowRunner):
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs, nCores=1, memMb=4096)) pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs, nCores=1, memMb=4096))
###
# Task "pdf_to_txt_jobs": extract raw text from each job's output PDF with "pdftotext -raw"
# Dependencies: pdf_merge_jobs
###
pdf_to_txt_jobs = []
pdf_to_txt_job_number = 0
if self.pdf:
for job in self.pdfImageJobs["images"] + self.pdfImageJobs["pdf"]:
pdf_to_txt_job_number += 1
cmd = "pdftotext -raw %s" % (
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs, nCores=1, memMb=4096))
### ###
# Task "move_hocr_job": move hocr files from <output_dir>/tmp/tesseract to <output_dir>/hocr_files # Task "move_hocr_job": move hocr files from <output_dir>/tmp/tesseract to <output_dir>/hocr_files
# Dependencies: tesseract_jobs # Dependencies: tesseract_jobs
@@ -340,4 +354,4 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
main() main()