mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-31 03:49:04 +00:00
Added raw text output.
This commit is contained in:
parent
ae7bd0c51e
commit
923dbe2179
14
ocr_pyflow
14
ocr_pyflow
@ -249,6 +249,20 @@ class OCRWorkflow(WorkflowRunner):
|
||||
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs, nCores=1, memMb=4096))
|
||||
|
||||
|
||||
###
|
||||
# Task "pdf_to_txt_jobs": run "pdftotext -raw" on each <output_dir>/<input basename>.pdf
|
||||
# Dependencies: pdf_merge_jobs
|
||||
###
|
||||
pdf_to_txt_jobs = []
|
||||
pdf_to_txt_job_number = 0
|
||||
if self.pdf:
|
||||
for job in self.pdfImageJobs["images"] + self.pdfImageJobs["pdf"]:
|
||||
pdf_to_txt_job_number += 1
|
||||
cmd = "pdftotext -raw %s" % (
|
||||
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
|
||||
# Record the task in pdf_to_txt_jobs, the list initialised for this section.
# Appending to pdf_merge_jobs instead would leave pdf_to_txt_jobs empty and
# would grow the very list that is being passed as `dependencies` while the
# loop is still adding tasks.
pdf_to_txt_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs, nCores=1, memMb=4096))
|
||||
|
||||
|
||||
###
|
||||
# Task "move_hocr_job": move hocr files from <output_dir>/tmp/tesseract to <output_dir>/hocr_files
|
||||
# Dependencies: tesseract_jobs
|
||||
|
Loading…
x
Reference in New Issue
Block a user