mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-31 04:19:02 +00:00
Added raw text output.
This commit is contained in:
parent
ae7bd0c51e
commit
923dbe2179
16
ocr_pyflow
16
ocr_pyflow
@ -249,6 +249,20 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs, nCores=1, memMb=4096))
|
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs, nCores=1, memMb=4096))
|
||||||
|
|
||||||
|
|
||||||
|
###
|
||||||
|
# Task "pdf_to_txt_jobs":
|
||||||
|
# Dependencies: pdf_merge_jobs
|
||||||
|
###
|
||||||
|
pdf_to_txt_jobs = []
|
||||||
|
pdf_to_txt_job_number = 0
|
||||||
|
if self.pdf:
|
||||||
|
for job in self.pdfImageJobs["images"] + self.pdfImageJobs["pdf"]:
|
||||||
|
pdf_to_txt_job_number += 1
|
||||||
|
cmd = "pdftotext -raw %s" % (
|
||||||
|
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
|
||||||
|
pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs, nCores=1, memMb=4096))
|
||||||
|
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "move_hocr_job": move hocr files from <output_dir>/tmp/tesseract to <output_dir>/hocr_files
|
# Task "move_hocr_job": move hocr files from <output_dir>/tmp/tesseract to <output_dir>/hocr_files
|
||||||
# Dependencies: tesseract_jobs
|
# Dependencies: tesseract_jobs
|
||||||
@ -340,4 +354,4 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user