mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-13 16:40:34 +00:00
Fixed shlex to shlex()
This commit is contained in:
parent
ffcbf3e6b0
commit
1d69abf717
20
ocr_pyflow
20
ocr_pyflow
@ -115,7 +115,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
|
||||
os.path.join(job["output_dir"], "tmp", "tesseract"),
|
||||
os.path.join(job["output_dir"], "tmp", "tiff_files"))
|
||||
cmd = shlex.escape(cmd);
|
||||
cmd = shlex().escape(cmd);
|
||||
mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))
|
||||
|
||||
|
||||
@ -140,14 +140,14 @@ class OCRWorkflow(WorkflowRunner):
|
||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"),
|
||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]),
|
||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"))
|
||||
cmd = shlex.escape(cmd);
|
||||
cmd = shlex().escape(cmd);
|
||||
split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))
|
||||
for job in self.jobs["pdf"]:
|
||||
split_job_number += 1
|
||||
cmd = "pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox" % (
|
||||
job["path"],
|
||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]))
|
||||
cmd = shlex.escape(cmd);
|
||||
cmd = shlex().escape(cmd);
|
||||
split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))
|
||||
|
||||
|
||||
@ -162,7 +162,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
cmd = "ocropus-nlbin -o %s %s" % (
|
||||
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
|
||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-*.tif"))
|
||||
cmd = shlex.escape(cmd);
|
||||
cmd = shlex().escape(cmd);
|
||||
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs))
|
||||
|
||||
|
||||
@ -182,7 +182,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]),
|
||||
self.lang,
|
||||
"pdf" if self.pdf else "")
|
||||
cmd = shlex.escape(cmd);
|
||||
cmd = shlex().escape(cmd);
|
||||
tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores)))
|
||||
|
||||
|
||||
@ -198,7 +198,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
cmd = "pdftk %s cat output %s" % (
|
||||
os.path.join(job["output_dir"], "tmp", "tesseract", "*.pdf"),
|
||||
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
|
||||
cmd = shlex.escape(cmd);
|
||||
cmd = shlex().escape(cmd);
|
||||
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs))
|
||||
|
||||
|
||||
@ -213,7 +213,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
pdf_to_txt_job_number += 1
|
||||
cmd = "pdftotext -raw %s" % (
|
||||
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
|
||||
cmd = shlex.escape(cmd);
|
||||
cmd = shlex().escape(cmd);
|
||||
pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs))
|
||||
|
||||
|
||||
@ -228,7 +228,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
cmd = "mv %s %s" % (
|
||||
os.path.join(job["output_dir"], "tmp", "tesseract", "*.hocr"),
|
||||
os.path.join(job["output_dir"], "hocr_files"))
|
||||
cmd = shlex.escape(cmd);
|
||||
cmd = shlex().escape(cmd);
|
||||
move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=tesseract_jobs))
|
||||
|
||||
|
||||
@ -243,7 +243,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
cmd = "parse_hocr %s %s" % (
|
||||
os.path.join(job["output_dir"], "hocr_files"),
|
||||
os.path.join(os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".xml")))
|
||||
cmd = shlex.escape(cmd);
|
||||
cmd = shlex().escape(cmd);
|
||||
hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=move_hocr_jobs))
|
||||
|
||||
|
||||
@ -258,7 +258,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
for job in self.jobs["images"] + self.jobs["pdf"]:
|
||||
cleanup_job_counter += 1
|
||||
cmd = "rm -r %s" % (os.path.join(job["output_dir"], "tmp"))
|
||||
cmd = shlex.escape(cmd);
|
||||
cmd = shlex().escape(cmd);
|
||||
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd))
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user