From eefd8d53d015ce5a8a3c8bfbee0d79fef9fd4862 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Fri, 11 Jan 2019 15:54:15 +0100 Subject: [PATCH] shlexed all cmds! --- ocr_pyflow | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ocr_pyflow b/ocr_pyflow index f815c26..074b2ee 100755 --- a/ocr_pyflow +++ b/ocr_pyflow @@ -25,6 +25,7 @@ __status__ = "Development" import argparse import multiprocessing import os +import shlex import sys import unicodedata from pyflow import WorkflowRunner @@ -114,6 +115,7 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), os.path.join(job["output_dir"], "tmp", "tesseract"), os.path.join(job["output_dir"], "tmp", "tiff_files")) + cmd = shlex.escape(cmd); mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd)) @@ -138,12 +140,14 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"), os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]), os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf")) + cmd = shlex.escape(cmd); split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs)) for job in self.jobs["pdf"]: split_job_number += 1 cmd = "pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox" % ( job["path"], os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0])) + cmd = shlex.escape(cmd); split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs)) @@ -158,6 +162,7 @@ class OCRWorkflow(WorkflowRunner): cmd = "ocropus-nlbin -o %s %s" % ( os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-*.tif")) + cmd = shlex.escape(cmd); ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs)) @@ -177,6 +182,7 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]), self.lang, "pdf" if self.pdf else "") + cmd = shlex.escape(cmd); tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores))) @@ -192,6 +198,7 @@ class OCRWorkflow(WorkflowRunner): cmd = "pdftk %s cat output %s" % ( os.path.join(job["output_dir"], "tmp", "tesseract", "*.pdf"), os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf"))) + cmd = shlex.escape(cmd); pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs)) @@ -206,6 +213,7 @@ class OCRWorkflow(WorkflowRunner): pdf_to_txt_job_number += 1 cmd = "pdftotext -raw %s" % ( os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf"))) + cmd = shlex.escape(cmd); pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs)) @@ -220,6 +228,7 @@ class OCRWorkflow(WorkflowRunner): cmd = "mv %s %s" % ( os.path.join(job["output_dir"], "tmp", "tesseract", "*.hocr"), os.path.join(job["output_dir"], "hocr_files")) + cmd = shlex.escape(cmd); move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=tesseract_jobs)) @@ -234,6 +243,7 @@ class OCRWorkflow(WorkflowRunner): cmd = "parse_hocr %s %s" % ( os.path.join(job["output_dir"], "hocr_files"), os.path.join(os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".xml"))) + cmd = shlex.escape(cmd); hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=move_hocr_jobs)) @@ -248,6 +258,7 @@ class OCRWorkflow(WorkflowRunner): for job in self.jobs["images"] + self.jobs["pdf"]: cleanup_job_counter += 1 cmd = "rm -r %s" % (os.path.join(job["output_dir"], "tmp")) + cmd = shlex.escape(cmd); cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd))