diff --git a/ocr_pyflow b/ocr_pyflow index 00e8a6e..9524267 100755 --- a/ocr_pyflow +++ b/ocr_pyflow @@ -28,7 +28,6 @@ import os import sys import unicodedata from pyflow import WorkflowRunner -from shlex import shlex @@ -110,12 +109,11 @@ class OCRWorkflow(WorkflowRunner): mkdir_job_number = 0 for job in self.jobs["images"] + self.jobs["pdf"]: mkdir_job_number += 1 - cmd = "mkdir -p %s %s %s %s" % ( + cmd = "mkdir -p \"%s\" \"%s\" \"%s\" \"%s\"" % ( os.path.join(job["output_dir"], "hocr_files"), os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), os.path.join(job["output_dir"], "tmp", "tesseract"), os.path.join(job["output_dir"], "tmp", "tiff_files")) - cmd = shlex().escape(cmd); mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd)) @@ -134,20 +132,18 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-%sd.tif" % ("%"))) ''' # WORKAROUND - cmd = "tiff2pdf -o %s %s && pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox && rm %s" % ( + cmd = "tiff2pdf -o \"%s\" \"%s\" && pdftoppm \"%s\" \"%s\" -tiff -r 300 -tiffcompression lzw -cropbox && rm \"%s\"" % ( os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"), job["path"], os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"), os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]), os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf")) - cmd = shlex().escape(cmd); split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs)) for job in self.jobs["pdf"]: split_job_number += 1 - cmd = "pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox" % ( + cmd = "pdftoppm \"%s\" \"%s\" -tiff -r 300 -tiffcompression lzw -cropbox" % ( job["path"], os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0])) - cmd = shlex().escape(cmd); split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs)) @@ -159,10 +155,9 @@ class OCRWorkflow(WorkflowRunner): ocropusnlbin_job_number = 0 for job in self.jobs["images"] + self.jobs["pdf"]: ocropusnlbin_job_number += 1 - cmd = "ocropus-nlbin -o %s %s" % ( + cmd = "ocropus-nlbin -o \"%s\" \"%s\"" % ( os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-*.tif")) - cmd = shlex().escape(cmd); ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs)) @@ -177,12 +172,11 @@ class OCRWorkflow(WorkflowRunner): # This list is empty if you don't wait for ocropus_nlbin_jobs to complete for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))): tesseract_job_number += 1 - cmd = "tesseract %s %s -l %s hocr %s" % ( + cmd = "tesseract \"%s\" \"%s\" -l %s hocr %s" % ( os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file), os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]), self.lang, "pdf" if self.pdf else "") - cmd = shlex().escape(cmd); tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores))) @@ -195,10 +189,9 @@ class OCRWorkflow(WorkflowRunner): if self.pdf: for job in self.jobs["images"] + self.jobs["pdf"]: pdf_merge_job_number += 1 - cmd = "pdftk %s cat output %s" % ( + cmd = "pdftk \"%s\" cat output \"%s\"" % ( os.path.join(job["output_dir"], "tmp", "tesseract", "*.pdf"), os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf"))) - cmd = shlex().escape(cmd); pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs)) @@ -211,9 +204,8 @@ class OCRWorkflow(WorkflowRunner): if self.pdf: for job in self.jobs["images"] + self.jobs["pdf"]: pdf_to_txt_job_number += 1 - cmd = "pdftotext -raw %s" % ( + cmd = "pdftotext -raw \"%s\"" % ( os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf"))) - cmd = shlex().escape(cmd); pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs)) @@ -225,10 +217,9 @@ class OCRWorkflow(WorkflowRunner): move_hocr_job_number = 0 for job in self.jobs["images"] + self.jobs["pdf"]: move_hocr_job_number += 1 - cmd = "mv %s %s" % ( + cmd = "mv \"%s\" \"%s\"" % ( os.path.join(job["output_dir"], "tmp", "tesseract", "*.hocr"), os.path.join(job["output_dir"], "hocr_files")) - cmd = shlex().escape(cmd); move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=tesseract_jobs)) @@ -240,10 +231,9 @@ class OCRWorkflow(WorkflowRunner): hocr_to_teip5_job_number = 0 for job in self.jobs["images"] + self.jobs["pdf"]: hocr_to_teip5_job_number += 1 - cmd = "parse_hocr %s %s" % ( + cmd = "parse_hocr \"%s\" \"%s\"" % ( os.path.join(job["output_dir"], "hocr_files"), os.path.join(os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".xml"))) - cmd = shlex().escape(cmd); hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=move_hocr_jobs)) @@ -257,8 +247,7 @@ class OCRWorkflow(WorkflowRunner): if not self.keepIntermediates: for job in self.jobs["images"] + self.jobs["pdf"]: cleanup_job_counter += 1 - cmd = "rm -r %s" % (os.path.join(job["output_dir"], "tmp")) - cmd = shlex().escape(cmd); + cmd = "rm -r \"%s\"" % (os.path.join(job["output_dir"], "tmp")) cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd))