Shell-escape all commands with shlex

This commit is contained in:
Patrick Jentsch 2019-01-11 15:54:15 +01:00
parent 4995025e45
commit eefd8d53d0

View File

@ -25,6 +25,7 @@ __status__ = "Development"
import argparse import argparse
import multiprocessing import multiprocessing
import os import os
import shlex
import sys import sys
import unicodedata import unicodedata
from pyflow import WorkflowRunner from pyflow import WorkflowRunner
@ -114,6 +115,7 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
os.path.join(job["output_dir"], "tmp", "tesseract"), os.path.join(job["output_dir"], "tmp", "tesseract"),
os.path.join(job["output_dir"], "tmp", "tiff_files")) os.path.join(job["output_dir"], "tmp", "tiff_files"))
cmd = shlex.escape(cmd);
mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd)) mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))
@ -138,12 +140,14 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"), os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"),
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]), os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]),
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf")) os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"))
cmd = shlex.escape(cmd);
split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs)) split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))
for job in self.jobs["pdf"]: for job in self.jobs["pdf"]:
split_job_number += 1 split_job_number += 1
cmd = "pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox" % ( cmd = "pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox" % (
job["path"], job["path"],
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0])) os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]))
cmd = shlex.escape(cmd);
split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs)) split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))
@ -158,6 +162,7 @@ class OCRWorkflow(WorkflowRunner):
cmd = "ocropus-nlbin -o %s %s" % ( cmd = "ocropus-nlbin -o %s %s" % (
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-*.tif")) os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-*.tif"))
cmd = shlex.escape(cmd);
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs)) ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs))
@ -177,6 +182,7 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]), os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]),
self.lang, self.lang,
"pdf" if self.pdf else "") "pdf" if self.pdf else "")
cmd = shlex.escape(cmd);
tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores))) tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores)))
@ -192,6 +198,7 @@ class OCRWorkflow(WorkflowRunner):
cmd = "pdftk %s cat output %s" % ( cmd = "pdftk %s cat output %s" % (
os.path.join(job["output_dir"], "tmp", "tesseract", "*.pdf"), os.path.join(job["output_dir"], "tmp", "tesseract", "*.pdf"),
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf"))) os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
cmd = shlex.escape(cmd);
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs)) pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs))
@ -206,6 +213,7 @@ class OCRWorkflow(WorkflowRunner):
pdf_to_txt_job_number += 1 pdf_to_txt_job_number += 1
cmd = "pdftotext -raw %s" % ( cmd = "pdftotext -raw %s" % (
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf"))) os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
cmd = shlex.escape(cmd);
pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs)) pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs))
@ -220,6 +228,7 @@ class OCRWorkflow(WorkflowRunner):
cmd = "mv %s %s" % ( cmd = "mv %s %s" % (
os.path.join(job["output_dir"], "tmp", "tesseract", "*.hocr"), os.path.join(job["output_dir"], "tmp", "tesseract", "*.hocr"),
os.path.join(job["output_dir"], "hocr_files")) os.path.join(job["output_dir"], "hocr_files"))
cmd = shlex.escape(cmd);
move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=tesseract_jobs)) move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=tesseract_jobs))
@ -234,6 +243,7 @@ class OCRWorkflow(WorkflowRunner):
cmd = "parse_hocr %s %s" % ( cmd = "parse_hocr %s %s" % (
os.path.join(job["output_dir"], "hocr_files"), os.path.join(job["output_dir"], "hocr_files"),
os.path.join(os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".xml"))) os.path.join(os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".xml")))
cmd = shlex.escape(cmd);
hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=move_hocr_jobs)) hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=move_hocr_jobs))
@ -248,6 +258,7 @@ class OCRWorkflow(WorkflowRunner):
for job in self.jobs["images"] + self.jobs["pdf"]: for job in self.jobs["images"] + self.jobs["pdf"]:
cleanup_job_counter += 1 cleanup_job_counter += 1
cmd = "rm -r %s" % (os.path.join(job["output_dir"], "tmp")) cmd = "rm -r %s" % (os.path.join(job["output_dir"], "tmp"))
cmd = shlex.escape(cmd);
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd)) cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd))