mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-26 18:24:17 +00:00
Escaped all filenames in cmd strings.
This commit is contained in:
parent
1d69abf717
commit
c616e13eea
31
ocr_pyflow
31
ocr_pyflow
@ -28,7 +28,6 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from pyflow import WorkflowRunner
|
from pyflow import WorkflowRunner
|
||||||
from shlex import shlex
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -110,12 +109,11 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
mkdir_job_number = 0
|
mkdir_job_number = 0
|
||||||
for job in self.jobs["images"] + self.jobs["pdf"]:
|
for job in self.jobs["images"] + self.jobs["pdf"]:
|
||||||
mkdir_job_number += 1
|
mkdir_job_number += 1
|
||||||
cmd = "mkdir -p %s %s %s %s" % (
|
cmd = "mkdir -p \"%s\" \"%s\" \"%s\" \"%s\"" % (
|
||||||
os.path.join(job["output_dir"], "hocr_files"),
|
os.path.join(job["output_dir"], "hocr_files"),
|
||||||
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
|
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
|
||||||
os.path.join(job["output_dir"], "tmp", "tesseract"),
|
os.path.join(job["output_dir"], "tmp", "tesseract"),
|
||||||
os.path.join(job["output_dir"], "tmp", "tiff_files"))
|
os.path.join(job["output_dir"], "tmp", "tiff_files"))
|
||||||
cmd = shlex().escape(cmd);
|
|
||||||
mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))
|
mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))
|
||||||
|
|
||||||
|
|
||||||
@ -134,20 +132,18 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-%sd.tif" % ("%")))
|
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-%sd.tif" % ("%")))
|
||||||
'''
|
'''
|
||||||
# WORKAROUND
|
# WORKAROUND
|
||||||
cmd = "tiff2pdf -o %s %s && pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox && rm %s" % (
|
cmd = "tiff2pdf -o \"%s\" \"%s\" && pdftoppm \"%s\" \"%s\" -tiff -r 300 -tiffcompression lzw -cropbox && rm \"%s\"" % (
|
||||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"),
|
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"),
|
||||||
job["path"],
|
job["path"],
|
||||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"),
|
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"),
|
||||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]),
|
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]),
|
||||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"))
|
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"))
|
||||||
cmd = shlex().escape(cmd);
|
|
||||||
split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))
|
split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))
|
||||||
for job in self.jobs["pdf"]:
|
for job in self.jobs["pdf"]:
|
||||||
split_job_number += 1
|
split_job_number += 1
|
||||||
cmd = "pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox" % (
|
cmd = "pdftoppm \"%s\" \"%s\" -tiff -r 300 -tiffcompression lzw -cropbox" % (
|
||||||
job["path"],
|
job["path"],
|
||||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]))
|
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]))
|
||||||
cmd = shlex().escape(cmd);
|
|
||||||
split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))
|
split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))
|
||||||
|
|
||||||
|
|
||||||
@ -159,10 +155,9 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
ocropusnlbin_job_number = 0
|
ocropusnlbin_job_number = 0
|
||||||
for job in self.jobs["images"] + self.jobs["pdf"]:
|
for job in self.jobs["images"] + self.jobs["pdf"]:
|
||||||
ocropusnlbin_job_number += 1
|
ocropusnlbin_job_number += 1
|
||||||
cmd = "ocropus-nlbin -o %s %s" % (
|
cmd = "ocropus-nlbin -o \"%s\" \"%s\"" % (
|
||||||
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
|
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
|
||||||
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-*.tif"))
|
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-*.tif"))
|
||||||
cmd = shlex().escape(cmd);
|
|
||||||
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs))
|
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs))
|
||||||
|
|
||||||
|
|
||||||
@ -177,12 +172,11 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
# This list is empty if you don't wait for ocropus_nlbin_jobs to complete
|
# This list is empty if you don't wait for ocropus_nlbin_jobs to complete
|
||||||
for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))):
|
for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))):
|
||||||
tesseract_job_number += 1
|
tesseract_job_number += 1
|
||||||
cmd = "tesseract %s %s -l %s hocr %s" % (
|
cmd = "tesseract \"%s\" \"%s\" -l %s hocr %s" % (
|
||||||
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file),
|
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file),
|
||||||
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]),
|
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]),
|
||||||
self.lang,
|
self.lang,
|
||||||
"pdf" if self.pdf else "")
|
"pdf" if self.pdf else "")
|
||||||
cmd = shlex().escape(cmd);
|
|
||||||
tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores)))
|
tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores)))
|
||||||
|
|
||||||
|
|
||||||
@ -195,10 +189,9 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
if self.pdf:
|
if self.pdf:
|
||||||
for job in self.jobs["images"] + self.jobs["pdf"]:
|
for job in self.jobs["images"] + self.jobs["pdf"]:
|
||||||
pdf_merge_job_number += 1
|
pdf_merge_job_number += 1
|
||||||
cmd = "pdftk %s cat output %s" % (
|
cmd = "pdftk \"%s\" cat output \"%s\"" % (
|
||||||
os.path.join(job["output_dir"], "tmp", "tesseract", "*.pdf"),
|
os.path.join(job["output_dir"], "tmp", "tesseract", "*.pdf"),
|
||||||
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
|
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
|
||||||
cmd = shlex().escape(cmd);
|
|
||||||
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs))
|
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs))
|
||||||
|
|
||||||
|
|
||||||
@ -211,9 +204,8 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
if self.pdf:
|
if self.pdf:
|
||||||
for job in self.jobs["images"] + self.jobs["pdf"]:
|
for job in self.jobs["images"] + self.jobs["pdf"]:
|
||||||
pdf_to_txt_job_number += 1
|
pdf_to_txt_job_number += 1
|
||||||
cmd = "pdftotext -raw %s" % (
|
cmd = "pdftotext -raw \"%s\"" % (
|
||||||
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
|
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
|
||||||
cmd = shlex().escape(cmd);
|
|
||||||
pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs))
|
pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs))
|
||||||
|
|
||||||
|
|
||||||
@ -225,10 +217,9 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
move_hocr_job_number = 0
|
move_hocr_job_number = 0
|
||||||
for job in self.jobs["images"] + self.jobs["pdf"]:
|
for job in self.jobs["images"] + self.jobs["pdf"]:
|
||||||
move_hocr_job_number += 1
|
move_hocr_job_number += 1
|
||||||
cmd = "mv %s %s" % (
|
cmd = "mv \"%s\" \"%s\"" % (
|
||||||
os.path.join(job["output_dir"], "tmp", "tesseract", "*.hocr"),
|
os.path.join(job["output_dir"], "tmp", "tesseract", "*.hocr"),
|
||||||
os.path.join(job["output_dir"], "hocr_files"))
|
os.path.join(job["output_dir"], "hocr_files"))
|
||||||
cmd = shlex().escape(cmd);
|
|
||||||
move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=tesseract_jobs))
|
move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=tesseract_jobs))
|
||||||
|
|
||||||
|
|
||||||
@ -240,10 +231,9 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
hocr_to_teip5_job_number = 0
|
hocr_to_teip5_job_number = 0
|
||||||
for job in self.jobs["images"] + self.jobs["pdf"]:
|
for job in self.jobs["images"] + self.jobs["pdf"]:
|
||||||
hocr_to_teip5_job_number += 1
|
hocr_to_teip5_job_number += 1
|
||||||
cmd = "parse_hocr %s %s" % (
|
cmd = "parse_hocr \"%s\" \"%s\"" % (
|
||||||
os.path.join(job["output_dir"], "hocr_files"),
|
os.path.join(job["output_dir"], "hocr_files"),
|
||||||
os.path.join(os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".xml")))
|
os.path.join(os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".xml")))
|
||||||
cmd = shlex().escape(cmd);
|
|
||||||
hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=move_hocr_jobs))
|
hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=move_hocr_jobs))
|
||||||
|
|
||||||
|
|
||||||
@ -257,8 +247,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
if not self.keepIntermediates:
|
if not self.keepIntermediates:
|
||||||
for job in self.jobs["images"] + self.jobs["pdf"]:
|
for job in self.jobs["images"] + self.jobs["pdf"]:
|
||||||
cleanup_job_counter += 1
|
cleanup_job_counter += 1
|
||||||
cmd = "rm -r %s" % (os.path.join(job["output_dir"], "tmp"))
|
cmd = "rm -r \"%s\"" % (os.path.join(job["output_dir"], "tmp"))
|
||||||
cmd = shlex().escape(cmd);
|
|
||||||
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd))
|
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd))
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user