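Changed some quotation marks ("ticks"): the shell command strings now use single-quoted Python literals, and the convert and tesseract arguments are additionally wrapped in escaped double quotes.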

This commit is contained in:
Stephan Porada 2019-01-15 10:33:04 +01:00
parent dc8db6c0b1
commit dcea6c8f97

View File

@@ -109,7 +109,7 @@ class OCRWorkflow(WorkflowRunner):
mkdir_job_number = 0 mkdir_job_number = 0
for job in self.jobs["images"] + self.jobs["pdf"]: for job in self.jobs["images"] + self.jobs["pdf"]:
mkdir_job_number += 1 mkdir_job_number += 1
cmd = "mkdir -p \"%s\" \"%s\" \"%s\" \"%s\"" % ( cmd = 'mkdir -p \"%s\" \"%s\" \"%s\" \"%s\"' % (
os.path.join(job["output_dir"], "hocr_files"), os.path.join(job["output_dir"], "hocr_files"),
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
os.path.join(job["output_dir"], "tmp", "tesseract"), os.path.join(job["output_dir"], "tmp", "tesseract"),
@@ -127,12 +127,12 @@ class OCRWorkflow(WorkflowRunner):
split_job_number += 1 split_job_number += 1
# TODO: Make the following command work # TODO: Make the following command work
''' '''
cmd = "convert %s %s" % ( cmd = 'convert \"%s\" \"%s\"' % (
job["path"], job["path"],
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-%sd.tif" % ("%"))) os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-%sd.tif" % ("%")))
''' '''
# WORKAROUND # WORKAROUND
cmd = "tiff2pdf -o \"%s\" \"%s\" && pdftoppm \"%s\" \"%s\" -tiff -r 300 -tiffcompression lzw -cropbox && rm \"%s\"" % ( cmd = 'tiff2pdf -o \"%s\" \"%s\" && pdftoppm \"%s\" \"%s\" -tiff -r 300 -tiffcompression lzw -cropbox && rm \"%s\"' % (
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"), os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"),
job["path"], job["path"],
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"), os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"),
@@ -141,7 +141,7 @@ class OCRWorkflow(WorkflowRunner):
split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs)) split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))
for job in self.jobs["pdf"]: for job in self.jobs["pdf"]:
split_job_number += 1 split_job_number += 1
cmd = "pdftoppm \"%s\" \"%s\" -tiff -r 300 -tiffcompression lzw -cropbox" % ( cmd = 'pdftoppm \"%s\" \"%s\" -tiff -r 300 -tiffcompression lzw -cropbox' % (
job["path"], job["path"],
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0])) os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]))
split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs)) split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))
@@ -155,7 +155,7 @@ class OCRWorkflow(WorkflowRunner):
ocropusnlbin_job_number = 0 ocropusnlbin_job_number = 0
for job in self.jobs["images"] + self.jobs["pdf"]: for job in self.jobs["images"] + self.jobs["pdf"]:
ocropusnlbin_job_number += 1 ocropusnlbin_job_number += 1
cmd = "ocropus-nlbin -o \"%s\" \"%s\"" % ( cmd = 'ocropus-nlbin -o \"%s\" \"%s\"' % (
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-*.tif")) os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-*.tif"))
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs)) ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs))
@@ -172,7 +172,7 @@ class OCRWorkflow(WorkflowRunner):
# This list is empty if you don't wait for ocropus_nlbin_jobs to complete # This list is empty if you don't wait for ocropus_nlbin_jobs to complete
for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))): for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))):
tesseract_job_number += 1 tesseract_job_number += 1
cmd = "tesseract \"%s\" \"%s\" -l %s hocr %s" % ( cmd = 'tesseract \"%s\" \"%s\" -l \"%s\" hocr \"%s\"' % (
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file), os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file),
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]), os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]),
self.lang, self.lang,
@@ -189,7 +189,7 @@ class OCRWorkflow(WorkflowRunner):
if self.pdf: if self.pdf:
for job in self.jobs["images"] + self.jobs["pdf"]: for job in self.jobs["images"] + self.jobs["pdf"]:
pdf_merge_job_number += 1 pdf_merge_job_number += 1
cmd = "pdftk \"%s\" cat output \"%s\"" % ( cmd = 'pdftk \"%s\" cat output \"%s\"' % (
os.path.join(job["output_dir"], "tmp", "tesseract", "*.pdf"), os.path.join(job["output_dir"], "tmp", "tesseract", "*.pdf"),
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf"))) os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs)) pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs))
@@ -204,7 +204,7 @@ class OCRWorkflow(WorkflowRunner):
if self.pdf: if self.pdf:
for job in self.jobs["images"] + self.jobs["pdf"]: for job in self.jobs["images"] + self.jobs["pdf"]:
pdf_to_txt_job_number += 1 pdf_to_txt_job_number += 1
cmd = "pdftotext -raw \"%s\"" % ( cmd = 'pdftotext -raw \"%s\"' % (
os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf"))) os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf")))
pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs)) pdf_merge_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs))
@@ -217,7 +217,7 @@ class OCRWorkflow(WorkflowRunner):
move_hocr_job_number = 0 move_hocr_job_number = 0
for job in self.jobs["images"] + self.jobs["pdf"]: for job in self.jobs["images"] + self.jobs["pdf"]:
move_hocr_job_number += 1 move_hocr_job_number += 1
cmd = "mv \"%s\" \"%s\"" % ( cmd = 'mv \"%s\" \"%s\"' % (
os.path.join(job["output_dir"], "tmp", "tesseract", "*.hocr"), os.path.join(job["output_dir"], "tmp", "tesseract", "*.hocr"),
os.path.join(job["output_dir"], "hocr_files")) os.path.join(job["output_dir"], "hocr_files"))
move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=tesseract_jobs)) move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=tesseract_jobs))
@@ -231,7 +231,7 @@ class OCRWorkflow(WorkflowRunner):
hocr_to_teip5_job_number = 0 hocr_to_teip5_job_number = 0
for job in self.jobs["images"] + self.jobs["pdf"]: for job in self.jobs["images"] + self.jobs["pdf"]:
hocr_to_teip5_job_number += 1 hocr_to_teip5_job_number += 1
cmd = "parse_hocr \"%s\" \"%s\"" % ( cmd = 'parse_hocr \"%s\" \"%s\"' % (
os.path.join(job["output_dir"], "hocr_files"), os.path.join(job["output_dir"], "hocr_files"),
os.path.join(os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".xml"))) os.path.join(os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".xml")))
hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=move_hocr_jobs)) hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=move_hocr_jobs))
@@ -247,7 +247,7 @@ class OCRWorkflow(WorkflowRunner):
if not self.keepIntermediates: if not self.keepIntermediates:
for job in self.jobs["images"] + self.jobs["pdf"]: for job in self.jobs["images"] + self.jobs["pdf"]:
cleanup_job_counter += 1 cleanup_job_counter += 1
cmd = "rm -r \"%s\"" % (os.path.join(job["output_dir"], "tmp")) cmd = 'rm -r \"%s\"' % (os.path.join(job["output_dir"], "tmp"))
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd)) cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd))