mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-07-01 11:50:34 +00:00
Change tif split handling, sort files before merging
This commit is contained in:
48
ocr
48
ocr
@ -105,23 +105,13 @@ class OCRWorkflow(WorkflowRunner):
|
||||
split_job_number = 0
|
||||
for job in self.jobs:
|
||||
split_job_number += 1
|
||||
if job["basename"].endswith(".tif") or job["basename"].endswith(".tiff"):
|
||||
# TODO: Make the following command work
|
||||
'''
|
||||
cmd = 'convert "%s" "%s"' % (
|
||||
if job["filename"].endswith((".tif", ".tiff")):
|
||||
cmd = 'convert "%s" -compress LZW -density 300 -scene 1 "%s"/page-%%d.tif' % (
|
||||
job["path"],
|
||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + "-%sd.tif" % ("%")))
|
||||
'''
|
||||
# WORKAROUND
|
||||
cmd = 'tiff2pdf -o "%s" "%s" && pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox && rm "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf"),
|
||||
job["path"],
|
||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf"),
|
||||
os.path.join(job["output_dir"], "tmp", "page"),
|
||||
os.path.join(job["output_dir"], "tmp", job["basename"].rsplit(".", 1)[0] + ".pdf")
|
||||
os.path.join(job["output_dir"], "tmp")
|
||||
)
|
||||
else:
|
||||
cmd = 'pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox' % (
|
||||
cmd = 'pdftoppm -r 300 -tiff -tiffcompression lzw "%s" "%s"' % (
|
||||
job["path"],
|
||||
os.path.join(job["output_dir"], "tmp", "page")
|
||||
)
|
||||
@ -141,9 +131,9 @@ class OCRWorkflow(WorkflowRunner):
|
||||
if not self.skipBinarization:
|
||||
for job in self.jobs:
|
||||
binarization_job_number += 1
|
||||
cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*.tif' % (
|
||||
binarization_job_nCores,
|
||||
cmd = 'ocropus-nlbin --output "%s" --parallel "%i" $(ls "%s"/*.tif | sort -V)' % (
|
||||
os.path.join(job["output_dir"], "tmp"),
|
||||
binarization_job_nCores,
|
||||
os.path.join(job["output_dir"], "tmp")
|
||||
)
|
||||
binarization_jobs.append(self.addTask(label="binarization_job_-_%i" % (binarization_job_number), command=cmd, dependencies=split_jobs, nCores=binarization_job_nCores))
|
||||
@ -157,7 +147,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
post_binarization_job_number = 0
|
||||
if not self.skipBinarization:
|
||||
for job in self.jobs:
|
||||
for file in filter(lambda x: x.endswith(".bin.png") or x.endswith(".nrm.png"), os.listdir(os.path.join(job["output_dir"], "tmp"))):
|
||||
for file in filter(lambda x: x.endswith((".bin.png", ".nrm.png")), os.listdir(os.path.join(job["output_dir"], "tmp"))):
|
||||
post_binarization_job_number += 1
|
||||
cmd = 'mv "%s" "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp", file),
|
||||
@ -195,7 +185,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
ocr_jobs.append(self.addTask(label="ocr_job_-_%i" % (ocr_job_number), command=cmd, dependencies=post_binarization_jobs, nCores=ocr_job_nCores))
|
||||
|
||||
###
|
||||
# Task "hocr_to_teip_job": create TEI P5 file from hocr files
|
||||
# Task "hocr_to_tei_job": create TEI P5 file from hocr files
|
||||
# Dependencies: ocr_jobs
|
||||
###
|
||||
hocr_to_tei_jobs = []
|
||||
@ -204,7 +194,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
hocr_to_tei_job_number += 1
|
||||
cmd = 'hocrtotei "%s" "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp"),
|
||||
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml")
|
||||
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".xml")
|
||||
)
|
||||
hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=ocr_jobs))
|
||||
|
||||
@ -216,9 +206,9 @@ class OCRWorkflow(WorkflowRunner):
|
||||
pdf_merge_job_number = 0
|
||||
for job in self.jobs:
|
||||
pdf_merge_job_number += 1
|
||||
cmd = 'pdftk "%s"/*.pdf cat output "%s"' % (
|
||||
cmd = 'pdftk $(ls "%s"/*.pdf | sort -V) cat output "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp"),
|
||||
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".pdf")
|
||||
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".pdf")
|
||||
)
|
||||
pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=ocr_jobs))
|
||||
|
||||
@ -230,9 +220,9 @@ class OCRWorkflow(WorkflowRunner):
|
||||
txt_merge_job_number = 0
|
||||
for job in self.jobs:
|
||||
txt_merge_job_number += 1
|
||||
cmd = 'cat "%s"/*.txt > "%s"' % (
|
||||
cmd = 'cat $(ls "%s"/*.txt | sort -V) > "%s"' % (
|
||||
os.path.join(job["output_dir"], "tmp"),
|
||||
os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".txt")
|
||||
os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".txt")
|
||||
)
|
||||
txt_merge_jobs.append(self.addTask(label="txt_merge_job_-_%i" % (txt_merge_job_number), command=cmd, dependencies=ocr_jobs))
|
||||
|
||||
@ -272,21 +262,17 @@ class OCRWorkflow(WorkflowRunner):
|
||||
cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs))
|
||||
|
||||
|
||||
def analyze_jobs(inputDir, outputDir, level=1):
|
||||
def analyze_jobs(inputDir, outputDir):
|
||||
jobs = []
|
||||
|
||||
if level > 2:
|
||||
return jobs
|
||||
|
||||
for file in os.listdir(inputDir):
|
||||
if os.path.isdir(os.path.join(inputDir, file)):
|
||||
jobs += analyze_jobs(
|
||||
os.path.join(inputDir, file),
|
||||
os.path.join(outputDir, file),
|
||||
level + 1
|
||||
os.path.join(outputDir, file)
|
||||
)
|
||||
elif file.endswith(".pdf") or file.endswith(".tif") or file.endswith(".tiff"):
|
||||
jobs.append({"basename": os.path.basename(file), "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0]), "path": os.path.join(inputDir, file)})
|
||||
elif file.endswith((".pdf", ".tif", ".tiff")):
|
||||
jobs.append({"filename": file, "output_dir": os.path.join(outputDir, file), "path": os.path.join(inputDir, file)})
|
||||
|
||||
return jobs
|
||||
|
||||
|
Reference in New Issue
Block a user