mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-26 18:24:17 +00:00
Add skip binarization
This commit is contained in:
parent
0a25afbd51
commit
ac9b25271f
37
ocr
37
ocr
@ -42,6 +42,12 @@ def parse_arguments():
|
||||
dest="outputDir",
|
||||
help="Output directory.",
|
||||
required=True)
|
||||
parser.add_argument("--skip-binarization",
|
||||
action='store_true',
|
||||
default=False,
|
||||
dest="skipBinarization",
|
||||
help="Skip binarization.",
|
||||
required=False)
|
||||
parser.add_argument("--keep-intermediates",
|
||||
action='store_true',
|
||||
default=False,
|
||||
@ -58,8 +64,9 @@ def parse_arguments():
|
||||
|
||||
|
||||
class OCRWorkflow(WorkflowRunner):
|
||||
def __init__(self, jobs, keepIntermediates, lang, nCores):
|
||||
def __init__(self, jobs, skipBinarization, keepIntermediates, lang, nCores):
|
||||
self.jobs = jobs
|
||||
self.skipBinarization = skipBinarization
|
||||
self.keepIntermediates = keepIntermediates
|
||||
self.lang = lang
|
||||
self.nCores = nCores
|
||||
@ -116,17 +123,18 @@ class OCRWorkflow(WorkflowRunner):
|
||||
# Task "ocropus_nlbin_job": binarize tiff files from previous split
|
||||
# Dependencies: split_jobs
|
||||
###
|
||||
self.waitForTasks()
|
||||
ocropusnlbin_jobs = []
|
||||
ocropusnlbin_job_number = 0
|
||||
for job in self.jobs:
|
||||
ocropusnlbin_job_number += 1
|
||||
cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % (
|
||||
max(1, int(self.nCores / len(self.jobs))),
|
||||
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
|
||||
os.path.join(job["output_dir"], "tmp", "tiff_files")
|
||||
)
|
||||
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs)))))
|
||||
if (not self.skipBinarization):
|
||||
self.waitForTasks()
|
||||
ocropusnlbin_job_number = 0
|
||||
for job in self.jobs:
|
||||
ocropusnlbin_job_number += 1
|
||||
cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % (
|
||||
max(1, int(self.nCores / len(self.jobs))),
|
||||
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
|
||||
os.path.join(job["output_dir"], "tmp", "tiff_files")
|
||||
)
|
||||
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs)))))
|
||||
|
||||
###
|
||||
# Task "tesseract_job": perform OCR on binarized images
|
||||
@ -137,11 +145,11 @@ class OCRWorkflow(WorkflowRunner):
|
||||
tesseract_job_number = 0
|
||||
for job in self.jobs:
|
||||
# This list is empty if you don't wait for ocropus_nlbin_jobs to complete
|
||||
for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))):
|
||||
for file in filter(lambda x: self.skipBinarization or x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin")):
|
||||
tesseract_job_number += 1
|
||||
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
|
||||
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file),
|
||||
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]),
|
||||
os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin", file),
|
||||
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 1 if self.skipBinarization else 2)[0]),
|
||||
self.lang
|
||||
)
|
||||
tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores)))
|
||||
@ -240,6 +248,7 @@ def main():
|
||||
|
||||
wflow = OCRWorkflow(
|
||||
analyze_jobs(args.inputDir, args.outputDir),
|
||||
args.skipBinarization,
|
||||
args.keepIntermediates,
|
||||
args.lang,
|
||||
args.nCores
|
||||
|
Loading…
Reference in New Issue
Block a user