mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-27 09:34:18 +00:00
Add skip binarization
This commit is contained in:
parent
0a25afbd51
commit
ac9b25271f
37
ocr
37
ocr
@ -42,6 +42,12 @@ def parse_arguments():
|
|||||||
dest="outputDir",
|
dest="outputDir",
|
||||||
help="Output directory.",
|
help="Output directory.",
|
||||||
required=True)
|
required=True)
|
||||||
|
parser.add_argument("--skip-binarization",
|
||||||
|
action='store_true',
|
||||||
|
default=False,
|
||||||
|
dest="skipBinarization",
|
||||||
|
help="Skip binarization.",
|
||||||
|
required=False)
|
||||||
parser.add_argument("--keep-intermediates",
|
parser.add_argument("--keep-intermediates",
|
||||||
action='store_true',
|
action='store_true',
|
||||||
default=False,
|
default=False,
|
||||||
@ -58,8 +64,9 @@ def parse_arguments():
|
|||||||
|
|
||||||
|
|
||||||
class OCRWorkflow(WorkflowRunner):
|
class OCRWorkflow(WorkflowRunner):
|
||||||
def __init__(self, jobs, keepIntermediates, lang, nCores):
|
def __init__(self, jobs, skipBinarization, keepIntermediates, lang, nCores):
|
||||||
self.jobs = jobs
|
self.jobs = jobs
|
||||||
|
self.skipBinarization = skipBinarization
|
||||||
self.keepIntermediates = keepIntermediates
|
self.keepIntermediates = keepIntermediates
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
self.nCores = nCores
|
self.nCores = nCores
|
||||||
@ -116,17 +123,18 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
# Task "ocropus_nlbin_job": binarize tiff files from previous split
|
# Task "ocropus_nlbin_job": binarize tiff files from previous split
|
||||||
# Dependencies: split_jobs
|
# Dependencies: split_jobs
|
||||||
###
|
###
|
||||||
self.waitForTasks()
|
|
||||||
ocropusnlbin_jobs = []
|
ocropusnlbin_jobs = []
|
||||||
ocropusnlbin_job_number = 0
|
if (not self.skipBinarization):
|
||||||
for job in self.jobs:
|
self.waitForTasks()
|
||||||
ocropusnlbin_job_number += 1
|
ocropusnlbin_job_number = 0
|
||||||
cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % (
|
for job in self.jobs:
|
||||||
max(1, int(self.nCores / len(self.jobs))),
|
ocropusnlbin_job_number += 1
|
||||||
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
|
cmd = 'ocropus-nlbin -Q "%i" -o "%s" "%s"/*' % (
|
||||||
os.path.join(job["output_dir"], "tmp", "tiff_files")
|
max(1, int(self.nCores / len(self.jobs))),
|
||||||
)
|
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
|
||||||
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs)))))
|
os.path.join(job["output_dir"], "tmp", "tiff_files")
|
||||||
|
)
|
||||||
|
ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=max(1, int(self.nCores / len(self.jobs)))))
|
||||||
|
|
||||||
###
|
###
|
||||||
# Task "tesseract_job": perform OCR on binarized images
|
# Task "tesseract_job": perform OCR on binarized images
|
||||||
@ -137,11 +145,11 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
tesseract_job_number = 0
|
tesseract_job_number = 0
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
# This list is empty if you don't wait for ocropus_nlbin_jobs to complete
|
# This list is empty if you don't wait for the jobs that fill the listed directory to complete (ocropus_nlbin_jobs, or the split jobs when binarization is skipped)
|
||||||
for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))):
|
for file in filter(lambda x: self.skipBinarization or x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin")):
|
||||||
tesseract_job_number += 1
|
tesseract_job_number += 1
|
||||||
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
|
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
|
||||||
os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file),
|
os.path.join(job["output_dir"], "tmp", "tiff_files" if self.skipBinarization else "ocropus-nlbin", file),
|
||||||
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]),
|
os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 1 if self.skipBinarization else 2)[0]),
|
||||||
self.lang
|
self.lang
|
||||||
)
|
)
|
||||||
tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores)))
|
tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores)))
|
||||||
@ -240,6 +248,7 @@ def main():
|
|||||||
|
|
||||||
wflow = OCRWorkflow(
|
wflow = OCRWorkflow(
|
||||||
analyze_jobs(args.inputDir, args.outputDir),
|
analyze_jobs(args.inputDir, args.outputDir),
|
||||||
|
args.skipBinarization,
|
||||||
args.keepIntermediates,
|
args.keepIntermediates,
|
||||||
args.lang,
|
args.lang,
|
||||||
args.nCores
|
args.nCores
|
||||||
|
Loading…
Reference in New Issue
Block a user