From 7322a5bc7c9c8393e9aa82459b2cc6db2da39fbb Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Thu, 2 Jul 2020 11:47:43 +0200 Subject: [PATCH] More GhostScript, less dependencies! --- Dockerfile | 1 - ocr | 86 +++++++++++++----------------------------------------- 2 files changed, 21 insertions(+), 66 deletions(-) diff --git a/Dockerfile b/Dockerfile index 66b86a0..bc4ae93 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,6 @@ RUN apt-get update \ ca-certificates \ gnupg2 \ ghostscript \ - poppler-utils \ python2.7 \ python3.7 \ wget \ diff --git a/ocr b/ocr index 020d200..26fc532 100755 --- a/ocr +++ b/ocr @@ -36,9 +36,6 @@ def parse_args(): parser.add_argument('--binarize', action='store_true', help='Use ocropy binarisation as preprocessing step.') - parser.add_argument('--compress', - action='store_true', - help='Compress the final PDF result file.') parser.add_argument('--log-dir') parser.add_argument('--n-cores', default=min(4, multiprocessing.cpu_count()), @@ -59,15 +56,13 @@ class OCRPipelineJob: class OCRPipeline(WorkflowRunner): - def __init__(self, binarize, jobs, lang, n_cores, output_dir, zip, - compress): + def __init__(self, binarize, jobs, lang, n_cores, output_dir, zip): self.binarize = binarize self.jobs = jobs self.lang = lang self.n_cores = n_cores self.output_dir = output_dir self.zip = zip - self.compress = compress def workflow(self): if not self.jobs: @@ -160,28 +155,20 @@ class OCRPipeline(WorkflowRunner): ''' ' ################################################## - ' # post binarization # + ' # Renaming of binarization output files # ' ################################################## ''' - post_binarization_jobs = [] for i, job in enumerate(self.jobs): input_dir = os.path.join(job.output_dir, 'tmp') output_dir = input_dir - number = 0 files = filter(lambda x: x.endswith('.bin.png'), os.listdir(input_dir)) - files.sort() for file in files: # int conversion is done in order to trim leading zeros - output_file = os.path.join(output_dir, 'page-{}.bin.png'.format(int(file.split('.', 1)[0]))) # noqa - cmd = 'mv "{}" "{}"'.format(os.path.join(output_dir, file), - output_file) - deps = 'binarization_-_{}'.format(i) - lbl = 'post_binarization_-_{}-{}'.format(i, number) - post_binarization_jobs.append( - self.addTask(command=cmd, dependencies=deps, label=lbl) - ) - number += 1 + page_number = int(file.split('.', 1)[0]) + output_file = 'page-{}.bin.png'.format(page_number) + os.rename(os.path.join(output_dir, file), + os.path.join(output_dir, output_file)) ''' ' The ocr_jobs are created based of the output files of either the @@ -217,7 +204,7 @@ class OCRPipeline(WorkflowRunner): cmd += ' && ' cmd += 'sed -i \'s+{}/++g\' "{}".hocr'.format(input_dir, output_file_base) # noqa if self.binarize: - deps = 'post_binarization_-_{}-{}'.format(i, number) + deps = 'binarization_-_{}'.format(i) else: deps = 'split_input_-_{}'.format(i) label = 'ocr_-_{}-{}'.format(i, number) @@ -241,13 +228,20 @@ class OCRPipeline(WorkflowRunner): combined_pdf_creation_jobs = [] for i, job in enumerate(self.jobs): input_dir = os.path.join(job.output_dir, 'tmp') + output_dir = job.output_dir files = filter(lambda x: x.endswith('.pdf'), os.listdir(input_dir)) files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) files = map(lambda x: os.path.join(input_dir, x), files) - output_file = os.path.join(job.output_dir, - '{}.pdf'.format(job.name)) - cmd = 'pdfunite "{}" "{}"'.format('" "'.join(files), output_file) + output_file = os.path.join(output_dir, '{}.pdf'.format(job.name)) + cmd = 'gs' + cmd += ' -dBATCH' + cmd += ' -dNOPAUSE' + cmd += ' -dPDFSETTINGS=/ebook' + cmd += ' -dQUIET' + cmd += ' -sDEVICE=pdfwrite' + cmd += ' "-sOutputFile={}"'.format(output_file) + cmd += ' "{}"'.format('" "'.join(files)) deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_jobs) lbl = 'combined_pdf_creation_-_{}'.format(i) @@ -321,38 +315,6 @@ class OCRPipeline(WorkflowRunner): ''' self.waitForTasks() - ''' - ' ################################################## - ' # pdf compression # - ' ################################################## - ''' - pdf_compression_jobs = [] - n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs)))) - if self.compress: - for i, job in enumerate(self.jobs): - file = filter(lambda x: x.endswith('.pdf'), os.listdir(job.output_dir))[0] # noqa - original_file = os.path.join(job.output_dir, file) - compressed_file = os.path.join(job.output_dir, 'c_' + file) - cmd = 'gs' - cmd += ' -dBATCH' - cmd += ' -dNOPAUSE' - cmd += ' -dNumRenderingThreads={}'.format(n_cores) - cmd += ' -dPDFSETTINGS=/ebook' - # -dCompatibilityLevel must be defined after -dPDFSETTINGS - cmd += ' -dCompatibilityLevel=1.4' - cmd += ' -dQUIET' - cmd += ' -sDEVICE=pdfwrite' - cmd += ' "-sOutputFile={}"'.format(compressed_file) - cmd += ' "{}"'.format(original_file) - cmd += ' && ' - cmd += 'mv "{}" "{}"'.format(compressed_file, original_file) - deps = 'combined_pdf_creation_-_{}'.format(i) - lbl = 'pdf_compression_-_{}'.format(i) - pdf_compression_jobs.append(self.addTask(command=cmd, - dependencies=deps, - label=lbl, - nCores=n_cores)) - ''' ' ################################################## ' # cleanup # @@ -362,10 +324,7 @@ class OCRPipeline(WorkflowRunner): for i, job in enumerate(self.jobs): input_dir = os.path.join(job.output_dir, 'tmp') cmd = 'rm -r "{}"'.format(input_dir) - if self.compress: - deps = ['pdf_compression_-_{}'.format(i)] - else: - deps = ['combined_pdf_creation_-_{}'.format(i)] + deps = ['combined_pdf_creation_-_{}'.format(i)] deps.append('combined_txt_creation_-_{}'.format(i)) deps.append('poco_bundle_creation_-_{}'.format(i)) deps.append('tei_p5_creation_-_{}'.format(i)) @@ -395,8 +354,7 @@ class OCRPipeline(WorkflowRunner): cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif') # noqa cmd += ' && ' cmd += 'cd -' - deps = (pdf_compression_jobs if self.compress else - combined_pdf_creation_jobs) + deps = combined_pdf_creation_jobs deps += combined_txt_creation_jobs deps += poco_bundle_creation_jobs lbl = 'zip_creation_-_all' @@ -413,8 +371,7 @@ class OCRPipeline(WorkflowRunner): cmd += ' -i "*.pdf"' cmd += ' && ' cmd += 'cd -' - deps = (pdf_compression_jobs if self.compress else - combined_pdf_creation_jobs) + deps = combined_pdf_creation_jobs lbl = 'zip_creation_-_pdf' zip_creation_jobs.append(self.addTask(command=cmd, dependencies=deps, @@ -482,8 +439,7 @@ def main(): args = parse_args() jobs = collect_jobs(args.input_directory, args.output_directory) ocr_pipeline = OCRPipeline(args.binarize, jobs, args.language, - args.n_cores, args.output_directory, args.zip, - args.compress) + args.n_cores, args.output_directory, args.zip) retval = ocr_pipeline.run( dataDirRoot=(args.log_dir or args.output_directory), nCores=args.n_cores