From 2b63ba9e59c72c0570425591387da3e0e57de854 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Wed, 1 Jul 2020 11:03:34 +0200 Subject: [PATCH] Remove unused dependencies and use ghostscript for image split --- Dockerfile | 1 - ocr | 39 ++++++++++++++++++++++++--------------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index d924438..66b86a0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,6 @@ RUN apt-get update \ ca-certificates \ gnupg2 \ ghostscript \ - imagemagick \ poppler-utils \ python2.7 \ python3.7 \ diff --git a/ocr b/ocr index f26f8d4..020d200 100755 --- a/ocr +++ b/ocr @@ -98,12 +98,16 @@ class OCRPipeline(WorkflowRunner): n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs)))) for i, job in enumerate(self.jobs): output_dir = os.path.join(job.output_dir, 'tmp') - output_file_base = os.path.join(output_dir, 'page') - cmd = 'pdftoppm' - cmd += ' -r 300' - cmd += ' -tiff' - cmd += ' -tiffcompression lzw' - cmd += ' "{}" "{}"'.format(job.file, output_file_base) + cmd = 'gs' + cmd += ' -dBATCH' + cmd += ' -dNOPAUSE' + cmd += ' -dNumRenderingThreads={}'.format(n_cores) + cmd += ' -dQUIET' + cmd += ' -r300' + cmd += ' -sDEVICE=tiff24nc' + cmd += ' -sCompression=lzw' + cmd += ' "-sOutputFile={}/page-%d.tif"'.format(output_dir) + cmd += ' "{}"'.format(job.file) deps = 'setup_output_directory_-_{}'.format(i) lbl = 'split_input_-_{}'.format(i) split_input_jobs.append(self.addTask(command=cmd, @@ -138,8 +142,9 @@ class OCRPipeline(WorkflowRunner): files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) files = map(lambda x: os.path.join(input_dir, x), files) cmd = 'ocropus-nlbin "{}"'.format('" "'.join(files)) - cmd += ' -o "{}"'.format(output_dir) - cmd += ' -Q "{}"'.format(n_cores) + cmd += ' --nocheck' + cmd += ' --output "{}"'.format(output_dir) + cmd += ' --parallel "{}"'.format(n_cores) deps = 'split_input_-_{}'.format(i) lbl = 'binarization_-_{}'.format(i) binarization_jobs.append(self.addTask(command=cmd, @@ -322,19 +327,22 @@ class OCRPipeline(WorkflowRunner): ' ################################################## ''' pdf_compression_jobs = [] + n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs)))) if self.compress: for i, job in enumerate(self.jobs): file = filter(lambda x: x.endswith('.pdf'), os.listdir(job.output_dir))[0] # noqa original_file = os.path.join(job.output_dir, file) compressed_file = os.path.join(job.output_dir, 'c_' + file) cmd = 'gs' - cmd += ' -sDEVICE=pdfwrite' - cmd += ' -dCompatibilityLevel=1.4' - cmd += ' -dPDFSETTINGS=/ebook' - cmd += ' -dNOPAUSE' - cmd += ' -dQUIET' cmd += ' -dBATCH' - cmd += ' -sOutputFile="{}"'.format(compressed_file) + cmd += ' -dNOPAUSE' + cmd += ' -dNumRenderingThreads={}'.format(n_cores) + cmd += ' -dPDFSETTINGS=/ebook' + # -dCompatibilityLevel must be defined after -dPDFSETTINGS + cmd += ' -dCompatibilityLevel=1.4' + cmd += ' -dQUIET' + cmd += ' -sDEVICE=pdfwrite' + cmd += ' "-sOutputFile={}"'.format(compressed_file) cmd += ' "{}"'.format(original_file) cmd += ' && ' cmd += 'mv "{}" "{}"'.format(compressed_file, original_file) @@ -342,7 +350,8 @@ class OCRPipeline(WorkflowRunner): lbl = 'pdf_compression_-_{}'.format(i) pdf_compression_jobs.append(self.addTask(command=cmd, dependencies=deps, - label=lbl)) + label=lbl, + nCores=n_cores)) ''' ' ##################################################