From ec5b4eb521ff48854841eae8406860fa3b4939dc Mon Sep 17 00:00:00 2001 From: Stephan Porada Date: Tue, 16 Jun 2020 09:31:34 +0200 Subject: [PATCH] Add PDF compression --- Dockerfile | 1 + ocr | 151 +++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 108 insertions(+), 44 deletions(-) diff --git a/Dockerfile b/Dockerfile index ca91973..d924438 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,6 +13,7 @@ RUN apt-get update \ build-essential \ ca-certificates \ gnupg2 \ + ghostscript \ imagemagick \ poppler-utils \ python2.7 \ diff --git a/ocr b/ocr index b261a96..17ab5c8 100755 --- a/ocr +++ b/ocr @@ -24,19 +24,33 @@ TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', def parse_args(): parser = ArgumentParser(description='OCR Pipeline utilizing tesseract.') - parser.add_argument('i') - parser.add_argument('o') - parser.add_argument('-l', '--language', choices=TESSERACT_MODELS, + parser.add_argument('i', help='Input directory for OCR. One PDf equals one\ + job') + parser.add_argument('o', help='Output directory containing OCR results.') + parser.add_argument('-l', '--language', + choices=TESSERACT_MODELS, required=True) - parser.add_argument('--binarize', action='store_true', - help='use ocropy binarisation') - parser.add_argument('--keep-intermediates', action='store_true', - help='keep intermediate files') + parser.add_argument('--binarize', + action='store_true', + help='Use ocropy binarisation as preprocessing step.') + parser.add_argument('--keep-intermediates', + action='store_true', + help='Keep intermediate files for debugging etc.', + required=False) parser.add_argument('--n-cores', default=min(4, multiprocessing.cpu_count()), - help='total number of cores available', type=int) - parser.add_argument('--log-dir') - parser.add_argument('--zip') + help='Total number of cores available.', + type=int, + required=False) + parser.add_argument('--zip', help='Zips all results in different archives \ + depending on result types. 
Also zips \ everything into one archive.', required=False) + parser.add_argument('-c', '--compress', + help='Compress the final PDF result file.', + required=False, + action='store_true') + parser.add_argument('--log-dir', '--log_dir') return parser.parse_args() @@ -49,7 +63,7 @@ class OCRPipelineJob: class OCRPipeline(WorkflowRunner): def __init__(self, binarize, jobs, keep_intermediates, lang, n_cores, - output_dir, zip): + output_dir, zip, compress=False): self.binarize = binarize self.jobs = jobs self.keep_intermediates = keep_intermediates @@ -57,6 +71,7 @@ class OCRPipeline(WorkflowRunner): self.n_cores = n_cores self.output_dir = output_dir self.zip = zip + self.compress = compress def workflow(self): if not self.jobs: @@ -103,7 +118,7 @@ class OCRPipeline(WorkflowRunner): cmd += ' -tiff' cmd += ' -tiffcompression lzw' cmd += ' "{}" "{}"'.format(job.file, output_file_base) - deps = 'mkdir_job_-_{}'.format(i) + deps = mkdir_jobs lbl = 'pdftoppm_job_-_{}'.format(i) pdftoppm_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)) @@ -137,7 +152,7 @@ class OCRPipeline(WorkflowRunner): cmd = 'ocropus-nlbin "{}"'.format('" "'.join(files)) cmd += ' -o "{}"'.format(output_dir) cmd += ' -Q "{}"'.format(n_cores) - deps = 'pdftoppm_job_-_{}'.format(i) + deps = pdftoppm_jobs lbl = 'ocropus_nlbin_job_-_{}'.format(i) ocropus_nlbin_jobs.append( self.addTask(command=cmd, dependencies=deps, label=lbl, @@ -167,7 +182,7 @@ class OCRPipeline(WorkflowRunner): output_file = os.path.join(output_dir, 'page-{}.bin.png'.format(int(file.split('.', 1)[0]))) # noqa cmd = 'mv "{}" "{}"'.format(os.path.join(output_dir, file), output_file) - deps = 'ocropus_nlbin_job_-_{}'.format(i) + deps = ocropus_nlbin_jobs lbl = 'post_ocropus_nlbin_job_-_{}-{}'.format(i, number) post_ocropus_nlbin_jobs.append( self.addTask(command=cmd, dependencies=deps, @@ -207,9 +222,9 @@ class OCRPipeline(WorkflowRunner): cmd += ' -l "{}"'.format(self.lang) cmd += ' hocr pdf txt' if self.binarize: - 
deps = 'post_ocropus_nlbin_job_-_{}-{}'.format(i, number) + deps = post_ocropus_nlbin_jobs else: - deps = 'pdftoppm_job_-_{}'.format(i) + deps = pdftoppm_jobs label = 'tesseract_jobs_-_{}-{}'.format(i, number) tesseract_jobs.append( self.addTask(command=cmd, dependencies=deps, label=label, @@ -237,8 +252,7 @@ class OCRPipeline(WorkflowRunner): output_file = os.path.join(job.output_dir, '{}.xml'.format(job.name)) cmd = 'hocrtotei "{}" "{}"'.format('" "'.join(files), output_file) - deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)), - tesseract_jobs) + deps = tesseract_jobs lbl = 'hocrtotei_job_-_{}'.format(i) hocrtotei_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl)) @@ -272,8 +286,7 @@ class OCRPipeline(WorkflowRunner): output_path_base = os.path.join(job.output_dir, 'PoCo') output_path = os.path.join(output_path_base, 'hocr') cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path) - deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)), - tesseract_jobs) + deps = tesseract_jobs lbl = 'hocr_poco_jobs-_{}'.format(i) hocr_poco_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl)) @@ -293,8 +306,7 @@ class OCRPipeline(WorkflowRunner): output_path_base = os.path.join(job.output_dir, 'PoCo') output_path = os.path.join(output_path_base, 'tiff') cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path) - deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)), - tesseract_jobs) + deps = tesseract_jobs lbl = 'tiff_poco_jobs-_{}'.format(i) tiff_poco_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl)) @@ -313,8 +325,7 @@ class OCRPipeline(WorkflowRunner): output_file = os.path.join(job.output_dir, '{}.pdf'.format(job.name)) cmd = 'pdfunite "{}" "{}"'.format('" "'.join(files), output_file) - deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)), - tesseract_jobs) + deps = tesseract_jobs lbl = 'pdfunite_job_-_{}'.format(i) pdfunite_jobs.append(self.addTask(command=cmd, 
dependencies=deps, label=lbl)) @@ -333,11 +344,52 @@ class OCRPipeline(WorkflowRunner): output_file = os.path.join(job.output_dir, '{}.txt'.format(job.name)) cmd = 'cat "{}" > "{}"'.format('" "'.join(files), output_file) - deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)), - tesseract_jobs) + deps = tesseract_jobs lbl = 'cat_job_-_{}'.format(i) cat_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl)) + ''' + ' The following jobs are created based on the output files of the + ' pdfunite_jobs. So wait until they are finished. + ''' + self.waitForTasks() + + ''' + ' ################################################## + ' # compress_jobs # + ' ################################################## + ''' + compress_jobs = [] + if self.compress: + for i, job in enumerate(self.jobs): + print(os.listdir(job.output_dir)) + file = filter(lambda x: x.endswith('.pdf'), + os.listdir(job.output_dir))[0] + original_file = os.path.join(job.output_dir, file) + compressed_file = os.path.join(job.output_dir, 'c_' + file) + cmd = ('gs ' + + '-sDEVICE=pdfwrite ' + + '-dCompatibilityLevel=1.4 ' + + '-dPDFSETTINGS=/ebook ' + + '-dNOPAUSE ' + + '-dQUIET ' + + '-dBATCH ' + + '-sOutputFile="{o}" "{i}" ').format(o=compressed_file, + i=original_file) + cmd += '&& rm "{original_f}" '.format(original_f=original_file) + cmd += ('&& mv "{compressed_f}" ' + + '"{original_f}" ').format(compressed_f=compressed_file, + original_f=original_file) + deps = (hocrtotei_jobs + + tesseract_jobs + + pdfunite_jobs + + cat_jobs + + hocr_poco_jobs + + tiff_poco_jobs) + lbl = 'compress_job_-_{}'.format(i) + compress_jobs.append(self.addTask(command=cmd, + dependencies=deps, + label=lbl)) ''' ' ################################################## @@ -345,6 +397,13 @@ class OCRPipeline(WorkflowRunner): ' ################################################## ''' zip_jobs = [] + deps = (hocrtotei_jobs + + tesseract_jobs + + pdfunite_jobs + + cat_jobs + + hocr_poco_jobs + + tiff_poco_jobs + + compress_jobs) 
if self.zip is not None: # Remove .zip file extension if provided if self.zip.endswith('.zip'): @@ -360,11 +419,6 @@ class OCRPipeline(WorkflowRunner): cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.tif"' cmd += ' && ' cmd += 'cd -' - deps = (hocrtotei_jobs - + pdfunite_jobs - + cat_jobs - + hocr_poco_jobs - + tiff_poco_jobs) lbl = 'zip_job_-_all' zip_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl)) @@ -378,7 +432,7 @@ class OCRPipeline(WorkflowRunner): cmd += ' -i "*.pdf"' cmd += ' && ' cmd += 'cd -' - deps = 'zip_job_-_all' + deps = deps + ['zip_job_-_all'] lbl = 'zip_job_-_pdf' zip_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl)) @@ -392,7 +446,7 @@ class OCRPipeline(WorkflowRunner): cmd += ' -i "*.txt"' cmd += ' && ' cmd += 'cd -' - deps = 'zip_job_-_all' + deps = deps + ['zip_job_-_all'] lbl = 'zip_job_-_txt' zip_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl)) @@ -406,7 +460,7 @@ class OCRPipeline(WorkflowRunner): cmd += ' -i "*.xml"' cmd += ' && ' cmd += 'cd -' - deps = 'zip_job_-_all' + deps = deps + ['zip_job_-_all'] lbl = 'zip_job_-_xml' zip_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl)) @@ -426,7 +480,7 @@ class OCRPipeline(WorkflowRunner): cmd += ' "{}"'.format('" "'.join(poco_paths)) cmd += ' && ' cmd += 'cd -' - deps = 'zip_job_-_all' + deps = deps + ['zip_job_-_all'] lbl = 'zip_job_-_poco_{}'.format(i) zip_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl)) @@ -454,11 +508,14 @@ class OCRPipeline(WorkflowRunner): cmd += 'mv "{}"/*.bin.png "{}"'.format(input_dir, os.path.join(output_dir, 'bin.png')) # noqa cmd += ' && ' cmd += 'mv "{}"/*.nrm.png "{}"'.format(input_dir, os.path.join(output_dir, 'nrm.png')) # noqa - deps = ['hocrtotei_job_-_{}'.format(i), - 'pdfunite_job_-_{}'.format(i), - 'cat_job_-_{}'.format(i), - 'tiff_poco_jobs_-_{i}'.format(i), - 'hocr_poco_jobs_-_{i}'.format(i)] + deps = (hocrtotei_jobs + + tesseract_jobs + + pdfunite_jobs + + 
cat_jobs + + hocr_poco_jobs + + tiff_poco_jobs, + + compress_jobs + + zip_jobs) lbl = 'mv_job_-_{}'.format(i) mv_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl)) @@ -466,9 +523,14 @@ class OCRPipeline(WorkflowRunner): for i, job in enumerate(self.jobs): input_dir = os.path.join(job.output_dir, 'tmp') cmd = 'rm -r "{}"'.format(input_dir) - deps = ['hocrtotei_job_-_{}'.format(i), - 'pdfunite_job_-_{}'.format(i), - 'cat_job_-_{}'.format(i)] + deps = (hocrtotei_jobs + + tesseract_jobs + + pdfunite_jobs + + cat_jobs + + hocr_poco_jobs + + tiff_poco_jobs + + compress_jobs + + zip_jobs) lbl = 'mv_job_-_{}'.format(i) mv_jobs.append(self.addTask(command=cmd, dependencies=deps, label=lbl)) @@ -490,7 +552,8 @@ def main(): args = parse_args() jobs = collect_jobs(args.i, args.o) ocr_pipeline = OCRPipeline(args.binarize, jobs, args.keep_intermediates, - args.language, args.n_cores, args.o, args.zip) + args.language, args.n_cores, args.o, args.zip, + args.compress) retval = ocr_pipeline.run(dataDirRoot=(args.log_dir or args.o), nCores=args.n_cores) sys.exit(retval)