mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 20:03:14 +00:00 
			
		
		
		
	More Ghostscript, fewer dependencies!
This commit is contained in:
		| @@ -14,7 +14,6 @@ RUN apt-get update \ | ||||
|       ca-certificates \ | ||||
|       gnupg2 \ | ||||
|       ghostscript \ | ||||
|       poppler-utils \ | ||||
|       python2.7 \ | ||||
|       python3.7 \ | ||||
|       wget \ | ||||
|   | ||||
							
								
								
									
										86
									
								
								ocr
									
									
									
									
									
								
							
							
						
						
									
										86
									
								
								ocr
									
									
									
									
									
								
							| @@ -36,9 +36,6 @@ def parse_args(): | ||||
|     parser.add_argument('--binarize', | ||||
|                         action='store_true', | ||||
|                         help='Use ocropy binarisation as preprocessing step.') | ||||
|     parser.add_argument('--compress', | ||||
|                         action='store_true', | ||||
|                         help='Compress the final PDF result file.') | ||||
|     parser.add_argument('--log-dir') | ||||
|     parser.add_argument('--n-cores', | ||||
|                         default=min(4, multiprocessing.cpu_count()), | ||||
| @@ -59,15 +56,13 @@ class OCRPipelineJob: | ||||
|  | ||||
|  | ||||
| class OCRPipeline(WorkflowRunner): | ||||
|     def __init__(self, binarize, jobs, lang, n_cores, output_dir, zip, | ||||
|                  compress): | ||||
|     def __init__(self, binarize, jobs, lang, n_cores, output_dir, zip): | ||||
|         self.binarize = binarize | ||||
|         self.jobs = jobs | ||||
|         self.lang = lang | ||||
|         self.n_cores = n_cores | ||||
|         self.output_dir = output_dir | ||||
|         self.zip = zip | ||||
|         self.compress = compress | ||||
|  | ||||
|     def workflow(self): | ||||
|         if not self.jobs: | ||||
| @@ -160,28 +155,20 @@ class OCRPipeline(WorkflowRunner): | ||||
|  | ||||
|             ''' | ||||
|             ' ################################################## | ||||
|             ' # post binarization                              # | ||||
|             ' # Renaming of binarization output files          # | ||||
|             ' ################################################## | ||||
|             ''' | ||||
|             post_binarization_jobs = [] | ||||
|             for i, job in enumerate(self.jobs): | ||||
|                 input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|                 output_dir = input_dir | ||||
|                 number = 0 | ||||
|                 files = filter(lambda x: x.endswith('.bin.png'), | ||||
|                                os.listdir(input_dir)) | ||||
|                 files.sort() | ||||
|                 for file in files: | ||||
|                     # int conversion is done in order to trim leading zeros | ||||
|                     output_file = os.path.join(output_dir, 'page-{}.bin.png'.format(int(file.split('.', 1)[0])))  # noqa | ||||
|                     cmd = 'mv "{}" "{}"'.format(os.path.join(output_dir, file), | ||||
|                                                 output_file) | ||||
|                     deps = 'binarization_-_{}'.format(i) | ||||
|                     lbl = 'post_binarization_-_{}-{}'.format(i, number) | ||||
|                     post_binarization_jobs.append( | ||||
|                         self.addTask(command=cmd, dependencies=deps, label=lbl) | ||||
|                     ) | ||||
|                     number += 1 | ||||
|                     page_number = int(file.split('.', 1)[0]) | ||||
|                     output_file = 'page-{}.bin.png'.format(page_number) | ||||
|                     os.rename(os.path.join(output_dir, file), | ||||
|                               os.path.join(output_dir, output_file)) | ||||
|  | ||||
|         ''' | ||||
|         ' The ocr_jobs are created based of the output files of either the | ||||
| @@ -217,7 +204,7 @@ class OCRPipeline(WorkflowRunner): | ||||
|                 cmd += ' && ' | ||||
|                 cmd += 'sed -i \'s+{}/++g\' "{}".hocr'.format(input_dir, output_file_base)  # noqa | ||||
|                 if self.binarize: | ||||
|                     deps = 'post_binarization_-_{}-{}'.format(i, number) | ||||
|                     deps = 'binarization_-_{}'.format(i) | ||||
|                 else: | ||||
|                     deps = 'split_input_-_{}'.format(i) | ||||
|                 label = 'ocr_-_{}-{}'.format(i, number) | ||||
| @@ -241,13 +228,20 @@ class OCRPipeline(WorkflowRunner): | ||||
|         combined_pdf_creation_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             output_dir = job.output_dir | ||||
|             files = filter(lambda x: x.endswith('.pdf'), | ||||
|                            os.listdir(input_dir)) | ||||
|             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) | ||||
|             files = map(lambda x: os.path.join(input_dir, x), files) | ||||
|             output_file = os.path.join(job.output_dir, | ||||
|                                        '{}.pdf'.format(job.name)) | ||||
|             cmd = 'pdfunite "{}" "{}"'.format('" "'.join(files), output_file) | ||||
|             output_file = os.path.join(output_dir, '{}.pdf'.format(job.name)) | ||||
|             cmd = 'gs' | ||||
|             cmd += ' -dBATCH' | ||||
|             cmd += ' -dNOPAUSE' | ||||
|             cmd += ' -dPDFSETTINGS=/ebook' | ||||
|             cmd += ' -dQUIET' | ||||
|             cmd += ' -sDEVICE=pdfwrite' | ||||
|             cmd += ' "-sOutputFile={}"'.format(output_file) | ||||
|             cmd += ' "{}"'.format('" "'.join(files)) | ||||
|             deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), | ||||
|                           ocr_jobs) | ||||
|             lbl = 'combined_pdf_creation_-_{}'.format(i) | ||||
| @@ -321,38 +315,6 @@ class OCRPipeline(WorkflowRunner): | ||||
|         ''' | ||||
|         self.waitForTasks() | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # pdf compression                                # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         pdf_compression_jobs = [] | ||||
|         n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs)))) | ||||
|         if self.compress: | ||||
|             for i, job in enumerate(self.jobs): | ||||
|                 file = filter(lambda x: x.endswith('.pdf'), os.listdir(job.output_dir))[0]  # noqa | ||||
|                 original_file = os.path.join(job.output_dir, file) | ||||
|                 compressed_file = os.path.join(job.output_dir, 'c_' + file) | ||||
|                 cmd = 'gs' | ||||
|                 cmd += ' -dBATCH' | ||||
|                 cmd += ' -dNOPAUSE' | ||||
|                 cmd += ' -dNumRenderingThreads={}'.format(n_cores) | ||||
|                 cmd += ' -dPDFSETTINGS=/ebook' | ||||
|                 # -dCompatibilityLevel must be defined after -dPDFSETTINGS | ||||
|                 cmd += ' -dCompatibilityLevel=1.4' | ||||
|                 cmd += ' -dQUIET' | ||||
|                 cmd += ' -sDEVICE=pdfwrite' | ||||
|                 cmd += ' "-sOutputFile={}"'.format(compressed_file) | ||||
|                 cmd += ' "{}"'.format(original_file) | ||||
|                 cmd += ' && ' | ||||
|                 cmd += 'mv "{}" "{}"'.format(compressed_file, original_file) | ||||
|                 deps = 'combined_pdf_creation_-_{}'.format(i) | ||||
|                 lbl = 'pdf_compression_-_{}'.format(i) | ||||
|                 pdf_compression_jobs.append(self.addTask(command=cmd, | ||||
|                                                          dependencies=deps, | ||||
|                                                          label=lbl, | ||||
|                                                          nCores=n_cores)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # cleanup                                        # | ||||
| @@ -362,10 +324,7 @@ class OCRPipeline(WorkflowRunner): | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             cmd = 'rm -r "{}"'.format(input_dir) | ||||
|             if self.compress: | ||||
|                 deps = ['pdf_compression_-_{}'.format(i)] | ||||
|             else: | ||||
|                 deps = ['combined_pdf_creation_-_{}'.format(i)] | ||||
|             deps = ['combined_pdf_creation_-_{}'.format(i)] | ||||
|             deps.append('combined_txt_creation_-_{}'.format(i)) | ||||
|             deps.append('poco_bundle_creation_-_{}'.format(i)) | ||||
|             deps.append('tei_p5_creation_-_{}'.format(i)) | ||||
| @@ -395,8 +354,7 @@ class OCRPipeline(WorkflowRunner): | ||||
|             cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             deps = (pdf_compression_jobs if self.compress else | ||||
|                     combined_pdf_creation_jobs) | ||||
|             deps = combined_pdf_creation_jobs | ||||
|             deps += combined_txt_creation_jobs | ||||
|             deps += poco_bundle_creation_jobs | ||||
|             lbl = 'zip_creation_-_all' | ||||
| @@ -413,8 +371,7 @@ class OCRPipeline(WorkflowRunner): | ||||
|             cmd += ' -i "*.pdf"' | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             deps = (pdf_compression_jobs if self.compress else | ||||
|                     combined_pdf_creation_jobs) | ||||
|             deps = combined_pdf_creation_jobs | ||||
|             lbl = 'zip_creation_-_pdf' | ||||
|             zip_creation_jobs.append(self.addTask(command=cmd, | ||||
|                                                   dependencies=deps, | ||||
| @@ -482,8 +439,7 @@ def main(): | ||||
|     args = parse_args() | ||||
|     jobs = collect_jobs(args.input_directory, args.output_directory) | ||||
|     ocr_pipeline = OCRPipeline(args.binarize, jobs, args.language, | ||||
|                                args.n_cores, args.output_directory, args.zip, | ||||
|                                args.compress) | ||||
|                                args.n_cores, args.output_directory, args.zip) | ||||
|     retval = ocr_pipeline.run( | ||||
|         dataDirRoot=(args.log_dir or args.output_directory), | ||||
|         nCores=args.n_cores | ||||
|   | ||||
		Reference in New Issue
	
	Block a user