mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git (synced 2025-10-31 20:03:14 +00:00)

fix pipeline

ocr (501 changed lines)
							| @@ -24,33 +24,30 @@ TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', | ||||
|  | ||||
| def parse_args(): | ||||
|     parser = ArgumentParser(description='OCR Pipeline utilizing tesseract.') | ||||
|     parser.add_argument('i', help='Input directory for OCR. One PDF equals one\ | ||||
|                                    job') | ||||
|     parser.add_argument('o', help='Output directory containing OCR results.') | ||||
|     parser.add_argument('-i', '--input-directory', | ||||
|                         help='Input directory (only PDF files get processed)', | ||||
|                         required=True) | ||||
|     parser.add_argument('-o', '--output-directory', | ||||
|                         help='Output directory', | ||||
|                         required=True) | ||||
|     parser.add_argument('-l', '--language', | ||||
|                         choices=TESSERACT_MODELS, | ||||
|                         required=True) | ||||
|     parser.add_argument('--binarize', | ||||
|                         action='store_true', | ||||
|                         help='Use ocropy binarisation as a preprocessing step.') | ||||
|     parser.add_argument('--keep-intermediates', | ||||
|     parser.add_argument('--compress', | ||||
|                         action='store_true', | ||||
|                         help='Keep intermediate files for debugging etc.', | ||||
|                         required=False) | ||||
|                         help='Compress the final PDF result file.') | ||||
|     parser.add_argument('--log-dir') | ||||
|     parser.add_argument('--n-cores', | ||||
|                         default=min(4, multiprocessing.cpu_count()), | ||||
|                         help='Total number of cores available.', | ||||
|                         type=int, | ||||
|                         required=False) | ||||
|     parser.add_argument('--zip', help='Zips all results in different archives \ | ||||
|                                        depending on result types. Also zips   \ | ||||
|                                        everything into one archive.', | ||||
|                         required=False) | ||||
|     parser.add_argument('-c', '--compress', | ||||
|                         help='Compress the final PDF result file.', | ||||
|                         required=False, | ||||
|                         action='store_true') | ||||
|     parser.add_argument('--log_dir') | ||||
|                         type=int) | ||||
|     parser.add_argument('--zip', | ||||
|                         help='Zips all results in different archives depending' | ||||
|                              ' on result types. Also zips everything into one ' | ||||
|                              'archive.') | ||||
|     return parser.parse_args() | ||||
|  | ||||
|  | ||||
| @@ -62,11 +59,10 @@ class OCRPipelineJob: | ||||
|  | ||||
|  | ||||
| class OCRPipeline(WorkflowRunner): | ||||
|     def __init__(self, binarize, jobs, keep_intermediates, lang, n_cores, | ||||
|                  output_dir, zip, compress): | ||||
|     def __init__(self, binarize, jobs, lang, n_cores, output_dir, zip, | ||||
|                  compress): | ||||
|         self.binarize = binarize | ||||
|         self.jobs = jobs | ||||
|         self.keep_intermediates = keep_intermediates | ||||
|         self.lang = lang | ||||
|         self.n_cores = n_cores | ||||
|         self.output_dir = output_dir | ||||
| @@ -79,36 +75,26 @@ class OCRPipeline(WorkflowRunner): | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # mkdir_jobs                                     # | ||||
|         ' # setup output directory                         # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|  | ||||
|         mkdir_jobs = [] | ||||
|         setup_output_directory_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             output_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             poco_dir = os.path.join(job.output_dir, 'PoCo') | ||||
|             intermediate_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             cmd = 'mkdir' | ||||
|             cmd += ' -p' | ||||
|             cmd += ' "{}"'.format(output_dir) | ||||
|             cmd += ' "{}"'.format(os.path.join(poco_dir, 'hocr')) | ||||
|             cmd += ' "{}"'.format(os.path.join(poco_dir, 'tiff')) | ||||
|             if self.keep_intermediates: | ||||
|                 cmd += ' "{}"'.format(os.path.join(output_dir, 'hocr')) | ||||
|                 cmd += ' "{}"'.format(os.path.join(output_dir, 'pdf')) | ||||
|                 cmd += ' "{}"'.format(os.path.join(output_dir, 'tiff')) | ||||
|                 cmd += ' "{}"'.format(os.path.join(output_dir, 'txt')) | ||||
|                 if self.binarize: | ||||
|                     cmd += ' "{}"'.format(os.path.join(output_dir, 'bin.png')) | ||||
|                     cmd += ' "{}"'.format(os.path.join(output_dir, 'nrm.png')) | ||||
|             lbl = 'mkdir_job_-_{}'.format(i) | ||||
|             mkdir_jobs.append(self.addTask(command=cmd, label=lbl)) | ||||
|             cmd += ' "{}"'.format(intermediate_dir) | ||||
|             cmd += ' "{}"'.format(os.path.join(job.output_dir, 'poco')) | ||||
|             lbl = 'setup_output_directory_-_{}'.format(i) | ||||
|             setup_output_directory_jobs.append(self.addTask(command=cmd, | ||||
|                                                             label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # pdftoppm_jobs                                  # | ||||
|         ' # split input                                    # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         pdftoppm_jobs = [] | ||||
|         split_input_jobs = [] | ||||
|         n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs)))) | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             output_dir = os.path.join(job.output_dir, 'tmp') | ||||
| @@ -118,24 +104,26 @@ class OCRPipeline(WorkflowRunner): | ||||
|             cmd += ' -tiff' | ||||
|             cmd += ' -tiffcompression lzw' | ||||
|             cmd += ' "{}" "{}"'.format(job.file, output_file_base) | ||||
|             deps = mkdir_jobs | ||||
|             lbl = 'pdftoppm_job_-_{}'.format(i) | ||||
|             pdftoppm_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                               label=lbl, nCores=n_cores)) | ||||
|             deps = 'setup_output_directory_-_{}'.format(i) | ||||
|             lbl = 'split_input_-_{}'.format(i) | ||||
|             split_input_jobs.append(self.addTask(command=cmd, | ||||
|                                                  dependencies=deps, | ||||
|                                                  label=lbl, | ||||
|                                                  nCores=n_cores)) | ||||
|  | ||||
|         if self.binarize: | ||||
|             ''' | ||||
|             ' The ocropus_nlbin_jobs list is created based on the output files | ||||
|             ' of the pdftoppm_jobs. So wait until they are finished. | ||||
|             ' The binarization_jobs list is created based on the output files | ||||
|             ' of the split_jobs. So wait until they are finished. | ||||
|             ''' | ||||
|             self.waitForTasks() | ||||
|  | ||||
|             ''' | ||||
|             ' ################################################## | ||||
|             ' # ocropus_nlbin_jobs                             # | ||||
|             ' # binarization                                   # | ||||
|             ' ################################################## | ||||
|             ''' | ||||
|             ocropus_nlbin_jobs = [] | ||||
|             binarization_jobs = [] | ||||
|             ''' | ||||
|             ' We run ocropus-nlbin with either four or, if there are fewer than | ||||
|             ' four cores available for this workflow, the available core | ||||
| @@ -152,24 +140,25 @@ class OCRPipeline(WorkflowRunner): | ||||
|                 cmd = 'ocropus-nlbin "{}"'.format('" "'.join(files)) | ||||
|                 cmd += ' -o "{}"'.format(output_dir) | ||||
|                 cmd += ' -Q "{}"'.format(n_cores) | ||||
|                 deps = pdftoppm_jobs | ||||
|                 lbl = 'ocropus_nlbin_job_-_{}'.format(i) | ||||
|                 ocropus_nlbin_jobs.append( | ||||
|                     self.addTask(command=cmd, dependencies=deps, label=lbl, | ||||
|                                  nCores=n_cores)) | ||||
|                 deps = 'split_input_-_{}'.format(i) | ||||
|                 lbl = 'binarization_-_{}'.format(i) | ||||
|                 binarization_jobs.append(self.addTask(command=cmd, | ||||
|                                                       dependencies=deps, | ||||
|                                                       label=lbl, | ||||
|                                                       nCores=n_cores)) | ||||
|  | ||||
|             ''' | ||||
|             ' The post_ocropus_nlbin_jobs are created based on the output files | ||||
|             ' of the ocropus_nlbin_jobs. So wait until they are finished. | ||||
|             ' The post_binarization_jobs are created based on the output files | ||||
|             ' of the binarization_jobs. So wait until they are finished. | ||||
|             ''' | ||||
|             self.waitForTasks() | ||||
|  | ||||
|             ''' | ||||
|             ' ################################################## | ||||
|             ' # post_ocropus_nlbin_jobs                        # | ||||
|             ' # post binarization                              # | ||||
|             ' ################################################## | ||||
|             ''' | ||||
|             post_ocropus_nlbin_jobs = [] | ||||
|             post_binarization_jobs = [] | ||||
|             for i, job in enumerate(self.jobs): | ||||
|                 input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|                 output_dir = input_dir | ||||
| @@ -182,26 +171,26 @@ class OCRPipeline(WorkflowRunner): | ||||
|                     output_file = os.path.join(output_dir, 'page-{}.bin.png'.format(int(file.split('.', 1)[0])))  # noqa | ||||
|                     cmd = 'mv "{}" "{}"'.format(os.path.join(output_dir, file), | ||||
|                                                 output_file) | ||||
|                     deps = ocropus_nlbin_jobs | ||||
|                     lbl = 'post_ocropus_nlbin_job_-_{}-{}'.format(i, number) | ||||
|                     post_ocropus_nlbin_jobs.append( | ||||
|                         self.addTask(command=cmd, dependencies=deps, | ||||
|                                      label=lbl)) | ||||
|                     deps = 'binarization_-_{}'.format(i) | ||||
|                     lbl = 'post_binarization_-_{}-{}'.format(i, number) | ||||
|                     post_binarization_jobs.append( | ||||
|                         self.addTask(command=cmd, dependencies=deps, label=lbl) | ||||
|                     ) | ||||
|                     number += 1 | ||||
|  | ||||
|         ''' | ||||
|         ' The tesseract_jobs are created based of the output files of either | ||||
|         ' the pdftoppm_jobs or post_ocropus_nlbin_jobs. So wait until they are | ||||
|         ' The ocr_jobs are created based on the output files of either the | ||||
|         ' split_jobs or post_binarization_jobs. So wait until they are | ||||
|         ' finished. | ||||
|         ''' | ||||
|         self.waitForTasks() | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # tesseract_jobs                                 # | ||||
|         ' # ocr                                            # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         tesseract_jobs = [] | ||||
|         ocr_jobs = [] | ||||
|         ''' | ||||
|         ' Tesseract runs fastest with four cores. So we run it with either four | ||||
|         ' or, if there are fewer than four cores available for this workflow, | ||||
| @@ -211,8 +200,7 @@ class OCRPipeline(WorkflowRunner): | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             output_dir = input_dir | ||||
|             files = filter(lambda x: x.endswith('.bin.png' if self.binarize else '.tif'),  # noqa | ||||
|                            os.listdir(input_dir)) | ||||
|             files = filter(lambda x: x.endswith('.bin.png' if self.binarize else '.tif'), os.listdir(input_dir))  # noqa | ||||
|             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) | ||||
|             files = map(lambda x: os.path.join(input_dir, x), files) | ||||
|             number = 0 | ||||
| @@ -221,14 +209,17 @@ class OCRPipeline(WorkflowRunner): | ||||
|                 cmd = 'tesseract "{}" "{}"'.format(file, output_file_base) | ||||
|                 cmd += ' -l "{}"'.format(self.lang) | ||||
|                 cmd += ' hocr pdf txt' | ||||
|                 cmd += ' && ' | ||||
|                 cmd += 'sed -i \'s+{}/++g\' "{}".hocr'.format(input_dir, output_file_base)  # noqa | ||||
|                 if self.binarize: | ||||
|                     deps = post_ocropus_nlbin_jobs | ||||
|                     deps = 'post_binarization_-_{}-{}'.format(i, number) | ||||
|                 else: | ||||
|                     deps = pdftoppm_jobs | ||||
|                 label = 'tesseract_jobs_-_{}-{}'.format(i, number) | ||||
|                 tesseract_jobs.append( | ||||
|                     self.addTask(command=cmd, dependencies=deps, label=label, | ||||
|                                  nCores=n_cores)) | ||||
|                     deps = 'split_input_-_{}'.format(i) | ||||
|                 label = 'ocr_-_{}-{}'.format(i, number) | ||||
|                 ocr_jobs.append(self.addTask(command=cmd, | ||||
|                                              dependencies=deps, | ||||
|                                              label=label, | ||||
|                                              nCores=n_cores)) | ||||
|                 number += 1 | ||||
|  | ||||
|         ''' | ||||
| @@ -239,190 +230,171 @@ class OCRPipeline(WorkflowRunner): | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # hocrtotei_jobs                                 # | ||||
|         ' # combined pdf creation                          # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         hocrtotei_jobs = [] | ||||
|         combined_pdf_creation_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             files = filter(lambda x: x.endswith('.hocr'), | ||||
|             files = filter(lambda x: x.endswith('.pdf'), | ||||
|                            os.listdir(input_dir)) | ||||
|             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) | ||||
|             files = map(lambda x: os.path.join(input_dir, x), files) | ||||
|             output_file = os.path.join(job.output_dir, | ||||
|                                        '{}.xml'.format(job.name)) | ||||
|             cmd = 'hocrtotei "{}" "{}"'.format('" "'.join(files), output_file) | ||||
|             deps = tesseract_jobs | ||||
|             lbl = 'hocrtotei_job_-_{}'.format(i) | ||||
|             hocrtotei_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                                label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # hocr_poco_jobs                                 # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|  | ||||
|         hocr_poco_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             files = filter(lambda x: x.endswith('.hocr'), | ||||
|                            os.listdir(input_dir)) | ||||
|             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) | ||||
|             files = map(lambda x: os.path.join(input_dir, x), files) | ||||
|             # set relative file paths into hocr | ||||
|             relative_files = map(lambda x: os.path.join('..', | ||||
|                                                         'tiff', | ||||
|                                                         os.path.basename(x).replace('.hocr', '.tif')),  # noqa | ||||
|                                  files) | ||||
|             for file, relative_file in zip(files, relative_files): | ||||
|                 with open(file, 'r+') as f: | ||||
|                     html = f.read() | ||||
|                     html = html.replace(file.replace('.hocr', '.tif'), | ||||
|                                         relative_file) | ||||
|                     f.seek(0) | ||||
|                     f.truncate(0)  # deletes content of file to write new html | ||||
|                     f.write(html) | ||||
|             output_path_base = os.path.join(job.output_dir, 'PoCo') | ||||
|             output_path = os.path.join(output_path_base, 'hocr') | ||||
|             cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path) | ||||
|             deps = tesseract_jobs | ||||
|             lbl = 'hocr_poco_jobs-_{}'.format(i) | ||||
|             hocr_poco_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                                label=lbl)) | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # tiff_poco_jobs                                 # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|  | ||||
|         tiff_poco_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             files = filter(lambda x: x.endswith('.tif'), | ||||
|                            os.listdir(input_dir)) | ||||
|             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) | ||||
|             files = map(lambda x: os.path.join(input_dir, x), files) | ||||
|             output_path_base = os.path.join(job.output_dir, 'PoCo') | ||||
|             output_path = os.path.join(output_path_base, 'tiff') | ||||
|             cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path) | ||||
|             deps = tesseract_jobs | ||||
|             lbl = 'tiff_poco_jobs-_{}'.format(i) | ||||
|             tiff_poco_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                                label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # pdfunite_jobs                                  # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         pdfunite_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             files = filter(lambda x: x.endswith('.pdf'), os.listdir(input_dir)) | ||||
|             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) | ||||
|             files = map(lambda x: os.path.join(input_dir, x), files) | ||||
|             output_file = os.path.join(job.output_dir, | ||||
|                                        '{}.pdf'.format(job.name)) | ||||
|             cmd = 'pdfunite "{}" "{}"'.format('" "'.join(files), output_file) | ||||
|             deps = tesseract_jobs | ||||
|             lbl = 'pdfunite_job_-_{}'.format(i) | ||||
|             pdfunite_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                               label=lbl)) | ||||
|             deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), | ||||
|                           ocr_jobs) | ||||
|             lbl = 'combined_pdf_creation_-_{}'.format(i) | ||||
|             combined_pdf_creation_jobs.append(self.addTask(command=cmd, | ||||
|                                                            dependencies=deps, | ||||
|                                                            label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # cat_jobs                                       # | ||||
|         ' # combined txt creation                          # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         cat_jobs = [] | ||||
|         combined_txt_creation_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             files = filter(lambda x: x.endswith('.txt'), os.listdir(input_dir)) | ||||
|             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) | ||||
|             files = map(lambda x: os.path.join(input_dir, x), files) | ||||
|             output_file = os.path.join(job.output_dir, | ||||
|                                        '{}.txt'.format(job.name)) | ||||
|             output_file = os.path.join(job.output_dir, '{}.txt'.format(job.name))  # noqa | ||||
|             cmd = 'cat "{}" > "{}"'.format('" "'.join(files), output_file) | ||||
|             deps = tesseract_jobs | ||||
|             lbl = 'cat_job_-_{}'.format(i) | ||||
|             cat_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|             deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), | ||||
|                           ocr_jobs) | ||||
|             lbl = 'combined_txt_creation_-_{}'.format(i) | ||||
|             combined_txt_creation_jobs.append(self.addTask(command=cmd, | ||||
|                                                            dependencies=deps, | ||||
|                                                            label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # tei p5 creation                                # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         tei_p5_creation_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             files = filter(lambda x: x.endswith('.hocr'), os.listdir(input_dir))  # noqa | ||||
|             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) | ||||
|             files = map(lambda x: os.path.join(input_dir, x), files) | ||||
|             output_file = os.path.join(job.output_dir, '{}.xml'.format(job.name))  # noqa | ||||
|             cmd = 'hocrtotei "{}" "{}"'.format('" "'.join(files), output_file) | ||||
|             deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), | ||||
|                           ocr_jobs) | ||||
|             lbl = 'tei_p5_creation_-_{}'.format(i) | ||||
|             tei_p5_creation_jobs.append(self.addTask(command=cmd, | ||||
|                                                      dependencies=deps, | ||||
|                                                      label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # poco bundle creation                           # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         poco_bundle_creation_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             output_dir = os.path.join(job.output_dir, 'poco') | ||||
|             cmd = 'mv "{}"/*.hocr "{}"'.format(input_dir, output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'mv "{}"/*.{} "{}"'.format(input_dir, 'bin.png' if self.binarize else 'tif', output_dir)  # noqa | ||||
|             deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), | ||||
|                           ocr_jobs) | ||||
|             deps.append('tei_p5_creation_-_{}'.format(i)) | ||||
|             lbl = 'poco_bundle_creation_-_{}'.format(i) | ||||
|             poco_bundle_creation_jobs.append(self.addTask(command=cmd, | ||||
|                                                           dependencies=deps, | ||||
|                                                           label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' The following jobs are created based on the output files of the | ||||
|         ' pdfunite_jobs. So wait until they are finished. | ||||
|         ' combined_pdf_creation_jobs. So wait until they are finished. | ||||
|         ''' | ||||
|         self.waitForTasks() | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # compress_jobs                                  # | ||||
|         ' # pdf compression                                # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         compress_jobs = [] | ||||
|         pdf_compression_jobs = [] | ||||
|         if self.compress: | ||||
|             for i, job in enumerate(self.jobs): | ||||
|                 print(os.listdir(job.output_dir)) | ||||
|                 file = filter(lambda x: x.endswith('.pdf'), | ||||
|                               os.listdir(job.output_dir))[0] | ||||
|                 file = filter(lambda x: x.endswith('.pdf'), os.listdir(job.output_dir))[0]  # noqa | ||||
|                 original_file = os.path.join(job.output_dir, file) | ||||
|                 compressed_file = os.path.join(job.output_dir, 'c_' + file) | ||||
|                 cmd = ('gs ' | ||||
|                        + '-sDEVICE=pdfwrite ' | ||||
|                        + '-dCompatibilityLevel=1.4 ' | ||||
|                        + '-dPDFSETTINGS=/ebook ' | ||||
|                        + '-dNOPAUSE ' | ||||
|                        + '-dQUIET ' | ||||
|                        + '-dBATCH ' | ||||
|                        + '-sOutputFile={o} {i} ').format(o=compressed_file, | ||||
|                                                          i=original_file) | ||||
|                 cmd += '&& rm {original_f} '.format(original_f=original_file) | ||||
|                 cmd += ('&& mv {compressed_f} ' | ||||
|                         + '{original_f} ').format(compressed_f=compressed_file, | ||||
|                                                   original_f=original_file) | ||||
|                 deps = (hocrtotei_jobs | ||||
|                         + tesseract_jobs | ||||
|                         + pdfunite_jobs | ||||
|                         + cat_jobs | ||||
|                         + hocr_poco_jobs | ||||
|                         + tiff_poco_jobs) | ||||
|                 lbl = 'compress_job_-_{}'.format(i) | ||||
|                 compress_jobs.append(self.addTask(command=cmd, | ||||
|                                                   dependencies=deps, | ||||
|                                                   label=lbl)) | ||||
|                 cmd = 'gs' | ||||
|                 cmd += ' -sDEVICE=pdfwrite' | ||||
|                 cmd += ' -dCompatibilityLevel=1.4' | ||||
|                 cmd += ' -dPDFSETTINGS=/ebook' | ||||
|                 cmd += ' -dNOPAUSE' | ||||
|                 cmd += ' -dQUIET' | ||||
|                 cmd += ' -dBATCH' | ||||
|                 cmd += ' -sOutputFile="{}"'.format(compressed_file) | ||||
|                 cmd += ' "{}"'.format(original_file) | ||||
|                 cmd += ' && ' | ||||
|                 cmd += 'mv "{}" "{}"'.format(compressed_file, original_file) | ||||
|                 deps = 'combined_pdf_creation_-_{}'.format(i) | ||||
|                 lbl = 'pdf_compression_-_{}'.format(i) | ||||
|                 pdf_compression_jobs.append(self.addTask(command=cmd, | ||||
|                                                          dependencies=deps, | ||||
|                                                          label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # zip_jobs                                       # | ||||
|         ' # cleanup                                        # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         zip_jobs = [] | ||||
|         deps = (hocrtotei_jobs | ||||
|                 + tesseract_jobs | ||||
|                 + pdfunite_jobs | ||||
|                 + cat_jobs | ||||
|                 + hocr_poco_jobs | ||||
|                 + tiff_poco_jobs | ||||
|                 + compress_jobs) | ||||
|         cleanup_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             cmd = 'rm -r "{}"'.format(input_dir) | ||||
|             if self.compress: | ||||
|                 deps = ['pdf_compression_-_{}'.format(i)] | ||||
|             else: | ||||
|                 deps = ['combined_pdf_creation_-_{}'.format(i)] | ||||
|             deps.append('combined_txt_creation_-_{}'.format(i)) | ||||
|             deps.append('poco_bundle_creation_-_{}'.format(i)) | ||||
|             deps.append('tei_p5_creation_-_{}'.format(i)) | ||||
|             lbl = 'cleanup_-_{}'.format(i) | ||||
|             cleanup_jobs.append(self.addTask(command=cmd, | ||||
|                                              dependencies=deps, | ||||
|                                              label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # zip creation                                   # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         zip_creation_jobs = [] | ||||
|         if self.zip is not None: | ||||
|             # Remove .zip file extension if provided | ||||
|             if self.zip.endswith('.zip'): | ||||
|                 self.zip = self.zip[:-4] | ||||
|                 self.zip = self.zip if self.zip else 'output' | ||||
|         # zip ALL | ||||
|             # zip all files | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
|             cmd += ' -r' | ||||
|             cmd += ' "{}".all.zip .'.format(self.zip) | ||||
|             cmd += ' -x "pyflow.data*" "*tmp*"' | ||||
|             cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.tif"' | ||||
|             cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             lbl = 'zip_job_-_all' | ||||
|             zip_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|         # zip PDFs | ||||
|             deps = (pdf_compression_jobs if self.compress else | ||||
|                     combined_pdf_creation_jobs) | ||||
|             deps += combined_txt_creation_jobs | ||||
|             deps += poco_bundle_creation_jobs | ||||
|             lbl = 'zip_creation_-_all' | ||||
|             zip_creation_jobs.append(self.addTask(command=cmd, | ||||
|                                                   dependencies=deps, | ||||
|                                                   label=lbl)) | ||||
|             # zip PDF files | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
| @@ -432,11 +404,13 @@ class OCRPipeline(WorkflowRunner): | ||||
|             cmd += ' -i "*.pdf"' | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             deps = deps + ['zip_job_-_all'] | ||||
|             lbl = 'zip_job_-_pdf' | ||||
|             zip_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|         # zip TXTs | ||||
|             deps = (pdf_compression_jobs if self.compress else | ||||
|                     combined_pdf_creation_jobs) | ||||
|             lbl = 'zip_creation_-_pdf' | ||||
|             zip_creation_jobs.append(self.addTask(command=cmd, | ||||
|                                                   dependencies=deps, | ||||
|                                                   label=lbl)) | ||||
|             # zip TXT files | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
| @@ -446,11 +420,12 @@ class OCRPipeline(WorkflowRunner): | ||||
|             cmd += ' -i "*.txt"' | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             deps = deps + ['zip_job_-_all'] | ||||
|             lbl = 'zip_job_-_txt' | ||||
|             zip_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|         # zip XMLs | ||||
|             deps = combined_txt_creation_jobs | ||||
|             lbl = 'zip_creation_-_txt' | ||||
|             zip_creation_jobs.append(self.addTask(command=cmd, | ||||
|                                                   dependencies=deps, | ||||
|                                                   label=lbl)) | ||||
|             # zip XML files | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
| @@ -460,80 +435,26 @@ class OCRPipeline(WorkflowRunner): | ||||
|             cmd += ' -i "*.xml"' | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             deps = deps + ['zip_job_-_all'] | ||||
|             lbl = 'zip_job_-_xml' | ||||
|             zip_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|         # zip PoCo files | ||||
|             poco_paths = [] | ||||
|             poco_names = [] | ||||
|             for i, job in enumerate(self.jobs): | ||||
|                 poco_paths.append(os.path.join(os.path.basename(job.output_dir),  # noqa | ||||
|                                                                 'PoCo')) | ||||
|                 poco_names.append(job.output_dir) | ||||
|  | ||||
|             deps = tei_p5_creation_jobs | ||||
|             lbl = 'zip_creation_-_xml' | ||||
|             zip_creation_jobs.append(self.addTask(command=cmd, | ||||
|                                                   dependencies=deps, | ||||
|                                                   label=lbl)) | ||||
|             # zip PoCo bundles | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
|             cmd += ' -r' | ||||
|             cmd += ' "{}".poco.zip'.format(self.zip) | ||||
|             cmd += ' "{}"'.format('" "'.join(poco_paths)) | ||||
|             cmd += ' "{}".poco.zip .'.format(self.zip) | ||||
|             cmd += ' -x "pyflow.data*" "*tmp*"' | ||||
|             cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             deps = deps + ['zip_job_-_all'] | ||||
|             lbl = 'zip_job_-_poco_{}'.format(i) | ||||
|             zip_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # mv_jobs                                        # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         mv_jobs = [] | ||||
|         if self.keep_intermediates: | ||||
|             for i, job in enumerate(self.jobs): | ||||
|                 input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|                 output_dir = input_dir | ||||
|                 cmd = 'mv "{}"/*.hocr "{}"'.format( | ||||
|                     input_dir, os.path.join(output_dir, 'hocr')) | ||||
|                 cmd += ' && ' | ||||
|                 cmd += 'mv "{}"/*.pdf "{}"'.format(input_dir, os.path.join(output_dir, 'pdf'))  # noqa | ||||
|                 cmd += ' && ' | ||||
|                 cmd += 'mv "{}"/*.tif "{}"'.format(input_dir, os.path.join(output_dir, 'tiff'))  # noqa | ||||
|                 cmd += ' && ' | ||||
|                 cmd += 'mv "{}"/*.txt "{}"'.format(input_dir, os.path.join(output_dir, 'txt'))  # noqa | ||||
|                 if self.binarize: | ||||
|                     cmd += ' && ' | ||||
|                     cmd += 'mv "{}"/*.bin.png "{}"'.format(input_dir, os.path.join(output_dir, 'bin.png'))  # noqa | ||||
|                     cmd += ' && ' | ||||
|                     cmd += 'mv "{}"/*.nrm.png "{}"'.format(input_dir, os.path.join(output_dir, 'nrm.png'))  # noqa | ||||
|                 deps = (hocrtotei_jobs | ||||
|                         + tesseract_jobs | ||||
|                         + pdfunite_jobs | ||||
|                         + cat_jobs | ||||
|                         + hocr_poco_jobs | ||||
|                         + tiff_poco_jobs, | ||||
|                         + compress_jobs | ||||
|                         + zip_jobs) | ||||
|                 lbl = 'mv_job_-_{}'.format(i) | ||||
|                 mv_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                             label=lbl)) | ||||
|         else: | ||||
|             for i, job in enumerate(self.jobs): | ||||
|                 input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|                 cmd = 'rm -r "{}"'.format(input_dir) | ||||
|                 deps = (hocrtotei_jobs | ||||
|                         + tesseract_jobs | ||||
|                         + pdfunite_jobs | ||||
|                         + cat_jobs | ||||
|                         + hocr_poco_jobs | ||||
|                         + tiff_poco_jobs | ||||
|                         + compress_jobs | ||||
|                         + zip_jobs) | ||||
|                 lbl = 'mv_job_-_{}'.format(i) | ||||
|                 mv_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                             label=lbl)) | ||||
|             deps = poco_bundle_creation_jobs | ||||
|             lbl = 'zip_creation_-_poco' | ||||
|             zip_creation_jobs.append(self.addTask(command=cmd, | ||||
|                                                   dependencies=deps, | ||||
|                                                   label=lbl)) | ||||
|  | ||||
|  | ||||
| def collect_jobs(input_dir, output_dir): | ||||
| @@ -550,12 +471,14 @@ def collect_jobs(input_dir, output_dir): | ||||
|  | ||||
| def main(): | ||||
|     args = parse_args() | ||||
|     jobs = collect_jobs(args.i, args.o) | ||||
|     ocr_pipeline = OCRPipeline(args.binarize, jobs, args.keep_intermediates, | ||||
|                                args.language, args.n_cores, args.o, args.zip, | ||||
|     jobs = collect_jobs(args.input_directory, args.output_directory) | ||||
|     ocr_pipeline = OCRPipeline(args.binarize, jobs, args.language, | ||||
|                                args.n_cores, args.output_directory, args.zip, | ||||
|                                args.compress) | ||||
|     retval = ocr_pipeline.run(dataDirRoot=(args.log_dir or args.o), | ||||
|                               nCores=args.n_cores) | ||||
|     retval = ocr_pipeline.run( | ||||
|         dataDirRoot=(args.log_dir or args.output_directory), | ||||
|         nCores=args.n_cores | ||||
|     ) | ||||
|     sys.exit(retval) | ||||
|  | ||||
|  | ||||
|   | ||||
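The central change in this file is how task dependencies are expressed: instead of every stage waiting on the full list of tasks from the previous stage (mkdir_jobs, pdftoppm_jobs, tesseract_jobs, ...), each new task depends on the label of the corresponding step for the same job (e.g. 'setup_output_directory_-_{i}', 'post_binarization_-_{i}-{n}'), while the global waitForTasks() calls remain only where the next stage has to list already-produced output files. A minimal sketch of that per-job label pattern, with placeholder commands and assuming pyflow (WorkflowRunner, addTask) is installed as used above, might look like this:

# Minimal sketch of the per-job dependency pattern introduced above; the
# commands and job paths are placeholders, not taken from the pipeline.
import sys

from pyflow import WorkflowRunner


class SketchPipeline(WorkflowRunner):
    def __init__(self, jobs):
        self.jobs = jobs

    def workflow(self):
        for i, job in enumerate(self.jobs):
            # Stage 1: one setup task per job, addressed by its label.
            setup_lbl = 'setup_output_directory_-_{}'.format(i)
            self.addTask(label=setup_lbl,
                         command='mkdir -p "{}/tmp"'.format(job))
            # Stage 2: depends only on this job's setup task (a label string),
            # not on the list of all stage-1 tasks, so the per-job chains do
            # not block each other between stages.
            split_lbl = 'split_input_-_{}'.format(i)
            self.addTask(label=split_lbl,
                         command='echo "split {}"'.format(job),
                         dependencies=setup_lbl)


if __name__ == '__main__':
    sys.exit(SketchPipeline(['/tmp/job0', '/tmp/job1']).run(nCores=2))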
							
								
								
									
wrapper/ocr (16 changed lines)
							| @@ -12,17 +12,21 @@ UID = str(os.getuid()) | ||||
| GID = str(os.getgid()) | ||||
|  | ||||
| parser = ArgumentParser(add_help=False) | ||||
| parser.add_argument('-i') | ||||
| parser.add_argument('-o') | ||||
| parser.add_argument('-i', '--input-directory') | ||||
| parser.add_argument('-o', '--output-directory') | ||||
| args, remaining_args = parser.parse_known_args() | ||||
|  | ||||
| cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] | ||||
| if args.o is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.o), CONTAINER_OUTPUT_DIR)] | ||||
| if args.output_directory is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory), | ||||
|                                  CONTAINER_OUTPUT_DIR)] | ||||
|     remaining_args.insert(0, CONTAINER_OUTPUT_DIR) | ||||
| if args.i is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.i), CONTAINER_INPUT_DIR)] | ||||
|     remaining_args.insert(0, '-o') | ||||
| if args.input_directory is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory), | ||||
|                                  CONTAINER_INPUT_DIR)] | ||||
|     remaining_args.insert(0, CONTAINER_INPUT_DIR) | ||||
|     remaining_args.insert(0, '-i') | ||||
| cmd.append(CONTAINER_IMAGE) | ||||
| cmd += remaining_args | ||||
|  | ||||
|   | ||||
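The wrapper follows the same renaming: it now recognises -i/--input-directory and -o/--output-directory by name, mounts them into the container, and re-inserts the container-side paths ahead of the passthrough arguments. A self-contained sketch of this parse_known_args forwarding pattern (the image name and container paths below are placeholders, and the real wrapper's -u UID:GID option is left out for brevity) could look like this:

# Standalone sketch of the wrapper's argument-forwarding pattern; constants
# here are assumptions, not the wrapper's real values.
import os
from argparse import ArgumentParser

CONTAINER_IMAGE = 'example/ocr:latest'   # placeholder
CONTAINER_INPUT_DIR = '/input'           # placeholder
CONTAINER_OUTPUT_DIR = '/output'         # placeholder

parser = ArgumentParser(add_help=False)
parser.add_argument('-i', '--input-directory')
parser.add_argument('-o', '--output-directory')
# Example invocation; unknown options such as -l and --binarize pass through.
args, remaining_args = parser.parse_known_args(
    ['-i', './pdfs', '-o', './results', '-l', 'eng', '--binarize'])

cmd = ['docker', 'run', '--rm', '-it']
if args.output_directory is not None:
    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
                                 CONTAINER_OUTPUT_DIR)]
    remaining_args = ['-o', CONTAINER_OUTPUT_DIR] + remaining_args
if args.input_directory is not None:
    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory),
                                 CONTAINER_INPUT_DIR)]
    remaining_args = ['-i', CONTAINER_INPUT_DIR] + remaining_args
cmd.append(CONTAINER_IMAGE)
cmd += remaining_args
print(' '.join(cmd))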