mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 12:42:44 +00:00 
			
		
		
		
	Cleanup and change some output options
This commit is contained in:
		
							
								
								
									
										81
									
								
								ocr
									
									
									
									
									
								
							
							
						
						
									
										81
									
								
								ocr
									
									
									
									
									
								
							| @@ -29,7 +29,6 @@ class PipelineJob: | ||||
|         self.file = file | ||||
|         self.name = os.path.basename(file)[:-4] | ||||
|         self.output_dir = output_dir | ||||
|         self.output_files = [] | ||||
|         self.tmp_dir = os.path.join(output_dir, 'tmp') | ||||
|  | ||||
|  | ||||
| @@ -325,12 +324,40 @@ class CreateTEIWorkflow(WorkflowRunner): | ||||
|         cmd = 'hocr2tei "{}.hocr"'.format( | ||||
|             os.path.join(self.job.output_dir, self.job.name) | ||||
|         ) | ||||
|         cmd += ' --output-file "{}.xml"'.format( | ||||
|         cmd += ' --output-file "{}.tei.xml"'.format( | ||||
|             os.path.join(self.job.output_dir, self.job.name) | ||||
|         ) | ||||
|         self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores) | ||||
|  | ||||
|  | ||||
| class CreatePoCoZipWorkflow(WorkflowRunner): | ||||
|     def __init__(self, job): | ||||
|         self.job = job | ||||
|  | ||||
|     def workflow(self): | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # zip                                            # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         n_cores = 1 | ||||
|         mem_mb = min(512, self.getMemMb()) | ||||
|         zip_tasks = [] | ||||
|         cmd = 'cd "{}"'.format(self.job.output_dir) | ||||
|         cmd += ' && ' | ||||
|         cmd += 'zip' | ||||
|         cmd += ' -r' | ||||
|         cmd += ' -m' | ||||
|         cmd += ' "{}.poco.zip" .'.format(self.job.name) | ||||
|         cmd += ' -i "images/*.png" "{}.hocr"'.format(self.job.name) | ||||
|         cmd += ' && ' | ||||
|         cmd += 'rm -r images' | ||||
|         cmd += ' && ' | ||||
|         cmd += 'cd -' | ||||
|         task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores) | ||||
|         zip_tasks.append(task) | ||||
|  | ||||
|  | ||||
| class CreateTxtWorkflow(WorkflowRunner): | ||||
|     def __init__(self, job): | ||||
|         self.job = job | ||||
| @@ -369,8 +396,11 @@ class MainWorkflow(WorkflowRunner): | ||||
|                 continue | ||||
|             if not file.lower().endswith('.pdf'): | ||||
|                 continue | ||||
|             self.jobs.append(PipelineJob(os.path.join(self.input_dir, file), | ||||
|                                          os.path.join(self.output_dir, file))) | ||||
|             job = PipelineJob( | ||||
|                 os.path.join(self.input_dir, file), | ||||
|                 os.path.join(self.output_dir, file) | ||||
|             ) | ||||
|             self.jobs.append(job) | ||||
|  | ||||
|     def workflow(self): | ||||
|         if not self.jobs: | ||||
| @@ -469,6 +499,20 @@ class MainWorkflow(WorkflowRunner): | ||||
|             ) | ||||
|             create_tei_tasks.append(task) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # create-poco-zip                                # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         create_poco_zip_tasks = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             task = self.addWorkflowTask( | ||||
|                 'create_poco_zip_-_{}'.format(i), | ||||
|                 CreatePoCoZipWorkflow(job), | ||||
|                 dependencies='create_tei_-_{}'.format(i) | ||||
|             ) | ||||
|             create_poco_zip_tasks.append(task) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # create-txt                                     # | ||||
| @@ -488,45 +532,36 @@ class MainWorkflow(WorkflowRunner): | ||||
|             # Remove temporary directory | ||||
|             os.rmdir(job.tmp_dir) | ||||
|             # Track output files | ||||
|             relative_input = os.path.relpath(job.file, start=self.input_dir) | ||||
|             relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir)  # noqa | ||||
|             for x in os.listdir(os.path.join(job.output_dir, 'images')): | ||||
|                 self.output_files.append( | ||||
|                     { | ||||
|                         'input': relative_input, | ||||
|                         'path': os.path.join(relative_output_dir, 'images', x), | ||||
|                         'mimetype': 'image/png' | ||||
|                     } | ||||
|                 ) | ||||
|             self.output_files.append( | ||||
|                 { | ||||
|                     'input': relative_input, | ||||
|                     'path': os.path.join(relative_output_dir, '{}.hocr'.format(job.name)),  # noqa | ||||
|                     'mimetype': 'application/xhtml+xml' | ||||
|                     'description': 'Post correction package (.png and .hocr).', | ||||
|                     'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)),  # noqa | ||||
|                     'mimetype': 'application/zip' | ||||
|                 } | ||||
|             ) | ||||
|             self.output_files.append( | ||||
|                 { | ||||
|                     'input': relative_input, | ||||
|                     'filename': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)),  # noqa | ||||
|                     'description': 'PDF file with text layer.', | ||||
|                     'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)),  # noqa | ||||
|                     'mimetype': 'application/pdf' | ||||
|                 } | ||||
|             ) | ||||
|             self.output_files.append( | ||||
|                 { | ||||
|                     'input': relative_input, | ||||
|                     'filename': os.path.join(relative_output_dir, '{}.txt'.format(job.name)),  # noqa | ||||
|                     'description': 'Plain text file.', | ||||
|                     'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)),  # noqa | ||||
|                     'mimetype': 'text/plain' | ||||
|                 } | ||||
|             ) | ||||
|             self.output_files.append( | ||||
|                 { | ||||
|                     'input': relative_input, | ||||
|                     'filename': os.path.join(relative_output_dir, '{}.xml'.format(job.name)),  # noqa | ||||
|                     'description': 'TEI compliant XML file.', | ||||
|                     'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)),  # noqa | ||||
|                     'mimetype': 'application/tei+xml' | ||||
|                 } | ||||
|             ) | ||||
|         with open(os.path.join(self.output_dir, 'output_files.json'), 'w') as f: | ||||
|         with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f:  # noqa | ||||
|             json.dump(self.output_files, f, indent=4) | ||||
|  | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user