mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 19:53:16 +00:00 
			
		
		
		
	Cleanup and change some output options
This commit is contained in:
		
							
								
								
									
										81
									
								
								ocr
									
									
									
									
									
								
							
							
						
						
									
										81
									
								
								ocr
									
									
									
									
									
								
							| @@ -29,7 +29,6 @@ class PipelineJob: | |||||||
|         self.file = file |         self.file = file | ||||||
|         self.name = os.path.basename(file)[:-4] |         self.name = os.path.basename(file)[:-4] | ||||||
|         self.output_dir = output_dir |         self.output_dir = output_dir | ||||||
|         self.output_files = [] |  | ||||||
|         self.tmp_dir = os.path.join(output_dir, 'tmp') |         self.tmp_dir = os.path.join(output_dir, 'tmp') | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -325,12 +324,40 @@ class CreateTEIWorkflow(WorkflowRunner): | |||||||
|         cmd = 'hocr2tei "{}.hocr"'.format( |         cmd = 'hocr2tei "{}.hocr"'.format( | ||||||
|             os.path.join(self.job.output_dir, self.job.name) |             os.path.join(self.job.output_dir, self.job.name) | ||||||
|         ) |         ) | ||||||
|         cmd += ' --output-file "{}.xml"'.format( |         cmd += ' --output-file "{}.tei.xml"'.format( | ||||||
|             os.path.join(self.job.output_dir, self.job.name) |             os.path.join(self.job.output_dir, self.job.name) | ||||||
|         ) |         ) | ||||||
|         self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores) |         self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class CreatePoCoZipWorkflow(WorkflowRunner): | ||||||
|  |     def __init__(self, job): | ||||||
|  |         self.job = job | ||||||
|  |  | ||||||
|  |     def workflow(self): | ||||||
|  |         ''' | ||||||
|  |         ' ################################################## | ||||||
|  |         ' # zip                                            # | ||||||
|  |         ' ################################################## | ||||||
|  |         ''' | ||||||
|  |         n_cores = 1 | ||||||
|  |         mem_mb = min(512, self.getMemMb()) | ||||||
|  |         zip_tasks = [] | ||||||
|  |         cmd = 'cd "{}"'.format(self.job.output_dir) | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'zip' | ||||||
|  |         cmd += ' -r' | ||||||
|  |         cmd += ' -m' | ||||||
|  |         cmd += ' "{}.poco.zip" .'.format(self.job.name) | ||||||
|  |         cmd += ' -i "images/*.png" "{}.hocr"'.format(self.job.name) | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'rm -r images' | ||||||
|  |         cmd += ' && ' | ||||||
|  |         cmd += 'cd -' | ||||||
|  |         task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores) | ||||||
|  |         zip_tasks.append(task) | ||||||
|  |  | ||||||
|  |  | ||||||
| class CreateTxtWorkflow(WorkflowRunner): | class CreateTxtWorkflow(WorkflowRunner): | ||||||
|     def __init__(self, job): |     def __init__(self, job): | ||||||
|         self.job = job |         self.job = job | ||||||
| @@ -369,8 +396,11 @@ class MainWorkflow(WorkflowRunner): | |||||||
|                 continue |                 continue | ||||||
|             if not file.lower().endswith('.pdf'): |             if not file.lower().endswith('.pdf'): | ||||||
|                 continue |                 continue | ||||||
|             self.jobs.append(PipelineJob(os.path.join(self.input_dir, file), |             job = PipelineJob( | ||||||
|                                          os.path.join(self.output_dir, file))) |                 os.path.join(self.input_dir, file), | ||||||
|  |                 os.path.join(self.output_dir, file) | ||||||
|  |             ) | ||||||
|  |             self.jobs.append(job) | ||||||
|  |  | ||||||
|     def workflow(self): |     def workflow(self): | ||||||
|         if not self.jobs: |         if not self.jobs: | ||||||
| @@ -469,6 +499,20 @@ class MainWorkflow(WorkflowRunner): | |||||||
|             ) |             ) | ||||||
|             create_tei_tasks.append(task) |             create_tei_tasks.append(task) | ||||||
|  |  | ||||||
|  |         ''' | ||||||
|  |         ' ################################################## | ||||||
|  |         ' # create-poco-zip                                # | ||||||
|  |         ' ################################################## | ||||||
|  |         ''' | ||||||
|  |         create_poco_zip_tasks = [] | ||||||
|  |         for i, job in enumerate(self.jobs): | ||||||
|  |             task = self.addWorkflowTask( | ||||||
|  |                 'create_poco_zip_-_{}'.format(i), | ||||||
|  |                 CreatePoCoZipWorkflow(job), | ||||||
|  |                 dependencies='create_tei_-_{}'.format(i) | ||||||
|  |             ) | ||||||
|  |             create_poco_zip_tasks.append(task) | ||||||
|  |  | ||||||
|         ''' |         ''' | ||||||
|         ' ################################################## |         ' ################################################## | ||||||
|         ' # create-txt                                     # |         ' # create-txt                                     # | ||||||
| @@ -488,45 +532,36 @@ class MainWorkflow(WorkflowRunner): | |||||||
|             # Remove temporary directory |             # Remove temporary directory | ||||||
|             os.rmdir(job.tmp_dir) |             os.rmdir(job.tmp_dir) | ||||||
|             # Track output files |             # Track output files | ||||||
|             relative_input = os.path.relpath(job.file, start=self.input_dir) |  | ||||||
|             relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir)  # noqa |             relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir)  # noqa | ||||||
|             for x in os.listdir(os.path.join(job.output_dir, 'images')): |  | ||||||
|                 self.output_files.append( |  | ||||||
|                     { |  | ||||||
|                         'input': relative_input, |  | ||||||
|                         'path': os.path.join(relative_output_dir, 'images', x), |  | ||||||
|                         'mimetype': 'image/png' |  | ||||||
|                     } |  | ||||||
|                 ) |  | ||||||
|             self.output_files.append( |             self.output_files.append( | ||||||
|                 { |                 { | ||||||
|                     'input': relative_input, |                     'description': 'Post correction package (.png and .hocr).', | ||||||
|                     'path': os.path.join(relative_output_dir, '{}.hocr'.format(job.name)),  # noqa |                     'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)),  # noqa | ||||||
|                     'mimetype': 'application/xhtml+xml' |                     'mimetype': 'application/zip' | ||||||
|                 } |                 } | ||||||
|             ) |             ) | ||||||
|             self.output_files.append( |             self.output_files.append( | ||||||
|                 { |                 { | ||||||
|                     'input': relative_input, |                     'description': 'PDF file with text layer.', | ||||||
|                     'filename': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)),  # noqa |                     'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)),  # noqa | ||||||
|                     'mimetype': 'application/pdf' |                     'mimetype': 'application/pdf' | ||||||
|                 } |                 } | ||||||
|             ) |             ) | ||||||
|             self.output_files.append( |             self.output_files.append( | ||||||
|                 { |                 { | ||||||
|                     'input': relative_input, |                     'description': 'Plain text file.', | ||||||
|                     'filename': os.path.join(relative_output_dir, '{}.txt'.format(job.name)),  # noqa |                     'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)),  # noqa | ||||||
|                     'mimetype': 'text/plain' |                     'mimetype': 'text/plain' | ||||||
|                 } |                 } | ||||||
|             ) |             ) | ||||||
|             self.output_files.append( |             self.output_files.append( | ||||||
|                 { |                 { | ||||||
|                     'input': relative_input, |                     'description': 'TEI compliant XML file.', | ||||||
|                     'filename': os.path.join(relative_output_dir, '{}.xml'.format(job.name)),  # noqa |                     'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)),  # noqa | ||||||
|                     'mimetype': 'application/tei+xml' |                     'mimetype': 'application/tei+xml' | ||||||
|                 } |                 } | ||||||
|             ) |             ) | ||||||
|         with open(os.path.join(self.output_dir, 'output_files.json'), 'w') as f: |         with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f:  # noqa | ||||||
|             json.dump(self.output_files, f, indent=4) |             json.dump(self.output_files, f, indent=4) | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user