mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 20:03:14 +00:00 
			
		
		
		
	Add PoCo zips part 1
This commit is contained in:
		
							
								
								
									
										89
									
								
								ocr
									
									
									
									
									
								
							
							
						
						
									
										89
									
								
								ocr
									
									
									
									
									
								
							| @@ -67,12 +67,16 @@ class OCRPipeline(WorkflowRunner): | ||||
|         ' # mkdir_jobs                                     # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|  | ||||
|         mkdir_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             output_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             poco_dir = os.path.join(job.output_dir, 'PoCo') | ||||
|             cmd = 'mkdir' | ||||
|             cmd += ' -p' | ||||
|             cmd += ' "{}"'.format(output_dir) | ||||
|             cmd += ' "{}"'.format(os.path.join(poco_dir, 'hocr')) | ||||
|             cmd += ' "{}"'.format(os.path.join(poco_dir, 'tiff')) | ||||
|             if self.keep_intermediates: | ||||
|                 cmd += ' "{}"'.format(os.path.join(output_dir, 'hocr')) | ||||
|                 cmd += ' "{}"'.format(os.path.join(output_dir, 'pdf')) | ||||
| @@ -239,6 +243,49 @@ class OCRPipeline(WorkflowRunner): | ||||
|             hocrtotei_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                                label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # hocr_poco_jobs                                 # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|  | ||||
|         hocr_poco_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             files = filter(lambda x: x.endswith('.hocr'), | ||||
|                            os.listdir(input_dir)) | ||||
|             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) | ||||
|             files = map(lambda x: os.path.join(input_dir, x), files) | ||||
|             output_path_base = os.path.join(job.output_dir, 'PoCo') | ||||
|             output_path = os.path.join(output_path_base, 'hocr') | ||||
|             cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path) | ||||
|             deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)), | ||||
|                           tesseract_jobs) | ||||
|             lbl = 'hocr_poco_jobs-_{}'.format(i) | ||||
|             hocr_poco_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                                label=lbl)) | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # tiff_poco_jobs                                 # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|  | ||||
|         tiff_poco_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             input_dir = os.path.join(job.output_dir, 'tmp') | ||||
|             files = filter(lambda x: x.endswith('.tif'), | ||||
|                            os.listdir(input_dir)) | ||||
|             files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) | ||||
|             files = map(lambda x: os.path.join(input_dir, x), files) | ||||
|             output_path_base = os.path.join(job.output_dir, 'PoCo') | ||||
|             output_path = os.path.join(output_path_base, 'tiff') | ||||
|             cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path) | ||||
|             deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)), | ||||
|                           tesseract_jobs) | ||||
|             lbl = 'tiff_poco_jobs-_{}'.format(i) | ||||
|             tiff_poco_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                                label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # pdfunite_jobs                                  # | ||||
| @@ -290,20 +337,25 @@ class OCRPipeline(WorkflowRunner): | ||||
|             if self.zip.endswith('.zip'): | ||||
|                 self.zip = self.zip[:-4] | ||||
|                 self.zip = self.zip if self.zip else 'output' | ||||
|         # zip ALL | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
|             cmd += ' -r' | ||||
|             cmd += ' "{}".zip .'.format(self.zip) | ||||
|             cmd += ' "{}".all.zip .'.format(self.zip) | ||||
|             cmd += ' -x "pyflow.data*" "*tmp*"' | ||||
|             cmd += ' -i "*.pdf" "*.txt" "*.xml"' | ||||
|             cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.tif"' | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             deps = hocrtotei_jobs + pdfunite_jobs + cat_jobs | ||||
|             deps = (hocrtotei_jobs | ||||
|                     + pdfunite_jobs | ||||
|                     + cat_jobs | ||||
|                     + hocr_poco_jobs | ||||
|                     + tiff_poco_jobs) | ||||
|             lbl = 'zip_job_-_all' | ||||
|             zip_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|  | ||||
|         # zip PDFs | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
| @@ -317,7 +369,7 @@ class OCRPipeline(WorkflowRunner): | ||||
|             lbl = 'zip_job_-_pdf' | ||||
|             zip_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|  | ||||
|         # zip TXTs | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
| @@ -331,7 +383,7 @@ class OCRPipeline(WorkflowRunner): | ||||
|             lbl = 'zip_job_-_txt' | ||||
|             zip_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|  | ||||
|         # zip XMLs | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
| @@ -345,6 +397,27 @@ class OCRPipeline(WorkflowRunner): | ||||
|             lbl = 'zip_job_-_xml' | ||||
|             zip_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|         # zip PoCo files | ||||
|         # TODO: Should the relative paths be fixed before zipping? | ||||
|             poco_paths = [] | ||||
|             poco_names = [] | ||||
|             for i, job in enumerate(self.jobs): | ||||
|                 poco_paths.append(os.path.join(os.path.basename(job.output_dir),  # noqa | ||||
|                                                                 'PoCo')) | ||||
|                 poco_names.append(job.output_dir) | ||||
|  | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
|             cmd += ' -r' | ||||
|             cmd += ' "{}".poco.zip'.format(self.zip) | ||||
|             cmd += ' "{}"'.format('" "'.join(poco_paths)) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             deps = 'zip_job_-_all' | ||||
|             lbl = 'zip_job_-_poco_{}'.format(i) | ||||
|             zip_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
| @@ -371,7 +444,9 @@ class OCRPipeline(WorkflowRunner): | ||||
|                     cmd += 'mv "{}"/*.nrm.png "{}"'.format(input_dir, os.path.join(output_dir, 'nrm.png'))  # noqa | ||||
|                 deps = ['hocrtotei_job_-_{}'.format(i), | ||||
|                         'pdfunite_job_-_{}'.format(i), | ||||
|                         'cat_job_-_{}'.format(i)] | ||||
|                         'cat_job_-_{}'.format(i), | ||||
|                         'tiff_poco_jobs_-_{i}'.format(i), | ||||
|                         'hocr_poco_jobs_-_{i}'.format(i)] | ||||
|                 lbl = 'mv_job_-_{}'.format(i) | ||||
|                 mv_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                             label=lbl)) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user