Add PoCo zips part 1

This commit is contained in:
Stephan Porada 2020-06-09 16:58:22 +02:00
parent 64fe706126
commit 018939ae55

89
ocr
View File

@ -67,12 +67,16 @@ class OCRPipeline(WorkflowRunner):
' # mkdir_jobs #
' ##################################################
'''
mkdir_jobs = []
for i, job in enumerate(self.jobs):
output_dir = os.path.join(job.output_dir, 'tmp')
poco_dir = os.path.join(job.output_dir, 'PoCo')
cmd = 'mkdir'
cmd += ' -p'
cmd += ' "{}"'.format(output_dir)
cmd += ' "{}"'.format(os.path.join(poco_dir, 'hocr'))
cmd += ' "{}"'.format(os.path.join(poco_dir, 'tiff'))
if self.keep_intermediates:
cmd += ' "{}"'.format(os.path.join(output_dir, 'hocr'))
cmd += ' "{}"'.format(os.path.join(output_dir, 'pdf'))
@ -239,6 +243,49 @@ class OCRPipeline(WorkflowRunner):
hocrtotei_jobs.append(self.addTask(command=cmd, dependencies=deps,
label=lbl))
'''
' ##################################################
' # hocr_poco_jobs #
' ##################################################
'''
# One copy task per job: copy every '.hocr' file found in the job's tmp
# directory into the job's PoCo/hocr directory.
hocr_poco_jobs = []
for i, job in enumerate(self.jobs):
    input_dir = os.path.join(job.output_dir, 'tmp')
    # Order the files by the first number in their name. sorted() and the
    # list comprehension are used instead of filter()/map() + .sort() so
    # the code also works where filter()/map() return iterators.
    files = sorted(
        (x for x in os.listdir(input_dir) if x.endswith('.hocr')),
        key=lambda x: int(re.search(r'\d+', x).group(0))
    )
    files = [os.path.join(input_dir, x) for x in files]
    output_path = os.path.join(job.output_dir, 'PoCo', 'hocr')
    cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path)
    # Depend on every task in tesseract_jobs whose label starts with this
    # job's 'ocr_job_-_<i>' prefix.
    # NOTE(review): being a prefix match, index 1 also matches labels of
    # jobs 10, 11, ... if there are that many jobs - confirm this is fine.
    deps = [x for x in tesseract_jobs
            if x.startswith('ocr_job_-_{}'.format(i))]
    # NOTE(review): this label has no underscore before the dash, unlike
    # e.g. 'mv_job_-_<i>'. It is kept unchanged because tasks are referenced
    # by their exact label string.
    lbl = 'hocr_poco_jobs-_{}'.format(i)
    hocr_poco_jobs.append(self.addTask(command=cmd, dependencies=deps,
                                       label=lbl))
'''
' ##################################################
' # tiff_poco_jobs #
' ##################################################
'''
# One copy task per job: copy every '.tif' file found in the job's tmp
# directory into the job's PoCo/tiff directory.
tiff_poco_jobs = []
for i, job in enumerate(self.jobs):
    input_dir = os.path.join(job.output_dir, 'tmp')
    # Order the files by the first number in their name. sorted() and the
    # list comprehension are used instead of filter()/map() + .sort() so
    # the code also works where filter()/map() return iterators.
    files = sorted(
        (x for x in os.listdir(input_dir) if x.endswith('.tif')),
        key=lambda x: int(re.search(r'\d+', x).group(0))
    )
    files = [os.path.join(input_dir, x) for x in files]
    output_path = os.path.join(job.output_dir, 'PoCo', 'tiff')
    cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path)
    # Depend on every task in tesseract_jobs whose label starts with this
    # job's 'ocr_job_-_<i>' prefix.
    # NOTE(review): being a prefix match, index 1 also matches labels of
    # jobs 10, 11, ... if there are that many jobs - confirm this is fine.
    deps = [x for x in tesseract_jobs
            if x.startswith('ocr_job_-_{}'.format(i))]
    # NOTE(review): this label has no underscore before the dash, unlike
    # e.g. 'mv_job_-_<i>'. It is kept unchanged because tasks are referenced
    # by their exact label string.
    lbl = 'tiff_poco_jobs-_{}'.format(i)
    tiff_poco_jobs.append(self.addTask(command=cmd, dependencies=deps,
                                       label=lbl))
'''
' ##################################################
' # pdfunite_jobs #
@ -290,20 +337,25 @@ class OCRPipeline(WorkflowRunner):
# Normalise the archive base name: drop a trailing '.zip' and fall back to
# 'output' when nothing is left.
if self.zip.endswith('.zip'):
    self.zip = self.zip[:-4]
self.zip = self.zip if self.zip else 'output'
# zip ALL
# Build a single "<name>.all.zip" inside self.output_dir. The command must
# name exactly one archive and carry one include list; the superseded
# '"{}".zip' and shorter '-i' lines were dropped so zip does not receive
# two archive arguments.
cmd = 'cd "{}"'.format(self.output_dir)
cmd += ' && '
cmd += 'zip'
cmd += ' -r'
cmd += ' "{}".all.zip .'.format(self.zip)
cmd += ' -x "pyflow.data*" "*tmp*"'
cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.tif"'
cmd += ' && '
cmd += 'cd -'
# Wait for every task that produces files matched by the include list,
# including the PoCo copy tasks that place the .hocr and .tif files.
deps = (hocrtotei_jobs
        + pdfunite_jobs
        + cat_jobs
        + hocr_poco_jobs
        + tiff_poco_jobs)
lbl = 'zip_job_-_all'
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
                             label=lbl))
# zip PDFs
cmd = 'cd "{}"'.format(self.output_dir)
cmd += ' && '
cmd += 'zip'
@ -317,7 +369,7 @@ class OCRPipeline(WorkflowRunner):
lbl = 'zip_job_-_pdf'
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
label=lbl))
# zip TXTs
cmd = 'cd "{}"'.format(self.output_dir)
cmd += ' && '
cmd += 'zip'
@ -331,7 +383,7 @@ class OCRPipeline(WorkflowRunner):
lbl = 'zip_job_-_txt'
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
label=lbl))
# zip XMLs
cmd = 'cd "{}"'.format(self.output_dir)
cmd += ' && '
cmd += 'zip'
@ -345,6 +397,27 @@ class OCRPipeline(WorkflowRunner):
lbl = 'zip_job_-_xml'
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
label=lbl))
# zip PoCo files
# TODO: Fix relative paths before?
# Collect "<job directory name>/PoCo" for every job. The paths are relative
# because the command below changes into self.output_dir first.
# NOTE(review): assumes each job.output_dir is a direct child of
# self.output_dir - confirm against where job.output_dir is set.
poco_paths = [os.path.join(os.path.basename(job.output_dir), 'PoCo')
              for job in self.jobs]
cmd = 'cd "{}"'.format(self.output_dir)
cmd += ' && '
cmd += 'zip'
cmd += ' -r'
cmd += ' "{}".poco.zip'.format(self.zip)
cmd += ' "{}"'.format('" "'.join(poco_paths))
cmd += ' && '
cmd += 'cd -'
# Runs after the task labelled 'zip_job_-_all'.
deps = 'zip_job_-_all'
# There is exactly one PoCo zip task, so it gets a fixed label like the
# other zip tasks instead of one derived from a leftover loop index (which
# varied with the number of jobs and was undefined for an empty job list).
lbl = 'zip_job_-_poco'
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
                             label=lbl))
'''
' ##################################################
@ -371,7 +444,9 @@ class OCRPipeline(WorkflowRunner):
cmd += 'mv "{}"/*.nrm.png "{}"'.format(input_dir, os.path.join(output_dir, 'nrm.png')) # noqa
# Labels of the per-job tasks this task waits for.
# The fields must be positional '{}': '{i}'.format(i) looks up a keyword
# argument named i and raises KeyError when i is passed positionally.
# The two PoCo labels are spelled '<name>-_<i>' (no underscore before the
# dash) because that is the spelling those tasks are registered under.
deps = ['hocrtotei_job_-_{}'.format(i),
        'pdfunite_job_-_{}'.format(i),
        'cat_job_-_{}'.format(i),
        'tiff_poco_jobs-_{}'.format(i),
        'hocr_poco_jobs-_{}'.format(i)]
lbl = 'mv_job_-_{}'.format(i)
mv_jobs.append(self.addTask(command=cmd, dependencies=deps,
label=lbl))