mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-26 06:14:19 +00:00
Add PoCo zips part 1
This commit is contained in:
parent
64fe706126
commit
018939ae55
89
ocr
89
ocr
@ -67,12 +67,16 @@ class OCRPipeline(WorkflowRunner):
|
|||||||
' # mkdir_jobs #
|
' # mkdir_jobs #
|
||||||
' ##################################################
|
' ##################################################
|
||||||
'''
|
'''
|
||||||
|
|
||||||
mkdir_jobs = []
|
mkdir_jobs = []
|
||||||
for i, job in enumerate(self.jobs):
|
for i, job in enumerate(self.jobs):
|
||||||
output_dir = os.path.join(job.output_dir, 'tmp')
|
output_dir = os.path.join(job.output_dir, 'tmp')
|
||||||
|
poco_dir = os.path.join(job.output_dir, 'PoCo')
|
||||||
cmd = 'mkdir'
|
cmd = 'mkdir'
|
||||||
cmd += ' -p'
|
cmd += ' -p'
|
||||||
cmd += ' "{}"'.format(output_dir)
|
cmd += ' "{}"'.format(output_dir)
|
||||||
|
cmd += ' "{}"'.format(os.path.join(poco_dir, 'hocr'))
|
||||||
|
cmd += ' "{}"'.format(os.path.join(poco_dir, 'tiff'))
|
||||||
if self.keep_intermediates:
|
if self.keep_intermediates:
|
||||||
cmd += ' "{}"'.format(os.path.join(output_dir, 'hocr'))
|
cmd += ' "{}"'.format(os.path.join(output_dir, 'hocr'))
|
||||||
cmd += ' "{}"'.format(os.path.join(output_dir, 'pdf'))
|
cmd += ' "{}"'.format(os.path.join(output_dir, 'pdf'))
|
||||||
@ -239,6 +243,49 @@ class OCRPipeline(WorkflowRunner):
|
|||||||
hocrtotei_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
hocrtotei_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||||
label=lbl))
|
label=lbl))
|
||||||
|
|
||||||
|
'''
|
||||||
|
' ##################################################
|
||||||
|
' # hocr_poco_jobs #
|
||||||
|
' ##################################################
|
||||||
|
'''
|
||||||
|
|
||||||
|
hocr_poco_jobs = []
|
||||||
|
for i, job in enumerate(self.jobs):
|
||||||
|
input_dir = os.path.join(job.output_dir, 'tmp')
|
||||||
|
files = filter(lambda x: x.endswith('.hocr'),
|
||||||
|
os.listdir(input_dir))
|
||||||
|
files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
|
||||||
|
files = map(lambda x: os.path.join(input_dir, x), files)
|
||||||
|
output_path_base = os.path.join(job.output_dir, 'PoCo')
|
||||||
|
output_path = os.path.join(output_path_base, 'hocr')
|
||||||
|
cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path)
|
||||||
|
deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)),
|
||||||
|
tesseract_jobs)
|
||||||
|
lbl = 'hocr_poco_jobs-_{}'.format(i)
|
||||||
|
hocr_poco_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||||
|
label=lbl))
|
||||||
|
'''
|
||||||
|
' ##################################################
|
||||||
|
' # tiff_poco_jobs #
|
||||||
|
' ##################################################
|
||||||
|
'''
|
||||||
|
|
||||||
|
tiff_poco_jobs = []
|
||||||
|
for i, job in enumerate(self.jobs):
|
||||||
|
input_dir = os.path.join(job.output_dir, 'tmp')
|
||||||
|
files = filter(lambda x: x.endswith('.tif'),
|
||||||
|
os.listdir(input_dir))
|
||||||
|
files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
|
||||||
|
files = map(lambda x: os.path.join(input_dir, x), files)
|
||||||
|
output_path_base = os.path.join(job.output_dir, 'PoCo')
|
||||||
|
output_path = os.path.join(output_path_base, 'tiff')
|
||||||
|
cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path)
|
||||||
|
deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)),
|
||||||
|
tesseract_jobs)
|
||||||
|
lbl = 'tiff_poco_jobs-_{}'.format(i)
|
||||||
|
tiff_poco_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||||
|
label=lbl))
|
||||||
|
|
||||||
'''
|
'''
|
||||||
' ##################################################
|
' ##################################################
|
||||||
' # pdfunite_jobs #
|
' # pdfunite_jobs #
|
||||||
@ -290,20 +337,25 @@ class OCRPipeline(WorkflowRunner):
|
|||||||
if self.zip.endswith('.zip'):
|
if self.zip.endswith('.zip'):
|
||||||
self.zip = self.zip[:-4]
|
self.zip = self.zip[:-4]
|
||||||
self.zip = self.zip if self.zip else 'output'
|
self.zip = self.zip if self.zip else 'output'
|
||||||
|
# zip ALL
|
||||||
cmd = 'cd "{}"'.format(self.output_dir)
|
cmd = 'cd "{}"'.format(self.output_dir)
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'zip'
|
cmd += 'zip'
|
||||||
cmd += ' -r'
|
cmd += ' -r'
|
||||||
cmd += ' "{}".zip .'.format(self.zip)
|
cmd += ' "{}".all.zip .'.format(self.zip)
|
||||||
cmd += ' -x "pyflow.data*" "*tmp*"'
|
cmd += ' -x "pyflow.data*" "*tmp*"'
|
||||||
cmd += ' -i "*.pdf" "*.txt" "*.xml"'
|
cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.tif"'
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'cd -'
|
cmd += 'cd -'
|
||||||
deps = hocrtotei_jobs + pdfunite_jobs + cat_jobs
|
deps = (hocrtotei_jobs
|
||||||
|
+ pdfunite_jobs
|
||||||
|
+ cat_jobs
|
||||||
|
+ hocr_poco_jobs
|
||||||
|
+ tiff_poco_jobs)
|
||||||
lbl = 'zip_job_-_all'
|
lbl = 'zip_job_-_all'
|
||||||
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||||
label=lbl))
|
label=lbl))
|
||||||
|
# zip PDFs
|
||||||
cmd = 'cd "{}"'.format(self.output_dir)
|
cmd = 'cd "{}"'.format(self.output_dir)
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'zip'
|
cmd += 'zip'
|
||||||
@ -317,7 +369,7 @@ class OCRPipeline(WorkflowRunner):
|
|||||||
lbl = 'zip_job_-_pdf'
|
lbl = 'zip_job_-_pdf'
|
||||||
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||||
label=lbl))
|
label=lbl))
|
||||||
|
# zip TXTs
|
||||||
cmd = 'cd "{}"'.format(self.output_dir)
|
cmd = 'cd "{}"'.format(self.output_dir)
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'zip'
|
cmd += 'zip'
|
||||||
@ -331,7 +383,7 @@ class OCRPipeline(WorkflowRunner):
|
|||||||
lbl = 'zip_job_-_txt'
|
lbl = 'zip_job_-_txt'
|
||||||
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||||
label=lbl))
|
label=lbl))
|
||||||
|
# zip XMLs
|
||||||
cmd = 'cd "{}"'.format(self.output_dir)
|
cmd = 'cd "{}"'.format(self.output_dir)
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'zip'
|
cmd += 'zip'
|
||||||
@ -345,6 +397,27 @@ class OCRPipeline(WorkflowRunner):
|
|||||||
lbl = 'zip_job_-_xml'
|
lbl = 'zip_job_-_xml'
|
||||||
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||||
label=lbl))
|
label=lbl))
|
||||||
|
# zip PoCo files
|
||||||
|
# TODO: Fix relative paths before?
|
||||||
|
poco_paths = []
|
||||||
|
poco_names = []
|
||||||
|
for i, job in enumerate(self.jobs):
|
||||||
|
poco_paths.append(os.path.join(os.path.basename(job.output_dir), # noqa
|
||||||
|
'PoCo'))
|
||||||
|
poco_names.append(job.output_dir)
|
||||||
|
|
||||||
|
cmd = 'cd "{}"'.format(self.output_dir)
|
||||||
|
cmd += ' && '
|
||||||
|
cmd += 'zip'
|
||||||
|
cmd += ' -r'
|
||||||
|
cmd += ' "{}".poco.zip'.format(self.zip)
|
||||||
|
cmd += ' "{}"'.format('" "'.join(poco_paths))
|
||||||
|
cmd += ' && '
|
||||||
|
cmd += 'cd -'
|
||||||
|
deps = 'zip_job_-_all'
|
||||||
|
lbl = 'zip_job_-_poco_{}'.format(i)
|
||||||
|
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||||
|
label=lbl))
|
||||||
|
|
||||||
'''
|
'''
|
||||||
' ##################################################
|
' ##################################################
|
||||||
@ -371,7 +444,9 @@ class OCRPipeline(WorkflowRunner):
|
|||||||
cmd += 'mv "{}"/*.nrm.png "{}"'.format(input_dir, os.path.join(output_dir, 'nrm.png')) # noqa
|
cmd += 'mv "{}"/*.nrm.png "{}"'.format(input_dir, os.path.join(output_dir, 'nrm.png')) # noqa
|
||||||
deps = ['hocrtotei_job_-_{}'.format(i),
|
deps = ['hocrtotei_job_-_{}'.format(i),
|
||||||
'pdfunite_job_-_{}'.format(i),
|
'pdfunite_job_-_{}'.format(i),
|
||||||
'cat_job_-_{}'.format(i)]
|
'cat_job_-_{}'.format(i),
|
||||||
|
'tiff_poco_jobs_-_{i}'.format(i),
|
||||||
|
'hocr_poco_jobs_-_{i}'.format(i)]
|
||||||
lbl = 'mv_job_-_{}'.format(i)
|
lbl = 'mv_job_-_{}'.format(i)
|
||||||
mv_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
mv_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||||
label=lbl))
|
label=lbl))
|
||||||
|
Loading…
Reference in New Issue
Block a user