mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-25 18:24:18 +00:00
Add PoCo zips part 1
This commit is contained in:
parent
64fe706126
commit
018939ae55
89
ocr
89
ocr
@ -67,12 +67,16 @@ class OCRPipeline(WorkflowRunner):
|
||||
' # mkdir_jobs #
|
||||
' ##################################################
|
||||
'''
|
||||
|
||||
mkdir_jobs = []
|
||||
for i, job in enumerate(self.jobs):
|
||||
output_dir = os.path.join(job.output_dir, 'tmp')
|
||||
poco_dir = os.path.join(job.output_dir, 'PoCo')
|
||||
cmd = 'mkdir'
|
||||
cmd += ' -p'
|
||||
cmd += ' "{}"'.format(output_dir)
|
||||
cmd += ' "{}"'.format(os.path.join(poco_dir, 'hocr'))
|
||||
cmd += ' "{}"'.format(os.path.join(poco_dir, 'tiff'))
|
||||
if self.keep_intermediates:
|
||||
cmd += ' "{}"'.format(os.path.join(output_dir, 'hocr'))
|
||||
cmd += ' "{}"'.format(os.path.join(output_dir, 'pdf'))
|
||||
@ -239,6 +243,49 @@ class OCRPipeline(WorkflowRunner):
|
||||
hocrtotei_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||
label=lbl))
|
||||
|
||||
'''
|
||||
' ##################################################
|
||||
' # hocr_poco_jobs #
|
||||
' ##################################################
|
||||
'''
|
||||
|
||||
hocr_poco_jobs = []
|
||||
for i, job in enumerate(self.jobs):
|
||||
input_dir = os.path.join(job.output_dir, 'tmp')
|
||||
files = filter(lambda x: x.endswith('.hocr'),
|
||||
os.listdir(input_dir))
|
||||
files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
|
||||
files = map(lambda x: os.path.join(input_dir, x), files)
|
||||
output_path_base = os.path.join(job.output_dir, 'PoCo')
|
||||
output_path = os.path.join(output_path_base, 'hocr')
|
||||
cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path)
|
||||
deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)),
|
||||
tesseract_jobs)
|
||||
lbl = 'hocr_poco_jobs-_{}'.format(i)
|
||||
hocr_poco_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||
label=lbl))
|
||||
'''
|
||||
' ##################################################
|
||||
' # tiff_poco_jobs #
|
||||
' ##################################################
|
||||
'''
|
||||
|
||||
tiff_poco_jobs = []
|
||||
for i, job in enumerate(self.jobs):
|
||||
input_dir = os.path.join(job.output_dir, 'tmp')
|
||||
files = filter(lambda x: x.endswith('.tif'),
|
||||
os.listdir(input_dir))
|
||||
files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
|
||||
files = map(lambda x: os.path.join(input_dir, x), files)
|
||||
output_path_base = os.path.join(job.output_dir, 'PoCo')
|
||||
output_path = os.path.join(output_path_base, 'tiff')
|
||||
cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path)
|
||||
deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)),
|
||||
tesseract_jobs)
|
||||
lbl = 'tiff_poco_jobs-_{}'.format(i)
|
||||
tiff_poco_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||
label=lbl))
|
||||
|
||||
'''
|
||||
' ##################################################
|
||||
' # pdfunite_jobs #
|
||||
@ -290,20 +337,25 @@ class OCRPipeline(WorkflowRunner):
|
||||
if self.zip.endswith('.zip'):
|
||||
self.zip = self.zip[:-4]
|
||||
self.zip = self.zip if self.zip else 'output'
|
||||
# zip ALL
|
||||
cmd = 'cd "{}"'.format(self.output_dir)
|
||||
cmd += ' && '
|
||||
cmd += 'zip'
|
||||
cmd += ' -r'
|
||||
cmd += ' "{}".zip .'.format(self.zip)
|
||||
cmd += ' "{}".all.zip .'.format(self.zip)
|
||||
cmd += ' -x "pyflow.data*" "*tmp*"'
|
||||
cmd += ' -i "*.pdf" "*.txt" "*.xml"'
|
||||
cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.tif"'
|
||||
cmd += ' && '
|
||||
cmd += 'cd -'
|
||||
deps = hocrtotei_jobs + pdfunite_jobs + cat_jobs
|
||||
deps = (hocrtotei_jobs
|
||||
+ pdfunite_jobs
|
||||
+ cat_jobs
|
||||
+ hocr_poco_jobs
|
||||
+ tiff_poco_jobs)
|
||||
lbl = 'zip_job_-_all'
|
||||
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||
label=lbl))
|
||||
|
||||
# zip PDFs
|
||||
cmd = 'cd "{}"'.format(self.output_dir)
|
||||
cmd += ' && '
|
||||
cmd += 'zip'
|
||||
@ -317,7 +369,7 @@ class OCRPipeline(WorkflowRunner):
|
||||
lbl = 'zip_job_-_pdf'
|
||||
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||
label=lbl))
|
||||
|
||||
# zip TXTs
|
||||
cmd = 'cd "{}"'.format(self.output_dir)
|
||||
cmd += ' && '
|
||||
cmd += 'zip'
|
||||
@ -331,7 +383,7 @@ class OCRPipeline(WorkflowRunner):
|
||||
lbl = 'zip_job_-_txt'
|
||||
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||
label=lbl))
|
||||
|
||||
# zip XMLs
|
||||
cmd = 'cd "{}"'.format(self.output_dir)
|
||||
cmd += ' && '
|
||||
cmd += 'zip'
|
||||
@ -345,6 +397,27 @@ class OCRPipeline(WorkflowRunner):
|
||||
lbl = 'zip_job_-_xml'
|
||||
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||
label=lbl))
|
||||
# zip PoCo files
|
||||
# TODO: Fix the relative PoCo paths before zipping?
|
||||
poco_paths = []
|
||||
poco_names = []
|
||||
for i, job in enumerate(self.jobs):
|
||||
poco_paths.append(os.path.join(os.path.basename(job.output_dir), # noqa
|
||||
'PoCo'))
|
||||
poco_names.append(job.output_dir)
|
||||
|
||||
cmd = 'cd "{}"'.format(self.output_dir)
|
||||
cmd += ' && '
|
||||
cmd += 'zip'
|
||||
cmd += ' -r'
|
||||
cmd += ' "{}".poco.zip'.format(self.zip)
|
||||
cmd += ' "{}"'.format('" "'.join(poco_paths))
|
||||
cmd += ' && '
|
||||
cmd += 'cd -'
|
||||
deps = 'zip_job_-_all'
|
||||
lbl = 'zip_job_-_poco_{}'.format(i)
|
||||
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||
label=lbl))
|
||||
|
||||
'''
|
||||
' ##################################################
|
||||
@ -371,7 +444,9 @@ class OCRPipeline(WorkflowRunner):
|
||||
cmd += 'mv "{}"/*.nrm.png "{}"'.format(input_dir, os.path.join(output_dir, 'nrm.png')) # noqa
|
||||
deps = ['hocrtotei_job_-_{}'.format(i),
|
||||
'pdfunite_job_-_{}'.format(i),
|
||||
'cat_job_-_{}'.format(i)]
|
||||
'cat_job_-_{}'.format(i),
|
||||
'tiff_poco_jobs_-_{i}'.format(i),
|
||||
'hocr_poco_jobs_-_{i}'.format(i)]
|
||||
lbl = 'mv_job_-_{}'.format(i)
|
||||
mv_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||
label=lbl))
|
||||
|
Loading…
Reference in New Issue
Block a user