diff --git a/ocr b/ocr
index 067053c..73b24eb 100755
--- a/ocr
+++ b/ocr
@@ -67,12 +67,16 @@ class OCRPipeline(WorkflowRunner):
         ' #                   mkdir_jobs                   #
         ' ##################################################
         '''
+
         mkdir_jobs = []
         for i, job in enumerate(self.jobs):
             output_dir = os.path.join(job.output_dir, 'tmp')
+            poco_dir = os.path.join(job.output_dir, 'PoCo')
             cmd = 'mkdir'
             cmd += ' -p'
             cmd += ' "{}"'.format(output_dir)
+            cmd += ' "{}"'.format(os.path.join(poco_dir, 'hocr'))
+            cmd += ' "{}"'.format(os.path.join(poco_dir, 'tiff'))
             if self.keep_intermediates:
                 cmd += ' "{}"'.format(os.path.join(output_dir, 'hocr'))
                 cmd += ' "{}"'.format(os.path.join(output_dir, 'pdf'))
@@ -239,6 +243,49 @@ class OCRPipeline(WorkflowRunner):
             hocrtotei_jobs.append(self.addTask(command=cmd, dependencies=deps,
                                                label=lbl))
 
+        '''
+        ' ##################################################
+        ' #                 hocr_poco_jobs                 #
+        ' ##################################################
+        '''
+        # Copy each job's hOCR files from tmp into its PoCo/hocr directory.
+        hocr_poco_jobs = []
+        for i, job in enumerate(self.jobs):
+            input_dir = os.path.join(job.output_dir, 'tmp')
+            # NOTE(review): listed now, not by the task; is tmp filled yet?
+            files = sorted(
+                (x for x in os.listdir(input_dir) if x.endswith('.hocr')),
+                key=lambda x: int(re.search(r'\d+', x).group(0)))
+            files = [os.path.join(input_dir, x) for x in files]
+            output_path = os.path.join(job.output_dir, 'PoCo', 'hocr')
+            cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path)
+            deps = [x for x in tesseract_jobs
+                    if x.startswith('ocr_job_-_{}'.format(i))]
+            lbl = 'hocr_poco_jobs_-_{}'.format(i)
+            hocr_poco_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                               label=lbl))
+        '''
+        ' ##################################################
+        ' #                 tiff_poco_jobs                 #
+        ' ##################################################
+        '''
+        # Copy each job's TIFF files from tmp into its PoCo/tiff directory.
+        tiff_poco_jobs = []
+        for i, job in enumerate(self.jobs):
+            input_dir = os.path.join(job.output_dir, 'tmp')
+            # NOTE(review): listed now, not by the task; is tmp filled yet?
+            files = sorted(
+                (x for x in os.listdir(input_dir) if x.endswith('.tif')),
+                key=lambda x: int(re.search(r'\d+', x).group(0)))
+            files = [os.path.join(input_dir, x) for x in files]
+            output_path = os.path.join(job.output_dir, 'PoCo', 'tiff')
+            cmd = 'cp "{}" "{}"'.format('" "'.join(files), output_path)
+            deps = [x for x in tesseract_jobs
+                    if x.startswith('ocr_job_-_{}'.format(i))]
+            lbl = 'tiff_poco_jobs_-_{}'.format(i)
+            tiff_poco_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                               label=lbl))
+
         '''
         ' ##################################################
         ' #                 pdfunite_jobs                  #
@@ -290,20 +337,25 @@ class OCRPipeline(WorkflowRunner):
             if self.zip.endswith('.zip'):
                 self.zip = self.zip[:-4]
             self.zip = self.zip if self.zip else 'output'
+            # zip ALL
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
             cmd += ' -r'
-            cmd += ' "{}".zip .'.format(self.zip)
+            cmd += ' "{}".all.zip .'.format(self.zip)
             cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.pdf" "*.txt" "*.xml"'
+            cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.tif"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = hocrtotei_jobs + pdfunite_jobs + cat_jobs
+            deps = (hocrtotei_jobs
+                    + pdfunite_jobs
+                    + cat_jobs
+                    + hocr_poco_jobs
+                    + tiff_poco_jobs)
             lbl = 'zip_job_-_all'
             zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
                                          label=lbl))
-
+            # zip PDFs
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
@@ -317,7 +369,7 @@ class OCRPipeline(WorkflowRunner):
             lbl = 'zip_job_-_pdf'
             zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
                                          label=lbl))
-
+            # zip TXTs
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
@@ -331,7 +383,7 @@ class OCRPipeline(WorkflowRunner):
             lbl = 'zip_job_-_txt'
             zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
                                          label=lbl))
-
+            # zip XMLs
             cmd = 'cd "{}"'.format(self.output_dir)
             cmd += ' && '
             cmd += 'zip'
@@ -345,6 +397,27 @@ class OCRPipeline(WorkflowRunner):
             lbl = 'zip_job_-_xml'
             zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
                                          label=lbl))
+            # zip PoCo files
+            # The archive is built from inside self.output_dir, so each job
+            # contributes its PoCo directory relative to that directory.
+            # NOTE(review): basename() assumes every job.output_dir is a
+            # direct child of self.output_dir - confirm.
+            poco_paths = [
+                os.path.join(os.path.basename(job.output_dir), 'PoCo')
+                for job in self.jobs]
+
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}".poco.zip'.format(self.zip)
+            cmd += ' "{}"'.format('" "'.join(poco_paths))
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = 'zip_job_-_all'
+            lbl = 'zip_job_-_poco'
+            zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
+                                         label=lbl))
 
         '''
         ' ##################################################
@@ -371,7 +444,9 @@ class OCRPipeline(WorkflowRunner):
                 cmd += 'mv "{}"/*.nrm.png "{}"'.format(input_dir, os.path.join(output_dir, 'nrm.png'))  # noqa
                 deps = ['hocrtotei_job_-_{}'.format(i),
                         'pdfunite_job_-_{}'.format(i),
-                        'cat_job_-_{}'.format(i)]
+                        'cat_job_-_{}'.format(i),
+                        'tiff_poco_jobs_-_{}'.format(i),
+                        'hocr_poco_jobs_-_{}'.format(i)]
                 lbl = 'mv_job_-_{}'.format(i)
                 mv_jobs.append(self.addTask(command=cmd, dependencies=deps,
                                             label=lbl))