From c640d9743fcc28878bb8495fec38ec56c9d6f5ef Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Wed, 5 Jan 2022 11:25:00 +0100 Subject: [PATCH] Add output_files.json (lists all output files) generation. --- ocr | 66 +++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/ocr b/ocr index 65ca392..54dcd40 100755 --- a/ocr +++ b/ocr @@ -7,6 +7,7 @@ __version__ = '0.1.0' from argparse import ArgumentParser from pyflow import WorkflowRunner +import json import multiprocessing import os import sys @@ -28,6 +29,7 @@ class PipelineJob: self.file = file self.name = os.path.basename(file)[:-4] self.output_dir = output_dir + self.output_files = [] self.tmp_dir = os.path.join(output_dir, 'tmp') @@ -356,21 +358,19 @@ class MainWorkflow(WorkflowRunner): self.input_dir = input_dir self.lang = lang self.output_dir = output_dir + self.output_files = [] self.binarize = binarize - self.jobs = self.collect_jobs() + self.jobs = [] def collect_jobs(self): - jobs = [] + self.jobs = [] for file in os.listdir(self.input_dir): if os.path.isdir(os.path.join(self.input_dir, file)): continue - if file.lower().endswith('.pdf'): - job = PipelineJob( - os.path.join(self.input_dir, file), - os.path.join(self.output_dir, file) - ) - jobs.append(job) - return jobs + if not file.lower().endswith('.pdf'): + continue + self.jobs.append(PipelineJob(os.path.join(self.input_dir, file), + os.path.join(self.output_dir, file))) def workflow(self): if not self.jobs: @@ -483,14 +483,53 @@ class MainWorkflow(WorkflowRunner): ) create_txt_tasks.append(task) - # Remove temporary directories when all tasks are completed self.waitForTasks() for job in self.jobs: + # Remove temporary directory os.rmdir(job.tmp_dir) + # Track output files + for x in os.listdir(os.path.join(job.output_dir, 'images')): + self.output_files.append( + { + 'directory': os.path.join(os.path.relpath(job.output_dir, start=self.output_dir), 'images'), # noqa + 'filename': x, + 'mimetype': 'image/png' + } + ) + self.output_files.append( + { + 'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa + 'filename': '{}.hocr'.format(job.name), + 'mimetype': 'application/xhtml+xml' + } + ) + self.output_files.append( + { + 'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa + 'filename': '{}.pdf'.format(job.name), + 'mimetype': 'application/pdf' + } + ) + self.output_files.append( + { + 'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa + 'filename': '{}.txt'.format(job.name), + 'mimetype': 'text/plain' + } + ) + self.output_files.append( + { + 'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa + 'filename': '{}.xml'.format(job.name), + 'mimetype': 'application/tei+xml' + } + ) + with open(os.path.join(self.output_dir, 'output_files.json'), 'w') as f: + json.dump(self.output_files, f, indent=4) def parse_args(): - parser = ArgumentParser(description='OCR pipeline for PDF file processing') + parser = ArgumentParser(description='Pipeline for PDF file OCR processing') parser.add_argument( '-i', '--input-dir', help='Input directory', required=True) parser.add_argument( @@ -543,9 +582,10 @@ def parse_args(): def main(): args = parse_args() - ocr_pipeline = MainWorkflow( + main_workflow = MainWorkflow( args.input_dir, args.language, args.output_dir, args.binarize) - retval = ocr_pipeline.run( + main_workflow.collect_jobs() + retval = main_workflow.run( dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) sys.exit(retval)