From c640d9743fcc28878bb8495fec38ec56c9d6f5ef Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Wed, 5 Jan 2022 11:25:00 +0100
Subject: [PATCH] Add generation of output_files.json (a list of all output files).
---
ocr | 66 +++++++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 53 insertions(+), 13 deletions(-)
diff --git a/ocr b/ocr
index 65ca392..54dcd40 100755
--- a/ocr
+++ b/ocr
@@ -7,6 +7,7 @@ __version__ = '0.1.0'
from argparse import ArgumentParser
from pyflow import WorkflowRunner
+import json
import multiprocessing
import os
import sys
@@ -28,6 +29,7 @@ class PipelineJob:
self.file = file
self.name = os.path.basename(file)[:-4]
self.output_dir = output_dir
+ self.output_files = []
self.tmp_dir = os.path.join(output_dir, 'tmp')
@@ -356,21 +358,19 @@ class MainWorkflow(WorkflowRunner):
self.input_dir = input_dir
self.lang = lang
self.output_dir = output_dir
+ self.output_files = []
self.binarize = binarize
- self.jobs = self.collect_jobs()
+ self.jobs = []
def collect_jobs(self):
- jobs = []
+ self.jobs = []
for file in os.listdir(self.input_dir):
if os.path.isdir(os.path.join(self.input_dir, file)):
continue
- if file.lower().endswith('.pdf'):
- job = PipelineJob(
- os.path.join(self.input_dir, file),
- os.path.join(self.output_dir, file)
- )
- jobs.append(job)
- return jobs
+ if not file.lower().endswith('.pdf'):
+ continue
+ self.jobs.append(PipelineJob(os.path.join(self.input_dir, file),
+ os.path.join(self.output_dir, file)))
def workflow(self):
if not self.jobs:
@@ -483,14 +483,53 @@ class MainWorkflow(WorkflowRunner):
)
create_txt_tasks.append(task)
- # Remove temporary directories when all tasks are completed
self.waitForTasks()
for job in self.jobs:
+ # Remove temporary directory
os.rmdir(job.tmp_dir)
+ # Track output files
+ for x in os.listdir(os.path.join(job.output_dir, 'images')):
+ self.output_files.append(
+ {
+ 'directory': os.path.join(os.path.relpath(job.output_dir, start=self.output_dir), 'images'), # noqa
+ 'filename': x,
+ 'mimetype': 'image/png'
+ }
+ )
+ self.output_files.append(
+ {
+ 'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
+ 'filename': '{}.hocr'.format(job.name),
+ 'mimetype': 'application/xhtml+xml'
+ }
+ )
+ self.output_files.append(
+ {
+ 'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
+ 'filename': '{}.pdf'.format(job.name),
+ 'mimetype': 'application/pdf'
+ }
+ )
+ self.output_files.append(
+ {
+ 'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
+ 'filename': '{}.txt'.format(job.name),
+ 'mimetype': 'text/plain'
+ }
+ )
+ self.output_files.append(
+ {
+ 'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
+ 'filename': '{}.xml'.format(job.name),
+ 'mimetype': 'application/tei+xml'
+ }
+ )
+ with open(os.path.join(self.output_dir, 'output_files.json'), 'w') as f:
+ json.dump(self.output_files, f, indent=4)
def parse_args():
- parser = ArgumentParser(description='OCR pipeline for PDF file processing')
+ parser = ArgumentParser(description='Pipeline for PDF file OCR processing')
parser.add_argument(
'-i', '--input-dir', help='Input directory', required=True)
parser.add_argument(
@@ -543,9 +582,10 @@ def parse_args():
def main():
args = parse_args()
- ocr_pipeline = MainWorkflow(
+ main_workflow = MainWorkflow(
args.input_dir, args.language, args.output_dir, args.binarize)
- retval = ocr_pipeline.run(
+ main_workflow.collect_jobs()
+ retval = main_workflow.run(
dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
sys.exit(retval)