Add generation of output_files.json, a manifest listing all files produced by the pipeline.

This commit is contained in:
Patrick Jentsch 2022-01-05 11:25:00 +01:00
parent e3fd679b38
commit c640d9743f

66
ocr
View File

@ -7,6 +7,7 @@ __version__ = '0.1.0'
from argparse import ArgumentParser
from pyflow import WorkflowRunner
import json
import multiprocessing
import os
import sys
@ -28,6 +29,7 @@ class PipelineJob:
self.file = file
self.name = os.path.basename(file)[:-4]
self.output_dir = output_dir
self.output_files = []
self.tmp_dir = os.path.join(output_dir, 'tmp')
@ -356,21 +358,19 @@ class MainWorkflow(WorkflowRunner):
self.input_dir = input_dir
self.lang = lang
self.output_dir = output_dir
self.output_files = []
self.binarize = binarize
self.jobs = self.collect_jobs()
self.jobs = []
def collect_jobs(self):
    """Scan the input directory and create one PipelineJob per PDF file.

    Resets and populates ``self.jobs`` in place; returns None. Entries
    that are directories are skipped, as are files whose name does not
    end in ``.pdf`` (the extension check is case-insensitive, so
    ``.PDF`` files are picked up as well). Each job's output directory
    is a subdirectory of ``self.output_dir`` named after the input file.
    """
    self.jobs = []
    for entry in os.listdir(self.input_dir):
        input_path = os.path.join(self.input_dir, entry)
        # Only regular PDF files directly inside the input directory
        # become jobs; nested directories are not traversed.
        if os.path.isdir(input_path):
            continue
        if not entry.lower().endswith('.pdf'):
            continue
        self.jobs.append(
            PipelineJob(input_path, os.path.join(self.output_dir, entry))
        )
def workflow(self):
if not self.jobs:
@ -483,14 +483,53 @@ class MainWorkflow(WorkflowRunner):
)
create_txt_tasks.append(task)
# Remove temporary directories when all tasks are completed
self.waitForTasks()
for job in self.jobs:
# Remove temporary directory
os.rmdir(job.tmp_dir)
# Track output files
for x in os.listdir(os.path.join(job.output_dir, 'images')):
self.output_files.append(
{
'directory': os.path.join(os.path.relpath(job.output_dir, start=self.output_dir), 'images'), # noqa
'filename': x,
'mimetype': 'image/png'
}
)
self.output_files.append(
{
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
'filename': '{}.hocr'.format(job.name),
'mimetype': 'application/xhtml+xml'
}
)
self.output_files.append(
{
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
'filename': '{}.pdf'.format(job.name),
'mimetype': 'application/pdf'
}
)
self.output_files.append(
{
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
'filename': '{}.txt'.format(job.name),
'mimetype': 'text/plain'
}
)
self.output_files.append(
{
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
'filename': '{}.xml'.format(job.name),
'mimetype': 'application/tei+xml'
}
)
with open(os.path.join(self.output_dir, 'output_files.json'), 'w') as f:
json.dump(self.output_files, f, indent=4)
def parse_args():
parser = ArgumentParser(description='OCR pipeline for PDF file processing')
parser = ArgumentParser(description='Pipeline for PDF file OCR processing')
parser.add_argument(
'-i', '--input-dir', help='Input directory', required=True)
parser.add_argument(
@ -543,9 +582,10 @@ def parse_args():
def main():
    """Entry point: parse CLI arguments, assemble the workflow, run it.

    Exits the process with the workflow runner's return value so that
    shell callers can detect pipeline failure via the exit status.
    """
    args = parse_args()
    main_workflow = MainWorkflow(
        args.input_dir, args.language, args.output_dir, args.binarize)
    # Jobs are collected explicitly before the runner starts, rather
    # than inside the constructor.
    main_workflow.collect_jobs()
    retval = main_workflow.run(
        dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
    sys.exit(retval)