mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-25 18:24:18 +00:00
Add generation of output_files.json (a list of all output files).
This commit is contained in:
parent
e3fd679b38
commit
c640d9743f
66
ocr
66
ocr
@ -7,6 +7,7 @@ __version__ = '0.1.0'
|
||||
|
||||
from argparse import ArgumentParser
|
||||
from pyflow import WorkflowRunner
|
||||
import json
|
||||
import multiprocessing
|
||||
import os
|
||||
import sys
|
||||
@ -28,6 +29,7 @@ class PipelineJob:
|
||||
self.file = file
|
||||
self.name = os.path.basename(file)[:-4]
|
||||
self.output_dir = output_dir
|
||||
self.output_files = []
|
||||
self.tmp_dir = os.path.join(output_dir, 'tmp')
|
||||
|
||||
|
||||
@ -356,21 +358,19 @@ class MainWorkflow(WorkflowRunner):
|
||||
self.input_dir = input_dir
|
||||
self.lang = lang
|
||||
self.output_dir = output_dir
|
||||
self.output_files = []
|
||||
self.binarize = binarize
|
||||
self.jobs = self.collect_jobs()
|
||||
self.jobs = []
|
||||
|
||||
def collect_jobs(self):
|
||||
jobs = []
|
||||
self.jobs = []
|
||||
for file in os.listdir(self.input_dir):
|
||||
if os.path.isdir(os.path.join(self.input_dir, file)):
|
||||
continue
|
||||
if file.lower().endswith('.pdf'):
|
||||
job = PipelineJob(
|
||||
os.path.join(self.input_dir, file),
|
||||
os.path.join(self.output_dir, file)
|
||||
)
|
||||
jobs.append(job)
|
||||
return jobs
|
||||
if not file.lower().endswith('.pdf'):
|
||||
continue
|
||||
self.jobs.append(PipelineJob(os.path.join(self.input_dir, file),
|
||||
os.path.join(self.output_dir, file)))
|
||||
|
||||
def workflow(self):
|
||||
if not self.jobs:
|
||||
@ -483,14 +483,53 @@ class MainWorkflow(WorkflowRunner):
|
||||
)
|
||||
create_txt_tasks.append(task)
|
||||
|
||||
# Remove temporary directories when all tasks are completed
|
||||
self.waitForTasks()
|
||||
for job in self.jobs:
|
||||
# Remove temporary directory
|
||||
os.rmdir(job.tmp_dir)
|
||||
# Track output files
|
||||
for x in os.listdir(os.path.join(job.output_dir, 'images')):
|
||||
self.output_files.append(
|
||||
{
|
||||
'directory': os.path.join(os.path.relpath(job.output_dir, start=self.output_dir), 'images'), # noqa
|
||||
'filename': x,
|
||||
'mimetype': 'image/png'
|
||||
}
|
||||
)
|
||||
self.output_files.append(
|
||||
{
|
||||
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
|
||||
'filename': '{}.hocr'.format(job.name),
|
||||
'mimetype': 'application/xhtml+xml'
|
||||
}
|
||||
)
|
||||
self.output_files.append(
|
||||
{
|
||||
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
|
||||
'filename': '{}.pdf'.format(job.name),
|
||||
'mimetype': 'application/pdf'
|
||||
}
|
||||
)
|
||||
self.output_files.append(
|
||||
{
|
||||
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
|
||||
'filename': '{}.txt'.format(job.name),
|
||||
'mimetype': 'text/plain'
|
||||
}
|
||||
)
|
||||
self.output_files.append(
|
||||
{
|
||||
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
|
||||
'filename': '{}.xml'.format(job.name),
|
||||
'mimetype': 'application/tei+xml'
|
||||
}
|
||||
)
|
||||
with open(os.path.join(self.output_dir, 'output_files.json'), 'w') as f:
|
||||
json.dump(self.output_files, f, indent=4)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = ArgumentParser(description='OCR pipeline for PDF file processing')
|
||||
parser = ArgumentParser(description='Pipeline for PDF file OCR processing')
|
||||
parser.add_argument(
|
||||
'-i', '--input-dir', help='Input directory', required=True)
|
||||
parser.add_argument(
|
||||
@ -543,9 +582,10 @@ def parse_args():
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
ocr_pipeline = MainWorkflow(
|
||||
main_workflow = MainWorkflow(
|
||||
args.input_dir, args.language, args.output_dir, args.binarize)
|
||||
retval = ocr_pipeline.run(
|
||||
main_workflow.collect_jobs()
|
||||
retval = main_workflow.run(
|
||||
dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
|
||||
sys.exit(retval)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user