mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-13 23:00:33 +00:00
Add generation of output_files.json, which lists all output files.
This commit is contained in:
parent
e3fd679b38
commit
c640d9743f
66
ocr
66
ocr
@ -7,6 +7,7 @@ __version__ = '0.1.0'
|
|||||||
|
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from pyflow import WorkflowRunner
|
from pyflow import WorkflowRunner
|
||||||
|
import json
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@ -28,6 +29,7 @@ class PipelineJob:
|
|||||||
self.file = file
|
self.file = file
|
||||||
self.name = os.path.basename(file)[:-4]
|
self.name = os.path.basename(file)[:-4]
|
||||||
self.output_dir = output_dir
|
self.output_dir = output_dir
|
||||||
|
self.output_files = []
|
||||||
self.tmp_dir = os.path.join(output_dir, 'tmp')
|
self.tmp_dir = os.path.join(output_dir, 'tmp')
|
||||||
|
|
||||||
|
|
||||||
@ -356,21 +358,19 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
self.input_dir = input_dir
|
self.input_dir = input_dir
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
self.output_dir = output_dir
|
self.output_dir = output_dir
|
||||||
|
self.output_files = []
|
||||||
self.binarize = binarize
|
self.binarize = binarize
|
||||||
self.jobs = self.collect_jobs()
|
self.jobs = []
|
||||||
|
|
||||||
def collect_jobs(self):
|
def collect_jobs(self):
|
||||||
jobs = []
|
self.jobs = []
|
||||||
for file in os.listdir(self.input_dir):
|
for file in os.listdir(self.input_dir):
|
||||||
if os.path.isdir(os.path.join(self.input_dir, file)):
|
if os.path.isdir(os.path.join(self.input_dir, file)):
|
||||||
continue
|
continue
|
||||||
if file.lower().endswith('.pdf'):
|
if not file.lower().endswith('.pdf'):
|
||||||
job = PipelineJob(
|
continue
|
||||||
os.path.join(self.input_dir, file),
|
self.jobs.append(PipelineJob(os.path.join(self.input_dir, file),
|
||||||
os.path.join(self.output_dir, file)
|
os.path.join(self.output_dir, file)))
|
||||||
)
|
|
||||||
jobs.append(job)
|
|
||||||
return jobs
|
|
||||||
|
|
||||||
def workflow(self):
|
def workflow(self):
|
||||||
if not self.jobs:
|
if not self.jobs:
|
||||||
@ -483,14 +483,53 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
)
|
)
|
||||||
create_txt_tasks.append(task)
|
create_txt_tasks.append(task)
|
||||||
|
|
||||||
# Remove temporary directories when all tasks are completed
|
|
||||||
self.waitForTasks()
|
self.waitForTasks()
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
|
# Remove temporary directory
|
||||||
os.rmdir(job.tmp_dir)
|
os.rmdir(job.tmp_dir)
|
||||||
|
# Track output files
|
||||||
|
for x in os.listdir(os.path.join(job.output_dir, 'images')):
|
||||||
|
self.output_files.append(
|
||||||
|
{
|
||||||
|
'directory': os.path.join(os.path.relpath(job.output_dir, start=self.output_dir), 'images'), # noqa
|
||||||
|
'filename': x,
|
||||||
|
'mimetype': 'image/png'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
self.output_files.append(
|
||||||
|
{
|
||||||
|
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
|
||||||
|
'filename': '{}.hocr'.format(job.name),
|
||||||
|
'mimetype': 'application/xhtml+xml'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
self.output_files.append(
|
||||||
|
{
|
||||||
|
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
|
||||||
|
'filename': '{}.pdf'.format(job.name),
|
||||||
|
'mimetype': 'application/pdf'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
self.output_files.append(
|
||||||
|
{
|
||||||
|
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
|
||||||
|
'filename': '{}.txt'.format(job.name),
|
||||||
|
'mimetype': 'text/plain'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
self.output_files.append(
|
||||||
|
{
|
||||||
|
'directory': os.path.relpath(job.output_dir, start=self.output_dir), # noqa
|
||||||
|
'filename': '{}.xml'.format(job.name),
|
||||||
|
'mimetype': 'application/tei+xml'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
with open(os.path.join(self.output_dir, 'output_files.json'), 'w') as f:
|
||||||
|
json.dump(self.output_files, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = ArgumentParser(description='OCR pipeline for PDF file processing')
|
parser = ArgumentParser(description='Pipeline for PDF file OCR processing')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-i', '--input-dir', help='Input directory', required=True)
|
'-i', '--input-dir', help='Input directory', required=True)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -543,9 +582,10 @@ def parse_args():
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
ocr_pipeline = MainWorkflow(
|
main_workflow = MainWorkflow(
|
||||||
args.input_dir, args.language, args.output_dir, args.binarize)
|
args.input_dir, args.language, args.output_dir, args.binarize)
|
||||||
retval = ocr_pipeline.run(
|
main_workflow.collect_jobs()
|
||||||
|
retval = main_workflow.run(
|
||||||
dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
|
dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
|
||||||
sys.exit(retval)
|
sys.exit(retval)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user