mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-13 13:00:35 +00:00
Cleanup and change some output options
This commit is contained in:
parent
f51a8c4546
commit
c057d324cf
81
ocr
81
ocr
@ -29,7 +29,6 @@ class PipelineJob:
|
||||
self.file = file
|
||||
self.name = os.path.basename(file)[:-4]
|
||||
self.output_dir = output_dir
|
||||
self.output_files = []
|
||||
self.tmp_dir = os.path.join(output_dir, 'tmp')
|
||||
|
||||
|
||||
@ -325,12 +324,40 @@ class CreateTEIWorkflow(WorkflowRunner):
|
||||
cmd = 'hocr2tei "{}.hocr"'.format(
|
||||
os.path.join(self.job.output_dir, self.job.name)
|
||||
)
|
||||
cmd += ' --output-file "{}.xml"'.format(
|
||||
cmd += ' --output-file "{}.tei.xml"'.format(
|
||||
os.path.join(self.job.output_dir, self.job.name)
|
||||
)
|
||||
self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
|
||||
|
||||
|
||||
class CreatePoCoZipWorkflow(WorkflowRunner):
    """Pack a job's page images and hOCR file into one zip archive.

    The archive is created inside ``job.output_dir`` and is named
    ``<job.name>.poco.zip``. It collects ``images/*.png`` and
    ``<job.name>.hocr``.
    """

    def __init__(self, job):
        """Store the job whose ``output_dir`` and ``name`` workflow() reads."""
        self.job = job

    def workflow(self):
        """Register the single 'zip' task that builds the archive.

        The task runs one shell command line chained with ``&&``:

        1. change into the job's output directory,
        2. zip the PNG images and the hOCR file (``-r -m``, restricted by
           the ``-i`` include patterns),
        3. remove the ``images`` directory,
        4. change back to the previous directory.
        """
        n_cores = 1
        # Never request more than 512 MB, and less if getMemMb() reports less.
        mem_mb = min(512, self.getMemMb())

        # NOTE(review): output_dir and name are only wrapped in double
        # quotes, so names containing '"', '$' or backticks are still
        # interpreted by the shell - confirm inputs are sanitised upstream.
        enter_output_dir = 'cd "{}"'.format(self.job.output_dir)
        # Per the zip manual, -m deletes the archived files afterwards.
        create_archive = (
            'zip -r -m "{}.poco.zip" .'.format(self.job.name)
            + ' -i "images/*.png" "{}.hocr"'.format(self.job.name)
        )
        remove_images_dir = 'rm -r images'
        leave_output_dir = 'cd -'

        cmd = ' && '.join([
            enter_output_dir,
            create_archive,
            remove_images_dir,
            leave_output_dir,
        ])
        self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores)
|
||||
|
||||
|
||||
class CreateTxtWorkflow(WorkflowRunner):
|
||||
def __init__(self, job):
|
||||
self.job = job
|
||||
@ -369,8 +396,11 @@ class MainWorkflow(WorkflowRunner):
|
||||
continue
|
||||
if not file.lower().endswith('.pdf'):
|
||||
continue
|
||||
self.jobs.append(PipelineJob(os.path.join(self.input_dir, file),
|
||||
os.path.join(self.output_dir, file)))
|
||||
job = PipelineJob(
|
||||
os.path.join(self.input_dir, file),
|
||||
os.path.join(self.output_dir, file)
|
||||
)
|
||||
self.jobs.append(job)
|
||||
|
||||
def workflow(self):
|
||||
if not self.jobs:
|
||||
@ -469,6 +499,20 @@ class MainWorkflow(WorkflowRunner):
|
||||
)
|
||||
create_tei_tasks.append(task)
|
||||
|
||||
'''
|
||||
' ##################################################
|
||||
' # create-poco-zip #
|
||||
' ##################################################
|
||||
'''
|
||||
create_poco_zip_tasks = []
|
||||
for i, job in enumerate(self.jobs):
|
||||
task = self.addWorkflowTask(
|
||||
'create_poco_zip_-_{}'.format(i),
|
||||
CreatePoCoZipWorkflow(job),
|
||||
dependencies='create_tei_-_{}'.format(i)
|
||||
)
|
||||
create_poco_zip_tasks.append(task)
|
||||
|
||||
'''
|
||||
' ##################################################
|
||||
' # create-txt #
|
||||
@ -488,45 +532,36 @@ class MainWorkflow(WorkflowRunner):
|
||||
# Remove temporary directory
|
||||
os.rmdir(job.tmp_dir)
|
||||
# Track output files
|
||||
relative_input = os.path.relpath(job.file, start=self.input_dir)
|
||||
relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa
|
||||
for x in os.listdir(os.path.join(job.output_dir, 'images')):
|
||||
self.output_files.append(
|
||||
{
|
||||
'input': relative_input,
|
||||
'path': os.path.join(relative_output_dir, 'images', x),
|
||||
'mimetype': 'image/png'
|
||||
}
|
||||
)
|
||||
self.output_files.append(
|
||||
{
|
||||
'input': relative_input,
|
||||
'path': os.path.join(relative_output_dir, '{}.hocr'.format(job.name)), # noqa
|
||||
'mimetype': 'application/xhtml+xml'
|
||||
'description': 'Post correction package (.png and .hocr).',
|
||||
'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa
|
||||
'mimetype': 'application/zip'
|
||||
}
|
||||
)
|
||||
self.output_files.append(
|
||||
{
|
||||
'input': relative_input,
|
||||
'filename': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
|
||||
'description': 'PDF file with text layer.',
|
||||
'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
|
||||
'mimetype': 'application/pdf'
|
||||
}
|
||||
)
|
||||
self.output_files.append(
|
||||
{
|
||||
'input': relative_input,
|
||||
'filename': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
|
||||
'description': 'Plain text file.',
|
||||
'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
|
||||
'mimetype': 'text/plain'
|
||||
}
|
||||
)
|
||||
self.output_files.append(
|
||||
{
|
||||
'input': relative_input,
|
||||
'filename': os.path.join(relative_output_dir, '{}.xml'.format(job.name)), # noqa
|
||||
'description': 'TEI compliant XML file.',
|
||||
'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa
|
||||
'mimetype': 'application/tei+xml'
|
||||
}
|
||||
)
|
||||
with open(os.path.join(self.output_dir, 'output_files.json'), 'w') as f:
|
||||
with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa
|
||||
json.dump(self.output_files, f, indent=4)
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user