Clean up and change some output options

This commit is contained in:
Patrick Jentsch 2022-01-17 15:07:46 +01:00
parent f51a8c4546
commit c057d324cf

81
ocr
View File

@ -29,7 +29,6 @@ class PipelineJob:
self.file = file
self.name = os.path.basename(file)[:-4]
self.output_dir = output_dir
self.output_files = []
self.tmp_dir = os.path.join(output_dir, 'tmp')
@ -325,12 +324,40 @@ class CreateTEIWorkflow(WorkflowRunner):
cmd = 'hocr2tei "{}.hocr"'.format(
os.path.join(self.job.output_dir, self.job.name)
)
cmd += ' --output-file "{}.xml"'.format(
cmd += ' --output-file "{}.tei.xml"'.format(
os.path.join(self.job.output_dir, self.job.name)
)
self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
class CreatePoCoZipWorkflow(WorkflowRunner):
    """Pack a job's page images and hOCR file into ``<name>.poco.zip``.

    The archive is written inside ``job.output_dir``; only
    ``images/*.png`` and ``<name>.hocr`` are included.
    """

    def __init__(self, job):
        # job: must expose the ``output_dir`` and ``name`` attributes that
        # workflow() reads (presumably a PipelineJob -- verify against the
        # caller).
        self.job = job

    def workflow(self):
        """Schedule the single shell task that builds the zip archive."""
        '''
        ' ##################################################
        ' # zip                                            #
        ' ##################################################
        '''
        n_cores = 1
        # Request at most 512 MB for this task.
        mem_mb = min(512, self.getMemMb())
        # zip options (see the Info-ZIP manual): -r recurses into
        # directories, -m moves the files into the archive, -i restricts
        # the archive to the listed patterns.
        zip_cmd = 'zip -r -m "{0}.poco.zip" . -i "images/*.png" "{0}.hocr"'
        steps = [
            # Run from the output directory so archive paths are relative.
            'cd "{}"'.format(self.job.output_dir),
            zip_cmd.format(self.job.name),
            # Drop the images directory once its files are archived.
            'rm -r images',
            'cd -',
        ]
        # NOTE(review): paths are only wrapped in double quotes, not shell
        # escaped; a name containing '"' or '$' would break the command.
        cmd = ' && '.join(steps)
        self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores)
class CreateTxtWorkflow(WorkflowRunner):
def __init__(self, job):
self.job = job
@ -369,8 +396,11 @@ class MainWorkflow(WorkflowRunner):
continue
if not file.lower().endswith('.pdf'):
continue
self.jobs.append(PipelineJob(os.path.join(self.input_dir, file),
os.path.join(self.output_dir, file)))
job = PipelineJob(
os.path.join(self.input_dir, file),
os.path.join(self.output_dir, file)
)
self.jobs.append(job)
def workflow(self):
if not self.jobs:
@ -469,6 +499,20 @@ class MainWorkflow(WorkflowRunner):
)
create_tei_tasks.append(task)
'''
' ##################################################
' # create-poco-zip #
' ##################################################
'''
create_poco_zip_tasks = []
for i, job in enumerate(self.jobs):
task = self.addWorkflowTask(
'create_poco_zip_-_{}'.format(i),
CreatePoCoZipWorkflow(job),
dependencies='create_tei_-_{}'.format(i)
)
create_poco_zip_tasks.append(task)
'''
' ##################################################
' # create-txt #
@ -488,45 +532,36 @@ class MainWorkflow(WorkflowRunner):
# Remove temporary directory
os.rmdir(job.tmp_dir)
# Track output files
relative_input = os.path.relpath(job.file, start=self.input_dir)
relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa
for x in os.listdir(os.path.join(job.output_dir, 'images')):
self.output_files.append(
{
'input': relative_input,
'path': os.path.join(relative_output_dir, 'images', x),
'mimetype': 'image/png'
}
)
self.output_files.append(
{
'input': relative_input,
'path': os.path.join(relative_output_dir, '{}.hocr'.format(job.name)), # noqa
'mimetype': 'application/xhtml+xml'
'description': 'Post correction package (.png and .hocr).',
'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa
'mimetype': 'application/zip'
}
)
self.output_files.append(
{
'input': relative_input,
'filename': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
'description': 'PDF file with text layer.',
'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
'mimetype': 'application/pdf'
}
)
self.output_files.append(
{
'input': relative_input,
'filename': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
'description': 'Plain text file.',
'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
'mimetype': 'text/plain'
}
)
self.output_files.append(
{
'input': relative_input,
'filename': os.path.join(relative_output_dir, '{}.xml'.format(job.name)), # noqa
'description': 'TEI compliant XML file.',
'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa
'mimetype': 'application/tei+xml'
}
)
with open(os.path.join(self.output_dir, 'output_files.json'), 'w') as f:
with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa
json.dump(self.output_files, f, indent=4)