mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-26 20:14:18 +00:00
Cleanup and change some output options
This commit is contained in:
parent
f51a8c4546
commit
c057d324cf
81
ocr
81
ocr
@ -29,7 +29,6 @@ class PipelineJob:
|
|||||||
self.file = file
|
self.file = file
|
||||||
self.name = os.path.basename(file)[:-4]
|
self.name = os.path.basename(file)[:-4]
|
||||||
self.output_dir = output_dir
|
self.output_dir = output_dir
|
||||||
self.output_files = []
|
|
||||||
self.tmp_dir = os.path.join(output_dir, 'tmp')
|
self.tmp_dir = os.path.join(output_dir, 'tmp')
|
||||||
|
|
||||||
|
|
||||||
@ -325,12 +324,40 @@ class CreateTEIWorkflow(WorkflowRunner):
|
|||||||
cmd = 'hocr2tei "{}.hocr"'.format(
|
cmd = 'hocr2tei "{}.hocr"'.format(
|
||||||
os.path.join(self.job.output_dir, self.job.name)
|
os.path.join(self.job.output_dir, self.job.name)
|
||||||
)
|
)
|
||||||
cmd += ' --output-file "{}.xml"'.format(
|
cmd += ' --output-file "{}.tei.xml"'.format(
|
||||||
os.path.join(self.job.output_dir, self.job.name)
|
os.path.join(self.job.output_dir, self.job.name)
|
||||||
)
|
)
|
||||||
self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
|
self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
|
||||||
|
|
||||||
|
|
||||||
|
class CreatePoCoZipWorkflow(WorkflowRunner):
|
||||||
|
def __init__(self, job):
|
||||||
|
self.job = job
|
||||||
|
|
||||||
|
def workflow(self):
|
||||||
|
'''
|
||||||
|
' ##################################################
|
||||||
|
' # zip #
|
||||||
|
' ##################################################
|
||||||
|
'''
|
||||||
|
n_cores = 1
|
||||||
|
mem_mb = min(512, self.getMemMb())
|
||||||
|
zip_tasks = []
|
||||||
|
cmd = 'cd "{}"'.format(self.job.output_dir)
|
||||||
|
cmd += ' && '
|
||||||
|
cmd += 'zip'
|
||||||
|
cmd += ' -r'
|
||||||
|
cmd += ' -m'
|
||||||
|
cmd += ' "{}.poco.zip" .'.format(self.job.name)
|
||||||
|
cmd += ' -i "images/*.png" "{}.hocr"'.format(self.job.name)
|
||||||
|
cmd += ' && '
|
||||||
|
cmd += 'rm -r images'
|
||||||
|
cmd += ' && '
|
||||||
|
cmd += 'cd -'
|
||||||
|
task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores)
|
||||||
|
zip_tasks.append(task)
|
||||||
|
|
||||||
|
|
||||||
class CreateTxtWorkflow(WorkflowRunner):
|
class CreateTxtWorkflow(WorkflowRunner):
|
||||||
def __init__(self, job):
|
def __init__(self, job):
|
||||||
self.job = job
|
self.job = job
|
||||||
@ -369,8 +396,11 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
continue
|
continue
|
||||||
if not file.lower().endswith('.pdf'):
|
if not file.lower().endswith('.pdf'):
|
||||||
continue
|
continue
|
||||||
self.jobs.append(PipelineJob(os.path.join(self.input_dir, file),
|
job = PipelineJob(
|
||||||
os.path.join(self.output_dir, file)))
|
os.path.join(self.input_dir, file),
|
||||||
|
os.path.join(self.output_dir, file)
|
||||||
|
)
|
||||||
|
self.jobs.append(job)
|
||||||
|
|
||||||
def workflow(self):
|
def workflow(self):
|
||||||
if not self.jobs:
|
if not self.jobs:
|
||||||
@ -469,6 +499,20 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
)
|
)
|
||||||
create_tei_tasks.append(task)
|
create_tei_tasks.append(task)
|
||||||
|
|
||||||
|
'''
|
||||||
|
' ##################################################
|
||||||
|
' # create-poco-zip #
|
||||||
|
' ##################################################
|
||||||
|
'''
|
||||||
|
create_poco_zip_tasks = []
|
||||||
|
for i, job in enumerate(self.jobs):
|
||||||
|
task = self.addWorkflowTask(
|
||||||
|
'create_poco_zip_-_{}'.format(i),
|
||||||
|
CreatePoCoZipWorkflow(job),
|
||||||
|
dependencies='create_tei_-_{}'.format(i)
|
||||||
|
)
|
||||||
|
create_poco_zip_tasks.append(task)
|
||||||
|
|
||||||
'''
|
'''
|
||||||
' ##################################################
|
' ##################################################
|
||||||
' # create-txt #
|
' # create-txt #
|
||||||
@ -488,45 +532,36 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
# Remove temporary directory
|
# Remove temporary directory
|
||||||
os.rmdir(job.tmp_dir)
|
os.rmdir(job.tmp_dir)
|
||||||
# Track output files
|
# Track output files
|
||||||
relative_input = os.path.relpath(job.file, start=self.input_dir)
|
|
||||||
relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa
|
relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa
|
||||||
for x in os.listdir(os.path.join(job.output_dir, 'images')):
|
|
||||||
self.output_files.append(
|
self.output_files.append(
|
||||||
{
|
{
|
||||||
'input': relative_input,
|
'description': 'Post correction package (.png and .hocr).',
|
||||||
'path': os.path.join(relative_output_dir, 'images', x),
|
'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa
|
||||||
'mimetype': 'image/png'
|
'mimetype': 'application/zip'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.output_files.append(
|
self.output_files.append(
|
||||||
{
|
{
|
||||||
'input': relative_input,
|
'description': 'PDF file with text layer.',
|
||||||
'path': os.path.join(relative_output_dir, '{}.hocr'.format(job.name)), # noqa
|
'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
|
||||||
'mimetype': 'application/xhtml+xml'
|
|
||||||
}
|
|
||||||
)
|
|
||||||
self.output_files.append(
|
|
||||||
{
|
|
||||||
'input': relative_input,
|
|
||||||
'filename': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
|
|
||||||
'mimetype': 'application/pdf'
|
'mimetype': 'application/pdf'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.output_files.append(
|
self.output_files.append(
|
||||||
{
|
{
|
||||||
'input': relative_input,
|
'description': 'Plain text file.',
|
||||||
'filename': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
|
'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
|
||||||
'mimetype': 'text/plain'
|
'mimetype': 'text/plain'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.output_files.append(
|
self.output_files.append(
|
||||||
{
|
{
|
||||||
'input': relative_input,
|
'description': 'TEI compliant XML file.',
|
||||||
'filename': os.path.join(relative_output_dir, '{}.xml'.format(job.name)), # noqa
|
'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa
|
||||||
'mimetype': 'application/tei+xml'
|
'mimetype': 'application/tei+xml'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
with open(os.path.join(self.output_dir, 'output_files.json'), 'w') as f:
|
with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa
|
||||||
json.dump(self.output_files, f, indent=4)
|
json.dump(self.output_files, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user