diff --git a/ocr b/ocr index def0e93..5602abe 100755 --- a/ocr +++ b/ocr @@ -29,7 +29,6 @@ class PipelineJob: self.file = file self.name = os.path.basename(file)[:-4] self.output_dir = output_dir - self.output_files = [] self.tmp_dir = os.path.join(output_dir, 'tmp') @@ -325,12 +324,40 @@ class CreateTEIWorkflow(WorkflowRunner): cmd = 'hocr2tei "{}.hocr"'.format( os.path.join(self.job.output_dir, self.job.name) ) - cmd += ' --output-file "{}.xml"'.format( + cmd += ' --output-file "{}.tei.xml"'.format( os.path.join(self.job.output_dir, self.job.name) ) self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores) +class CreatePoCoZipWorkflow(WorkflowRunner): + def __init__(self, job): + self.job = job + + def workflow(self): + ''' + ' ################################################## + ' # zip # + ' ################################################## + ''' + n_cores = 1 + mem_mb = min(512, self.getMemMb()) + zip_tasks = [] + cmd = 'cd "{}"'.format(self.job.output_dir) + cmd += ' && ' + cmd += 'zip' + cmd += ' -r' + cmd += ' -m' + cmd += ' "{}.poco.zip" .'.format(self.job.name) + cmd += ' -i "images/*.png" "{}.hocr"'.format(self.job.name) + cmd += ' && ' + cmd += 'rm -r images' + cmd += ' && ' + cmd += 'cd -' + task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores) + zip_tasks.append(task) + + class CreateTxtWorkflow(WorkflowRunner): def __init__(self, job): self.job = job @@ -369,8 +396,11 @@ class MainWorkflow(WorkflowRunner): continue if not file.lower().endswith('.pdf'): continue - self.jobs.append(PipelineJob(os.path.join(self.input_dir, file), - os.path.join(self.output_dir, file))) + job = PipelineJob( + os.path.join(self.input_dir, file), + os.path.join(self.output_dir, file) + ) + self.jobs.append(job) def workflow(self): if not self.jobs: @@ -469,6 +499,20 @@ class MainWorkflow(WorkflowRunner): ) create_tei_tasks.append(task) + ''' + ' ################################################## + ' # create-poco-zip # + ' ################################################## + ''' + create_poco_zip_tasks = [] + for i, job in enumerate(self.jobs): + task = self.addWorkflowTask( + 'create_poco_zip_-_{}'.format(i), + CreatePoCoZipWorkflow(job), + dependencies='create_tei_-_{}'.format(i) + ) + create_poco_zip_tasks.append(task) + ''' ' ################################################## ' # create-txt # @@ -488,45 +532,36 @@ class MainWorkflow(WorkflowRunner): # Remove temporary directory os.rmdir(job.tmp_dir) # Track output files - relative_input = os.path.relpath(job.file, start=self.input_dir) relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa - for x in os.listdir(os.path.join(job.output_dir, 'images')): - self.output_files.append( - { - 'input': relative_input, - 'path': os.path.join(relative_output_dir, 'images', x), - 'mimetype': 'image/png' - } - ) self.output_files.append( { - 'input': relative_input, - 'path': os.path.join(relative_output_dir, '{}.hocr'.format(job.name)), # noqa - 'mimetype': 'application/xhtml+xml' + 'description': 'Post correction package (.png and .hocr).', + 'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa + 'mimetype': 'application/zip' } ) self.output_files.append( { - 'input': relative_input, - 'filename': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa + 'description': 'PDF file with text layer.', + 'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa 'mimetype': 'application/pdf' } ) self.output_files.append( { - 'input': relative_input, - 'filename': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa + 'description': 'Plain text file.', + 'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa 'mimetype': 'text/plain' } ) self.output_files.append( { - 'input': relative_input, - 'filename': os.path.join(relative_output_dir, '{}.xml'.format(job.name)), # noqa + 'description': 'TEI compliant XML file.', + 'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa 'mimetype': 'application/tei+xml' } ) - with open(os.path.join(self.output_dir, 'output_files.json'), 'w') as f: + with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa json.dump(self.output_files, f, indent=4)