Codestyle enhancements

This commit is contained in:
Patrick Jentsch 2022-01-27 13:40:23 +01:00
parent aeab9b7802
commit 4518ca1c83
5 changed files with 187 additions and 82 deletions

View File

@ -26,8 +26,24 @@ This software implements a heavily parallelized pipeline to recognize text in PD
3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
```bash
cd /<my_data_location>
ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments>
# or
ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments>
# <model_code> is the model filename without the ".traineddata" suffix
ocr \
--input-dir input \
--output-dir output \
--model-file models/<model> \
-m <model_code> <optional_pipeline_arguments>
# More than one model
ocr \
--input-dir input \
--output-dir output \
--model-file models/<model1> \
--model-file models/<model2> \
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
# Instead of multiple --model-file statements, you can also use
ocr \
--input-dir input \
--output-dir output \
--model-file models/* \
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
```
4. Check your results in the `/<my_data_location>/output` directory.

View File

@ -1,33 +1,42 @@
#!/usr/bin/env python3.7
# coding=utf-8
""""Combine multiple hOCR files."""
''' Combine multiple hOCR files. '''
from argparse import ArgumentParser
from lxml import html
parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument('file', help='Input file(s)', nargs='+')
parser.add_argument('-o', '--output-file', help='Output file', required=True)
parser.add_argument(
'-i', '--input-file',
help='Input file',
nargs='+',
required=True
)
parser.add_argument(
'-o', '--output-file',
help='Output file',
required=True
)
args = parser.parse_args()
print(args)
for file in args.file:
files = []
if file.startswith('@'):
with open(file[1:], 'r') as f:
files += [x for x in f.read().split("\n") if x != '']
for input_file in args.input_file:
input_files = []
if input_file.startswith('@'):
with open(input_file[1:], 'r') as f:
input_files += [x for x in f.read().split("\n") if x != '']
else:
files.append(file)
if len(files) == 0:
input_files.append(input_file)
if len(input_files) == 0:
exit(1)
hocr = html.parse(files[0])
hocr = html.parse(input_files[0])
hocr_body = hocr.find('body')
for file in files[1:]:
for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
for input_file in input_files[1:]:
for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
hocr_body.append(ocr_page)

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python3.7
# coding=utf-8
""""Convert hOCR to TEI XML."""
''' Convert hOCR to TEI XML. '''
from argparse import ArgumentParser
from lxml import html
@ -10,8 +10,15 @@ import re
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument('file', help='Input file')
parser.add_argument('-o', '--output-file', help='Output file', required=True)
parser.add_argument(
'-i', '--input-file',
help='Input file'
)
parser.add_argument(
'-o', '--output-file',
help='Output file',
required=True
)
args = parser.parse_args()
@ -32,7 +39,7 @@ tei += ' </fileDesc>\n'
tei += ' </teiHeader>\n'
tei += ' <text>\n'
tei += ' <body>\n'
hocr = html.parse(args.file)
hocr = html.parse(args.input_file)
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
ocr_page_title_attrib = ocr_page.attrib.get('title')
facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
tei += ' <p>\n'
for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
tei += ' <lb/>'
indent = ''
is_first_word_in_line = True
for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
if ocrx_word.text is not None:
tei += indent + escape(ocrx_word.text)
indent = ' '
if not is_first_word_in_line:
tei += ' '
tei += escape(ocrx_word.text)
is_first_word_in_line = False
tei += '\n'
tei += ' </p>\n'
tei += ' </body>\n'

179
ocr
View File

@ -8,7 +8,6 @@ __version__ = '0.1.0'
from argparse import ArgumentParser
from pyflow import WorkflowRunner
import json
import multiprocessing
import os
import sys
@ -52,8 +51,8 @@ class SplitInputWorkflow(WorkflowRunner):
cmd += ' -dQUIET'
cmd += ' -r300'
cmd += ' -sDEVICE=png16m'
cmd += ' -sOutputFile="{}/page-%d.png"'.format(
os.path.join(self.job.tmp_dir, 'images')
cmd += ' -sOutputFile="{}"'.format(
os.path.join(self.job.tmp_dir, 'images', 'page-%d.png')
)
cmd += ' "{}"'.format(self.job.file)
self.addTask(
@ -82,13 +81,18 @@ class BinarizationWorkflow(WorkflowRunner):
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
)
cmd += ' && '
cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa
cmd += 'ocropus-nlbin "@{}"'.format(
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
)
cmd += ' --nocheck'
cmd += ' --output "{}"'.format(
os.path.join(self.job.tmp_dir, 'images'))
os.path.join(self.job.tmp_dir, 'images')
)
cmd += ' --parallel "{}"'.format(n_cores)
cmd += ' && '
cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa
cmd += 'rm "{}"'.format(
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
)
ocropus_nlbin_task = self.addTask(
'ocropus_nlbin',
command=cmd,
@ -130,9 +134,9 @@ class BinarizationWorkflow(WorkflowRunner):
class OCRWorkflow(WorkflowRunner):
def __init__(self, job, lang):
def __init__(self, job, model):
self.job = job
self.lang = lang
self.model = model
def workflow(self):
'''
@ -148,7 +152,7 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(self.job.tmp_dir, 'images', file),
os.path.join(self.job.tmp_dir, file[:-4])
)
cmd += ' -l "{}"'.format(self.lang)
cmd += ' -l "{}"'.format(self.model)
cmd += ' hocr pdf txt'
cmd += ' || '
cmd += 'echo "${?}"'
@ -166,6 +170,7 @@ class OCRWorkflow(WorkflowRunner):
' # move_files #
' ##################################################
'''
move_files_tasks = []
n_cores = 1
mem_mb = min(128, self.getMemMb())
for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
@ -174,24 +179,26 @@ class OCRWorkflow(WorkflowRunner):
file_extension,
os.path.join(self.job.tmp_dir, file_extension)
)
self.addTask(
task = self.addTask(
'move_{}_files'.format(file_extension),
command=cmd,
dependencies=tesseract_tasks,
memMb=mem_mb,
nCores=n_cores
)
move_files_tasks.append(task)
cmd = 'mv "{}" "{}"'.format(
os.path.join(self.job.tmp_dir, 'images'),
os.path.join(self.job.output_dir)
)
self.addTask(
task = self.addTask(
'move_image_files',
command=cmd,
dependencies=tesseract_tasks,
memMb=mem_mb,
nCores=n_cores
)
move_files_tasks.append(task)
class CreateHOCRWorkflow(WorkflowRunner):
@ -256,13 +263,14 @@ class CreateHOCRWorkflow(WorkflowRunner):
' ##################################################
'''
n_cores = 1
mem_mb = min(512, self.getMemMb())
mem_mb = min(256, self.getMemMb())
cmd = 'ls -dv "{}/"* > "{}"'.format(
os.path.join(self.job.tmp_dir, 'hocr'),
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
)
cmd += ' && '
cmd += 'hocr-combine "@{}"'.format(
cmd += 'hocr-combine'
cmd += ' --input-file "@{}"'.format(
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
)
cmd += ' --output-file "{}.hocr"'.format(
@ -301,12 +309,17 @@ class CreatePDFWorkflow(WorkflowRunner):
cmd += ' -dPDFSETTINGS=/ebook'
cmd += ' -dQUIET'
cmd += ' -sDEVICE=pdfwrite'
cmd += ' -sOutputFile="{}.pdf"'.format(
os.path.join(self.job.output_dir, self.job.name)
cmd += ' -sOutputFile="{}"'.format(
os.path.join(self.job.output_dir, '{}.pdf'.format(self.job.name))
)
cmd += ' && '
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
self.addTask(
'pdf_combine',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
class CreateTEIWorkflow(WorkflowRunner):
@ -320,14 +333,23 @@ class CreateTEIWorkflow(WorkflowRunner):
' ##################################################
'''
n_cores = 1
mem_mb = min(512, self.getMemMb())
cmd = 'hocr2tei "{}.hocr"'.format(
os.path.join(self.job.output_dir, self.job.name)
mem_mb = min(256, self.getMemMb())
cmd = 'hocr2tei'
cmd += ' --input-file "{}"'.format(
os.path.join(self.job.output_dir, '{}.hocr'.format(self.job.name))
)
cmd += ' --output-file "{}.tei.xml"'.format(
os.path.join(self.job.output_dir, self.job.name)
cmd += ' --output-file "{}"'.format(
os.path.join(
self.job.output_dir,
'{}.tei.xml'.format(self.job.name)
)
)
self.addTask(
'hocr2tei',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
class CreatePoCoZipWorkflow(WorkflowRunner):
@ -354,7 +376,12 @@ class CreatePoCoZipWorkflow(WorkflowRunner):
cmd += 'rm -r images'
cmd += ' && '
cmd += 'cd -'
task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores)
task = self.addTask(
'zip',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
zip_tasks.append(task)
@ -377,13 +404,18 @@ class CreateTxtWorkflow(WorkflowRunner):
cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name)) # noqa
cmd += ' && '
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
self.addTask(
'txt_combine',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
class MainWorkflow(WorkflowRunner):
def __init__(self, input_dir, lang, output_dir, binarize):
def __init__(self, input_dir, model, output_dir, binarize):
self.input_dir = input_dir
self.lang = lang
self.model = model
self.output_dir = output_dir
self.binarize = binarize
self.jobs = []
@ -419,11 +451,13 @@ class MainWorkflow(WorkflowRunner):
' # split-input #
' ##################################################
'''
split_input_tasks = []
for i, job in enumerate(self.jobs):
self.addWorkflowTask(
task = self.addWorkflowTask(
'split_input_-_{}'.format(i),
SplitInputWorkflow(job)
)
split_input_tasks.append(task)
if self.binarize:
'''
@ -431,12 +465,14 @@ class MainWorkflow(WorkflowRunner):
' # binarization #
' ##################################################
'''
binarization_tasks = []
for i, job in enumerate(self.jobs):
self.addWorkflowTask(
task = self.addWorkflowTask(
'binarization_-_{}'.format(i),
BinarizationWorkflow(job),
dependencies='split_input_-_{}'.format(i)
)
binarization_tasks.append(task)
'''
' ##################################################
@ -451,7 +487,7 @@ class MainWorkflow(WorkflowRunner):
deps = 'split_input_-_{}'.format(i)
task = self.addWorkflowTask(
'ocr_-_{}'.format(i),
OCRWorkflow(job, self.lang),
OCRWorkflow(job, self.model),
dependencies=deps
)
ocr_tasks.append(task)
@ -527,55 +563,80 @@ class MainWorkflow(WorkflowRunner):
create_txt_tasks.append(task)
self.waitForTasks()
output_files = []
outputs = []
for job in self.jobs:
# Remove temporary directory
os.rmdir(job.tmp_dir)
# Track output files
relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa
output_files.append(
relative_output_dir = os.path.relpath(
job.output_dir,
start=self.output_dir
)
outputs.append(
{
'description': 'Post correction package (.png and .hocr).',
'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa
'file': os.path.join(
relative_output_dir,
'{}.poco.zip'.format(job.name)
),
'mimetype': 'application/zip'
}
)
output_files.append(
outputs.append(
{
'description': 'PDF file with text layer.',
'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
'file': os.path.join(
relative_output_dir,
'{}.pdf'.format(job.name)
),
'mimetype': 'application/pdf'
}
)
output_files.append(
outputs.append(
{
'description': 'Plain text file.',
'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
'file': os.path.join(
relative_output_dir,
'{}.txt'.format(job.name)
),
'mimetype': 'text/plain'
}
)
output_files.append(
outputs.append(
{
'description': 'TEI compliant XML file.',
'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa
'file': os.path.join(
relative_output_dir,
'{}.tei.xml'.format(job.name)
),
'mimetype': 'application/tei+xml'
}
)
with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa
json.dump(output_files, f, indent=4)
with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
json.dump(outputs, f, indent=4)
def parse_args():
parser = ArgumentParser(description='Pipeline for PDF file OCR processing')
parser = ArgumentParser(
description='Pipeline for PDF file OCR processing'
)
parser.add_argument(
'-i', '--input-dir', help='Input directory', required=True)
'-i', '--input-dir',
help='Input directory',
required=True
)
parser.add_argument(
'-o', '--output-dir', help='Output directory', required=True)
'-o', '--output-dir',
help='Output directory',
required=True
)
parser.add_argument(
'-l', '--language',
choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata')
if x.endswith('.traineddata') and len(x) > 12],
help='Language of the input (3-character ISO 639-2 language codes)',
'-m', '--model',
choices=[
x[:-12] for x in os.listdir('/usr/local/share/tessdata')
if x.endswith('.traineddata') and len(x) > 12
],
help='Name of the model to be used',
required=True
)
parser.add_argument(
@ -584,16 +645,19 @@ def parse_args():
help='Add binarization as a preprocessing step'
)
parser.add_argument(
'--log-dir', help='Logging directory (Default: --output-dir)')
'--log-dir',
help='Logging directory (Default: --output-dir)'
)
parser.add_argument(
'--mem-mb',
help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))', # noqa
help='Amount of system memory to be used '
'(Default: min(--n-cores * 512, available system memory))',
type=int
)
parser.add_argument(
'--n-cores',
default=min(4, multiprocessing.cpu_count()),
help='Number of CPU threads to be used (Default: min(4, CPU count))',
default=1,
help='Number of CPU threads to be used',
type=int
)
parser.add_argument(
@ -620,10 +684,17 @@ def parse_args():
def main():
args = parse_args()
main_workflow = MainWorkflow(
args.input_dir, args.language, args.output_dir, args.binarize)
args.input_dir,
args.model,
args.output_dir,
args.binarize
)
main_workflow.collect_jobs()
retval = main_workflow.run(
dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
dataDirRoot=args.log_dir,
memMb=args.mem_mb,
nCores=args.n_cores
)
sys.exit(retval)

View File

@ -17,7 +17,7 @@ GID = str(os.getgid())
parser = ArgumentParser(add_help=False)
parser.add_argument('-i', '--input-dir')
parser.add_argument('-o', '--output-dir')
parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+')
parser.add_argument('-t', '--model-file', action='extend', nargs='+')
parser.add_argument('--log-dir')
args, remaining_args = parser.parse_known_args()
@ -30,9 +30,9 @@ if args.output_dir is not None:
mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
cmd += ['-v', mapping]
remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
if args.models is not None:
for model in args.models:
mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}' # noqa
if args.model_file is not None:
for model_file in args.model_file:
mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}' # noqa
cmd += ['-v', mapping]
if args.log_dir is not None:
# Must be an f-string: without the prefix the braces are passed through
# literally instead of being replaced by the host and container log paths.
mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'