Codestyle enhancements

This commit is contained in:
Patrick Jentsch 2022-01-27 13:40:23 +01:00
parent aeab9b7802
commit 4518ca1c83
5 changed files with 187 additions and 82 deletions

View File

@ -26,8 +26,24 @@ This software implements a heavily parallelized pipeline to recognize text in PD
3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details. 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
```bash ```bash
cd /<my_data_location> cd /<my_data_location>
ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments> # <model_code> is the model filename without the ".traineddata" suffix
# or ocr \
ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments> --input-dir input \
--output-dir output \
--model-file models/<model> \
-m <model_code> <optional_pipeline_arguments>
# More than one model
ocr \
--input-dir input \
--output-dir output \
--model-file models/<model1> \
--model-file models/<model2> \
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
# Instead of multiple --model-file statements, you can also use
ocr \
--input-dir input \
--output-dir output \
--model-file models/* \
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
``` ```
4. Check your results in the `/<my_data_location>/output` directory. 4. Check your results in the `/<my_data_location>/output` directory.

View File

@ -1,33 +1,42 @@
#!/usr/bin/env python3.7 #!/usr/bin/env python3.7
# coding=utf-8 # coding=utf-8
""""Combine multiple hOCR files.""" ''' Combine multiple hOCR files. '''
from argparse import ArgumentParser from argparse import ArgumentParser
from lxml import html from lxml import html
parser = ArgumentParser(description='Combine multiple hOCR files.') parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument('file', help='Input file(s)', nargs='+') parser.add_argument(
parser.add_argument('-o', '--output-file', help='Output file', required=True) '-i', '--input-file',
help='Input file',
nargs='+',
required=True
)
parser.add_argument(
'-o', '--output-file',
help='Output file',
required=True
)
args = parser.parse_args() args = parser.parse_args()
print(args)
for input_file in args.input_file:
for file in args.file: input_files = []
files = [] if input_file.startswith('@'):
if file.startswith('@'): with open(input_file[1:], 'r') as f:
with open(file[1:], 'r') as f: input_files += [x for x in f.read().split("\n") if x != '']
files += [x for x in f.read().split("\n") if x != '']
else: else:
files.append(file) input_files.append(input_file)
if len(files) == 0: if len(input_files) == 0:
exit(1) exit(1)
hocr = html.parse(files[0]) hocr = html.parse(input_files[0])
hocr_body = hocr.find('body') hocr_body = hocr.find('body')
for file in files[1:]: for input_file in input_files[1:]:
for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'): for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
hocr_body.append(ocr_page) hocr_body.append(ocr_page)

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python3.7 #!/usr/bin/env python3.7
# coding=utf-8 # coding=utf-8
""""Convert hOCR to TEI XML.""" ''' Convert hOCR to TEI XML. '''
from argparse import ArgumentParser from argparse import ArgumentParser
from lxml import html from lxml import html
@ -10,8 +10,15 @@ import re
parser = ArgumentParser(description='Convert hOCR to TEI XML.') parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument('file', help='Input file') parser.add_argument(
parser.add_argument('-o', '--output-file', help='Output file', required=True) '-i', '--input-file',
help='Input file'
)
parser.add_argument(
'-o', '--output-file',
help='Output file',
required=True
)
args = parser.parse_args() args = parser.parse_args()
@ -32,7 +39,7 @@ tei += ' </fileDesc>\n'
tei += ' </teiHeader>\n' tei += ' </teiHeader>\n'
tei += ' <text>\n' tei += ' <text>\n'
tei += ' <body>\n' tei += ' <body>\n'
hocr = html.parse(args.file) hocr = html.parse(args.input_file)
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'): for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
ocr_page_title_attrib = ocr_page.attrib.get('title') ocr_page_title_attrib = ocr_page.attrib.get('title')
facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1) facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
tei += ' <p>\n' tei += ' <p>\n'
for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'): for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
tei += ' <lb/>' tei += ' <lb/>'
indent = '' is_first_word_in_line = True
for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'): for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
if ocrx_word.text is not None: if ocrx_word.text is not None:
tei += indent + escape(ocrx_word.text) if not is_first_word_in_line:
indent = ' ' tei += ' '
tei += escape(ocrx_word.text)
is_first_word_in_line = False
tei += '\n' tei += '\n'
tei += ' </p>\n' tei += ' </p>\n'
tei += ' </body>\n' tei += ' </body>\n'

179
ocr
View File

@ -8,7 +8,6 @@ __version__ = '0.1.0'
from argparse import ArgumentParser from argparse import ArgumentParser
from pyflow import WorkflowRunner from pyflow import WorkflowRunner
import json import json
import multiprocessing
import os import os
import sys import sys
@ -52,8 +51,8 @@ class SplitInputWorkflow(WorkflowRunner):
cmd += ' -dQUIET' cmd += ' -dQUIET'
cmd += ' -r300' cmd += ' -r300'
cmd += ' -sDEVICE=png16m' cmd += ' -sDEVICE=png16m'
cmd += ' -sOutputFile="{}/page-%d.png"'.format( cmd += ' -sOutputFile="{}"'.format(
os.path.join(self.job.tmp_dir, 'images') os.path.join(self.job.tmp_dir, 'images', 'page-%d.png')
) )
cmd += ' "{}"'.format(self.job.file) cmd += ' "{}"'.format(self.job.file)
self.addTask( self.addTask(
@ -82,13 +81,18 @@ class BinarizationWorkflow(WorkflowRunner):
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt') os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
) )
cmd += ' && ' cmd += ' && '
cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa cmd += 'ocropus-nlbin "@{}"'.format(
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
)
cmd += ' --nocheck' cmd += ' --nocheck'
cmd += ' --output "{}"'.format( cmd += ' --output "{}"'.format(
os.path.join(self.job.tmp_dir, 'images')) os.path.join(self.job.tmp_dir, 'images')
)
cmd += ' --parallel "{}"'.format(n_cores) cmd += ' --parallel "{}"'.format(n_cores)
cmd += ' && ' cmd += ' && '
cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa cmd += 'rm "{}"'.format(
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
)
ocropus_nlbin_task = self.addTask( ocropus_nlbin_task = self.addTask(
'ocropus_nlbin', 'ocropus_nlbin',
command=cmd, command=cmd,
@ -130,9 +134,9 @@ class BinarizationWorkflow(WorkflowRunner):
class OCRWorkflow(WorkflowRunner): class OCRWorkflow(WorkflowRunner):
def __init__(self, job, lang): def __init__(self, job, model):
self.job = job self.job = job
self.lang = lang self.model = model
def workflow(self): def workflow(self):
''' '''
@ -148,7 +152,7 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(self.job.tmp_dir, 'images', file), os.path.join(self.job.tmp_dir, 'images', file),
os.path.join(self.job.tmp_dir, file[:-4]) os.path.join(self.job.tmp_dir, file[:-4])
) )
cmd += ' -l "{}"'.format(self.lang) cmd += ' -l "{}"'.format(self.model)
cmd += ' hocr pdf txt' cmd += ' hocr pdf txt'
cmd += ' || ' cmd += ' || '
cmd += 'echo "${?}"' cmd += 'echo "${?}"'
@ -166,6 +170,7 @@ class OCRWorkflow(WorkflowRunner):
' # move_files # ' # move_files #
' ################################################## ' ##################################################
''' '''
move_files_tasks = []
n_cores = 1 n_cores = 1
mem_mb = min(128, self.getMemMb()) mem_mb = min(128, self.getMemMb())
for i, file_extension in enumerate(['hocr', 'pdf', 'txt']): for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
@ -174,24 +179,26 @@ class OCRWorkflow(WorkflowRunner):
file_extension, file_extension,
os.path.join(self.job.tmp_dir, file_extension) os.path.join(self.job.tmp_dir, file_extension)
) )
self.addTask( task = self.addTask(
'move_{}_files'.format(file_extension), 'move_{}_files'.format(file_extension),
command=cmd, command=cmd,
dependencies=tesseract_tasks, dependencies=tesseract_tasks,
memMb=mem_mb, memMb=mem_mb,
nCores=n_cores nCores=n_cores
) )
move_files_tasks.append(task)
cmd = 'mv "{}" "{}"'.format( cmd = 'mv "{}" "{}"'.format(
os.path.join(self.job.tmp_dir, 'images'), os.path.join(self.job.tmp_dir, 'images'),
os.path.join(self.job.output_dir) os.path.join(self.job.output_dir)
) )
self.addTask( task = self.addTask(
'move_image_files', 'move_image_files',
command=cmd, command=cmd,
dependencies=tesseract_tasks, dependencies=tesseract_tasks,
memMb=mem_mb, memMb=mem_mb,
nCores=n_cores nCores=n_cores
) )
move_files_tasks.append(task)
class CreateHOCRWorkflow(WorkflowRunner): class CreateHOCRWorkflow(WorkflowRunner):
@ -256,13 +263,14 @@ class CreateHOCRWorkflow(WorkflowRunner):
' ################################################## ' ##################################################
''' '''
n_cores = 1 n_cores = 1
mem_mb = min(512, self.getMemMb()) mem_mb = min(256, self.getMemMb())
cmd = 'ls -dv "{}/"* > "{}"'.format( cmd = 'ls -dv "{}/"* > "{}"'.format(
os.path.join(self.job.tmp_dir, 'hocr'), os.path.join(self.job.tmp_dir, 'hocr'),
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt') os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
) )
cmd += ' && ' cmd += ' && '
cmd += 'hocr-combine "@{}"'.format( cmd += 'hocr-combine'
cmd += ' --input-file "@{}"'.format(
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt') os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
) )
cmd += ' --output-file "{}.hocr"'.format( cmd += ' --output-file "{}.hocr"'.format(
@ -301,12 +309,17 @@ class CreatePDFWorkflow(WorkflowRunner):
cmd += ' -dPDFSETTINGS=/ebook' cmd += ' -dPDFSETTINGS=/ebook'
cmd += ' -dQUIET' cmd += ' -dQUIET'
cmd += ' -sDEVICE=pdfwrite' cmd += ' -sDEVICE=pdfwrite'
cmd += ' -sOutputFile="{}.pdf"'.format( cmd += ' -sOutputFile="{}"'.format(
os.path.join(self.job.output_dir, self.job.name) os.path.join(self.job.output_dir, '{}.pdf'.format(self.job.name))
) )
cmd += ' && ' cmd += ' && '
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf')) cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores) self.addTask(
'pdf_combine',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
class CreateTEIWorkflow(WorkflowRunner): class CreateTEIWorkflow(WorkflowRunner):
@ -320,14 +333,23 @@ class CreateTEIWorkflow(WorkflowRunner):
' ################################################## ' ##################################################
''' '''
n_cores = 1 n_cores = 1
mem_mb = min(512, self.getMemMb()) mem_mb = min(256, self.getMemMb())
cmd = 'hocr2tei "{}.hocr"'.format( cmd = 'hocr2tei'
os.path.join(self.job.output_dir, self.job.name) cmd += ' --input-file "{}"'.format(
os.path.join(self.job.output_dir, '{}.hocr'.format(self.job.name))
) )
cmd += ' --output-file "{}.tei.xml"'.format( cmd += ' --output-file "{}"'.format(
os.path.join(self.job.output_dir, self.job.name) os.path.join(
self.job.output_dir,
'{}.tei.xml'.format(self.job.name)
)
)
self.addTask(
'hocr2tei',
command=cmd,
memMb=mem_mb,
nCores=n_cores
) )
self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
class CreatePoCoZipWorkflow(WorkflowRunner): class CreatePoCoZipWorkflow(WorkflowRunner):
@ -354,7 +376,12 @@ class CreatePoCoZipWorkflow(WorkflowRunner):
cmd += 'rm -r images' cmd += 'rm -r images'
cmd += ' && ' cmd += ' && '
cmd += 'cd -' cmd += 'cd -'
task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores) task = self.addTask(
'zip',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
zip_tasks.append(task) zip_tasks.append(task)
@ -377,13 +404,18 @@ class CreateTxtWorkflow(WorkflowRunner):
cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name)) # noqa cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name)) # noqa
cmd += ' && ' cmd += ' && '
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt')) cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores) self.addTask(
'txt_combine',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
class MainWorkflow(WorkflowRunner): class MainWorkflow(WorkflowRunner):
def __init__(self, input_dir, lang, output_dir, binarize): def __init__(self, input_dir, model, output_dir, binarize):
self.input_dir = input_dir self.input_dir = input_dir
self.lang = lang self.model = model
self.output_dir = output_dir self.output_dir = output_dir
self.binarize = binarize self.binarize = binarize
self.jobs = [] self.jobs = []
@ -419,11 +451,13 @@ class MainWorkflow(WorkflowRunner):
' # split-input # ' # split-input #
' ################################################## ' ##################################################
''' '''
split_input_tasks = []
for i, job in enumerate(self.jobs): for i, job in enumerate(self.jobs):
self.addWorkflowTask( task = self.addWorkflowTask(
'split_input_-_{}'.format(i), 'split_input_-_{}'.format(i),
SplitInputWorkflow(job) SplitInputWorkflow(job)
) )
split_input_tasks.append(task)
if self.binarize: if self.binarize:
''' '''
@ -431,12 +465,14 @@ class MainWorkflow(WorkflowRunner):
' # binarization # ' # binarization #
' ################################################## ' ##################################################
''' '''
binarization_tasks = []
for i, job in enumerate(self.jobs): for i, job in enumerate(self.jobs):
self.addWorkflowTask( task = self.addWorkflowTask(
'binarization_-_{}'.format(i), 'binarization_-_{}'.format(i),
BinarizationWorkflow(job), BinarizationWorkflow(job),
dependencies='split_input_-_{}'.format(i) dependencies='split_input_-_{}'.format(i)
) )
binarization_tasks.append(task)
''' '''
' ################################################## ' ##################################################
@ -451,7 +487,7 @@ class MainWorkflow(WorkflowRunner):
deps = 'split_input_-_{}'.format(i) deps = 'split_input_-_{}'.format(i)
task = self.addWorkflowTask( task = self.addWorkflowTask(
'ocr_-_{}'.format(i), 'ocr_-_{}'.format(i),
OCRWorkflow(job, self.lang), OCRWorkflow(job, self.model),
dependencies=deps dependencies=deps
) )
ocr_tasks.append(task) ocr_tasks.append(task)
@ -527,55 +563,80 @@ class MainWorkflow(WorkflowRunner):
create_txt_tasks.append(task) create_txt_tasks.append(task)
self.waitForTasks() self.waitForTasks()
output_files = [] outputs = []
for job in self.jobs: for job in self.jobs:
# Remove temporary directory # Remove temporary directory
os.rmdir(job.tmp_dir) os.rmdir(job.tmp_dir)
# Track output files # Track output files
relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa relative_output_dir = os.path.relpath(
output_files.append( job.output_dir,
start=self.output_dir
)
outputs.append(
{ {
'description': 'Post correction package (.png and .hocr).', 'description': 'Post correction package (.png and .hocr).',
'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa 'file': os.path.join(
relative_output_dir,
'{}.poco.zip'.format(job.name)
),
'mimetype': 'application/zip' 'mimetype': 'application/zip'
} }
) )
output_files.append( outputs.append(
{ {
'description': 'PDF file with text layer.', 'description': 'PDF file with text layer.',
'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa 'file': os.path.join(
relative_output_dir,
'{}.pdf'.format(job.name)
),
'mimetype': 'application/pdf' 'mimetype': 'application/pdf'
} }
) )
output_files.append( outputs.append(
{ {
'description': 'Plain text file.', 'description': 'Plain text file.',
'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa 'file': os.path.join(
relative_output_dir,
'{}.txt'.format(job.name)
),
'mimetype': 'text/plain' 'mimetype': 'text/plain'
} }
) )
output_files.append( outputs.append(
{ {
'description': 'TEI compliant XML file.', 'description': 'TEI compliant XML file.',
'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa 'file': os.path.join(
relative_output_dir,
'{}.tei.xml'.format(job.name)
),
'mimetype': 'application/tei+xml' 'mimetype': 'application/tei+xml'
} }
) )
with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
json.dump(output_files, f, indent=4) json.dump(outputs, f, indent=4)
def parse_args(): def parse_args():
parser = ArgumentParser(description='Pipeline for PDF file OCR processing') parser = ArgumentParser(
description='Pipeline for PDF file OCR processing'
)
parser.add_argument( parser.add_argument(
'-i', '--input-dir', help='Input directory', required=True) '-i', '--input-dir',
help='Input directory',
required=True
)
parser.add_argument( parser.add_argument(
'-o', '--output-dir', help='Output directory', required=True) '-o', '--output-dir',
help='Output directory',
required=True
)
parser.add_argument( parser.add_argument(
'-l', '--language', '-m', '--model',
choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata') choices=[
if x.endswith('.traineddata') and len(x) > 12], x[:-12] for x in os.listdir('/usr/local/share/tessdata')
help='Language of the input (3-character ISO 639-2 language codes)', if x.endswith('.traineddata') and len(x) > 12
],
help='Name of the model to be used',
required=True required=True
) )
parser.add_argument( parser.add_argument(
@ -584,16 +645,19 @@ def parse_args():
help='Add binarization as a preprocessing step' help='Add binarization as a preprocessing step'
) )
parser.add_argument( parser.add_argument(
'--log-dir', help='Logging directory (Default: --output-dir)') '--log-dir',
help='Logging directory (Default: --output-dir)'
)
parser.add_argument( parser.add_argument(
'--mem-mb', '--mem-mb',
help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))', # noqa help='Amount of system memory to be used '
'(Default: min(--n-cores * 512, available system memory))',
type=int type=int
) )
parser.add_argument( parser.add_argument(
'--n-cores', '--n-cores',
default=min(4, multiprocessing.cpu_count()), default=1,
help='Number of CPU threads to be used (Default: min(4, CPU count))', help='Number of CPU threads to be used',
type=int type=int
) )
parser.add_argument( parser.add_argument(
@ -620,10 +684,17 @@ def parse_args():
def main(): def main():
args = parse_args() args = parse_args()
main_workflow = MainWorkflow( main_workflow = MainWorkflow(
args.input_dir, args.language, args.output_dir, args.binarize) args.input_dir,
args.model,
args.output_dir,
args.binarize
)
main_workflow.collect_jobs() main_workflow.collect_jobs()
retval = main_workflow.run( retval = main_workflow.run(
dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) dataDirRoot=args.log_dir,
memMb=args.mem_mb,
nCores=args.n_cores
)
sys.exit(retval) sys.exit(retval)

View File

@ -17,7 +17,7 @@ GID = str(os.getgid())
parser = ArgumentParser(add_help=False) parser = ArgumentParser(add_help=False)
parser.add_argument('-i', '--input-dir') parser.add_argument('-i', '--input-dir')
parser.add_argument('-o', '--output-dir') parser.add_argument('-o', '--output-dir')
parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+') parser.add_argument('-t', '--model-file', action='extend', nargs='+')
parser.add_argument('--log-dir') parser.add_argument('--log-dir')
args, remaining_args = parser.parse_known_args() args, remaining_args = parser.parse_known_args()
@ -30,9 +30,9 @@ if args.output_dir is not None:
mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}' mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
cmd += ['-v', mapping] cmd += ['-v', mapping]
remaining_args += ['-o', CONTAINER_OUTPUT_DIR] remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
if args.models is not None: if args.model_file is not None:
for model in args.models: for model_file in args.model_file:
mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}' # noqa mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}' # noqa
cmd += ['-v', mapping] cmd += ['-v', mapping]
if args.log_dir is not None: if args.log_dir is not None:
mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}' mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'