From 4518ca1c8308a1a3f7e3ce1f68be8a739fc7bd0c Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Thu, 27 Jan 2022 13:40:23 +0100
Subject: [PATCH] Codestyle enhancements
---
README.md | 22 ++++++-
hocr-combine | 37 +++++++----
hocr2tei | 23 +++++--
ocr | 179 +++++++++++++++++++++++++++++++++++----------------
wrapper/ocr | 8 +--
5 files changed, 187 insertions(+), 82 deletions(-)
diff --git a/README.md b/README.md
index 98834e5..a9ec050 100644
--- a/README.md
+++ b/README.md
@@ -26,8 +26,24 @@ This software implements a heavily parallelized pipeline to recognize text in PDF files.
3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
```bash
cd /<your_data_location>
-ocr -i input -o output -m models/<model_file> -l <language>
-# or
-ocr -i input -o output -m models/* -l <language>
+# <model> is the model filename without the ".traineddata" suffix
+ocr \
+    --input-dir input \
+    --output-dir output \
+    --model-file models/<model_file> \
+    -m <model>
+# More than one model
+ocr \
+    --input-dir input \
+    --output-dir output \
+    --model-file models/<model1_file> \
+    --model-file models/<model2_file> \
+    -m <model1>+<model2>
+# Instead of multiple --model-file arguments, you can also use
+ocr \
+    --input-dir input \
+    --output-dir output \
+    --model-file models/* \
+    -m <model1>+<model2>
```
4. Check your results in the `/<your_data_location>/output` directory.
diff --git a/hocr-combine b/hocr-combine
index 4008890..d4b606c 100755
--- a/hocr-combine
+++ b/hocr-combine
@@ -1,33 +1,42 @@
#!/usr/bin/env python3.7
# coding=utf-8
-""""Combine multiple hOCR files."""
+''' Combine multiple hOCR files. '''
from argparse import ArgumentParser
from lxml import html
parser = ArgumentParser(description='Combine multiple hOCR files.')
-parser.add_argument('file', help='Input file(s)', nargs='+')
-parser.add_argument('-o', '--output-file', help='Output file', required=True)
+parser.add_argument(
+ '-i', '--input-file',
+ help='Input file',
+ nargs='+',
+ required=True
+)
+parser.add_argument(
+ '-o', '--output-file',
+ help='Output file',
+ required=True
+)
args = parser.parse_args()
-files = []
-for file in args.file:
-    if file.startswith('@'):
-        with open(file[1:], 'r') as f:
-            files += [x for x in f.read().split("\n") if x != '']
-    else:
-        files.append(file)
-if len(files) == 0:
+input_files = []
+for input_file in args.input_file:
+    if input_file.startswith('@'):
+        with open(input_file[1:], 'r') as f:
+            input_files += [x for x in f.read().split("\n") if x != '']
+    else:
+        input_files.append(input_file)
+if len(input_files) == 0:
exit(1)
-hocr = html.parse(files[0])
+hocr = html.parse(input_files[0])
hocr_body = hocr.find('body')
-for file in files[1:]:
- for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
+for input_file in input_files[1:]:
+ for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
hocr_body.append(ocr_page)
diff --git a/hocr2tei b/hocr2tei
index 04a3db7..0350efb 100755
--- a/hocr2tei
+++ b/hocr2tei
@@ -1,7 +1,7 @@
#!/usr/bin/env python3.7
# coding=utf-8
-""""Convert hOCR to TEI XML."""
+''' Convert hOCR to TEI XML. '''
from argparse import ArgumentParser
from lxml import html
@@ -10,8 +10,16 @@ import re
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
-parser.add_argument('file', help='Input file')
-parser.add_argument('-o', '--output-file', help='Output file', required=True)
+parser.add_argument(
+    '-i', '--input-file',
+    help='Input file',
+    required=True
+)
+parser.add_argument(
+ '-o', '--output-file',
+ help='Output file',
+ required=True
+)
args = parser.parse_args()
@@ -32,7 +39,7 @@ tei += '    </fileDesc>\n'
tei += '  </teiHeader>\n'
tei += '  <text>\n'
tei += '    <body>\n'
-hocr = html.parse(args.file)
+hocr = html.parse(args.input_file)
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
ocr_page_title_attrib = ocr_page.attrib.get('title')
facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
@@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
        tei += '      <p>\n'
        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
            tei += '        <lb/>'
-            indent = ''
+            is_first_word_in_line = True
            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
                if ocrx_word.text is not None:
-                    tei += indent + escape(ocrx_word.text)
-                    indent = ' '
+                    if not is_first_word_in_line:
+                        tei += ' '
+                    tei += escape(ocrx_word.text)
+                    is_first_word_in_line = False
            tei += '\n'
        tei += '      </p>\n'
tei += '    </body>\n'
diff --git a/ocr b/ocr
index 5e13c5f..f8a31fe 100755
--- a/ocr
+++ b/ocr
@@ -8,7 +8,6 @@ __version__ = '0.1.0'
from argparse import ArgumentParser
from pyflow import WorkflowRunner
import json
-import multiprocessing
import os
import sys
@@ -52,8 +51,8 @@ class SplitInputWorkflow(WorkflowRunner):
cmd += ' -dQUIET'
cmd += ' -r300'
cmd += ' -sDEVICE=png16m'
- cmd += ' -sOutputFile="{}/page-%d.png"'.format(
- os.path.join(self.job.tmp_dir, 'images')
+ cmd += ' -sOutputFile="{}"'.format(
+ os.path.join(self.job.tmp_dir, 'images', 'page-%d.png')
)
cmd += ' "{}"'.format(self.job.file)
self.addTask(
@@ -82,13 +81,18 @@ class BinarizationWorkflow(WorkflowRunner):
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
)
cmd += ' && '
- cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa
+ cmd += 'ocropus-nlbin "@{}"'.format(
+ os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
+ )
cmd += ' --nocheck'
cmd += ' --output "{}"'.format(
- os.path.join(self.job.tmp_dir, 'images'))
+ os.path.join(self.job.tmp_dir, 'images')
+ )
cmd += ' --parallel "{}"'.format(n_cores)
cmd += ' && '
- cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa
+ cmd += 'rm "{}"'.format(
+ os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
+ )
ocropus_nlbin_task = self.addTask(
'ocropus_nlbin',
command=cmd,
@@ -130,9 +134,9 @@ class BinarizationWorkflow(WorkflowRunner):
class OCRWorkflow(WorkflowRunner):
- def __init__(self, job, lang):
+ def __init__(self, job, model):
self.job = job
- self.lang = lang
+ self.model = model
def workflow(self):
'''
@@ -148,7 +152,7 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(self.job.tmp_dir, 'images', file),
os.path.join(self.job.tmp_dir, file[:-4])
)
- cmd += ' -l "{}"'.format(self.lang)
+ cmd += ' -l "{}"'.format(self.model)
cmd += ' hocr pdf txt'
cmd += ' || '
cmd += 'echo "${?}"'
@@ -166,6 +170,7 @@ class OCRWorkflow(WorkflowRunner):
' # move_files #
' ##################################################
'''
+ move_files_tasks = []
n_cores = 1
mem_mb = min(128, self.getMemMb())
for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
@@ -174,24 +179,26 @@ class OCRWorkflow(WorkflowRunner):
file_extension,
os.path.join(self.job.tmp_dir, file_extension)
)
- self.addTask(
+ task = self.addTask(
'move_{}_files'.format(file_extension),
command=cmd,
dependencies=tesseract_tasks,
memMb=mem_mb,
nCores=n_cores
)
+ move_files_tasks.append(task)
cmd = 'mv "{}" "{}"'.format(
os.path.join(self.job.tmp_dir, 'images'),
os.path.join(self.job.output_dir)
)
- self.addTask(
+ task = self.addTask(
'move_image_files',
command=cmd,
dependencies=tesseract_tasks,
memMb=mem_mb,
nCores=n_cores
)
+ move_files_tasks.append(task)
class CreateHOCRWorkflow(WorkflowRunner):
@@ -256,13 +263,14 @@ class CreateHOCRWorkflow(WorkflowRunner):
' ##################################################
'''
n_cores = 1
- mem_mb = min(512, self.getMemMb())
+ mem_mb = min(256, self.getMemMb())
cmd = 'ls -dv "{}/"* > "{}"'.format(
os.path.join(self.job.tmp_dir, 'hocr'),
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
)
cmd += ' && '
- cmd += 'hocr-combine "@{}"'.format(
+ cmd += 'hocr-combine'
+ cmd += ' --input-file "@{}"'.format(
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
)
cmd += ' --output-file "{}.hocr"'.format(
@@ -301,12 +309,17 @@ class CreatePDFWorkflow(WorkflowRunner):
cmd += ' -dPDFSETTINGS=/ebook'
cmd += ' -dQUIET'
cmd += ' -sDEVICE=pdfwrite'
- cmd += ' -sOutputFile="{}.pdf"'.format(
- os.path.join(self.job.output_dir, self.job.name)
+ cmd += ' -sOutputFile="{}"'.format(
+ os.path.join(self.job.output_dir, '{}.pdf'.format(self.job.name))
)
cmd += ' && '
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
- self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
+ self.addTask(
+ 'pdf_combine',
+ command=cmd,
+ memMb=mem_mb,
+ nCores=n_cores
+ )
class CreateTEIWorkflow(WorkflowRunner):
@@ -320,14 +333,23 @@ class CreateTEIWorkflow(WorkflowRunner):
' ##################################################
'''
n_cores = 1
- mem_mb = min(512, self.getMemMb())
- cmd = 'hocr2tei "{}.hocr"'.format(
- os.path.join(self.job.output_dir, self.job.name)
+ mem_mb = min(256, self.getMemMb())
+ cmd = 'hocr2tei'
+ cmd += ' --input-file "{}"'.format(
+ os.path.join(self.job.output_dir, '{}.hocr'.format(self.job.name))
)
- cmd += ' --output-file "{}.tei.xml"'.format(
- os.path.join(self.job.output_dir, self.job.name)
+ cmd += ' --output-file "{}"'.format(
+ os.path.join(
+ self.job.output_dir,
+ '{}.tei.xml'.format(self.job.name)
+ )
+ )
+ self.addTask(
+ 'hocr2tei',
+ command=cmd,
+ memMb=mem_mb,
+ nCores=n_cores
)
- self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
class CreatePoCoZipWorkflow(WorkflowRunner):
@@ -354,7 +376,12 @@ class CreatePoCoZipWorkflow(WorkflowRunner):
cmd += 'rm -r images'
cmd += ' && '
cmd += 'cd -'
- task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores)
+ task = self.addTask(
+ 'zip',
+ command=cmd,
+ memMb=mem_mb,
+ nCores=n_cores
+ )
zip_tasks.append(task)
@@ -377,13 +404,18 @@ class CreateTxtWorkflow(WorkflowRunner):
cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name)) # noqa
cmd += ' && '
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
- self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
+ self.addTask(
+ 'txt_combine',
+ command=cmd,
+ memMb=mem_mb,
+ nCores=n_cores
+ )
class MainWorkflow(WorkflowRunner):
- def __init__(self, input_dir, lang, output_dir, binarize):
+ def __init__(self, input_dir, model, output_dir, binarize):
self.input_dir = input_dir
- self.lang = lang
+ self.model = model
self.output_dir = output_dir
self.binarize = binarize
self.jobs = []
@@ -419,11 +451,13 @@ class MainWorkflow(WorkflowRunner):
' # split-input #
' ##################################################
'''
+ split_input_tasks = []
for i, job in enumerate(self.jobs):
- self.addWorkflowTask(
+ task = self.addWorkflowTask(
'split_input_-_{}'.format(i),
SplitInputWorkflow(job)
)
+ split_input_tasks.append(task)
if self.binarize:
'''
@@ -431,12 +465,14 @@ class MainWorkflow(WorkflowRunner):
' # binarization #
' ##################################################
'''
+ binarization_tasks = []
for i, job in enumerate(self.jobs):
- self.addWorkflowTask(
+ task = self.addWorkflowTask(
'binarization_-_{}'.format(i),
BinarizationWorkflow(job),
dependencies='split_input_-_{}'.format(i)
)
+ binarization_tasks.append(task)
'''
' ##################################################
@@ -451,7 +487,7 @@ class MainWorkflow(WorkflowRunner):
deps = 'split_input_-_{}'.format(i)
task = self.addWorkflowTask(
'ocr_-_{}'.format(i),
- OCRWorkflow(job, self.lang),
+ OCRWorkflow(job, self.model),
dependencies=deps
)
ocr_tasks.append(task)
@@ -527,55 +563,80 @@ class MainWorkflow(WorkflowRunner):
create_txt_tasks.append(task)
self.waitForTasks()
- output_files = []
+ outputs = []
for job in self.jobs:
# Remove temporary directory
os.rmdir(job.tmp_dir)
# Track output files
- relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa
- output_files.append(
+ relative_output_dir = os.path.relpath(
+ job.output_dir,
+ start=self.output_dir
+ )
+ outputs.append(
{
'description': 'Post correction package (.png and .hocr).',
- 'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa
+ 'file': os.path.join(
+ relative_output_dir,
+ '{}.poco.zip'.format(job.name)
+ ),
'mimetype': 'application/zip'
}
)
- output_files.append(
+ outputs.append(
{
'description': 'PDF file with text layer.',
- 'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
+ 'file': os.path.join(
+ relative_output_dir,
+ '{}.pdf'.format(job.name)
+ ),
'mimetype': 'application/pdf'
}
)
- output_files.append(
+ outputs.append(
{
'description': 'Plain text file.',
- 'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
+ 'file': os.path.join(
+ relative_output_dir,
+ '{}.txt'.format(job.name)
+ ),
'mimetype': 'text/plain'
}
)
- output_files.append(
+ outputs.append(
{
'description': 'TEI compliant XML file.',
- 'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa
+ 'file': os.path.join(
+ relative_output_dir,
+ '{}.tei.xml'.format(job.name)
+ ),
'mimetype': 'application/tei+xml'
}
)
- with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa
- json.dump(output_files, f, indent=4)
+ with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
+ json.dump(outputs, f, indent=4)
def parse_args():
- parser = ArgumentParser(description='Pipeline for PDF file OCR processing')
+ parser = ArgumentParser(
+ description='Pipeline for PDF file OCR processing'
+ )
parser.add_argument(
- '-i', '--input-dir', help='Input directory', required=True)
+ '-i', '--input-dir',
+ help='Input directory',
+ required=True
+ )
parser.add_argument(
- '-o', '--output-dir', help='Output directory', required=True)
+ '-o', '--output-dir',
+ help='Output directory',
+ required=True
+ )
parser.add_argument(
- '-l', '--language',
- choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata')
- if x.endswith('.traineddata') and len(x) > 12],
- help='Language of the input (3-character ISO 639-2 language codes)',
+ '-m', '--model',
+ choices=[
+ x[:-12] for x in os.listdir('/usr/local/share/tessdata')
+ if x.endswith('.traineddata') and len(x) > 12
+ ],
+ help='Name of the model to be used',
required=True
)
parser.add_argument(
@@ -584,16 +645,19 @@ def parse_args():
help='Add binarization as a preprocessing step'
)
parser.add_argument(
- '--log-dir', help='Logging directory (Default: --output-dir)')
+ '--log-dir',
+ help='Logging directory (Default: --output-dir)'
+ )
parser.add_argument(
'--mem-mb',
- help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))', # noqa
+ help='Amount of system memory to be used '
+ '(Default: min(--n-cores * 512, available system memory))',
type=int
)
parser.add_argument(
'--n-cores',
- default=min(4, multiprocessing.cpu_count()),
- help='Number of CPU threads to be used (Default: min(4, CPU count))',
+ default=1,
+ help='Number of CPU threads to be used',
type=int
)
parser.add_argument(
@@ -620,10 +684,17 @@ def parse_args():
def main():
args = parse_args()
main_workflow = MainWorkflow(
- args.input_dir, args.language, args.output_dir, args.binarize)
+ args.input_dir,
+ args.model,
+ args.output_dir,
+ args.binarize
+ )
main_workflow.collect_jobs()
retval = main_workflow.run(
- dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
+ dataDirRoot=args.log_dir,
+ memMb=args.mem_mb,
+ nCores=args.n_cores
+ )
sys.exit(retval)
diff --git a/wrapper/ocr b/wrapper/ocr
index 58a0bca..d36b129 100755
--- a/wrapper/ocr
+++ b/wrapper/ocr
@@ -17,7 +17,7 @@ GID = str(os.getgid())
parser = ArgumentParser(add_help=False)
parser.add_argument('-i', '--input-dir')
parser.add_argument('-o', '--output-dir')
-parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+')
+parser.add_argument('-t', '--model-file', action='extend', nargs='+')
parser.add_argument('--log-dir')
args, remaining_args = parser.parse_known_args()
@@ -30,9 +30,9 @@ if args.output_dir is not None:
mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
cmd += ['-v', mapping]
remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
-if args.models is not None:
- for model in args.models:
- mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}' # noqa
+if args.model_file is not None:
+ for model_file in args.model_file:
+ mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}' # noqa
cmd += ['-v', mapping]
if args.log_dir is not None:
mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'