From 4518ca1c8308a1a3f7e3ce1f68be8a739fc7bd0c Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Thu, 27 Jan 2022 13:40:23 +0100
Subject: [PATCH] Codestyle enhancements

---
 README.md    |  22 ++++++-
 hocr-combine |  36 +++++++----
 hocr2tei     |  24 +++++--
 ocr          | 179 +++++++++++++++++++++++++++++++++++----------------
 wrapper/ocr  |   8 +--
 5 files changed, 187 insertions(+), 82 deletions(-)

diff --git a/README.md b/README.md
index 98834e5..a9ec050 100644
--- a/README.md
+++ b/README.md
@@ -26,8 +26,24 @@ This software implements a heavily parallelized pipeline to recognize text in PD
 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
 ```bash
 cd /<my_data_location>
-ocr -i input -o output -m models/<model> -l <lang>
-# or
-ocr -i input -o output -m models/* -l <lang>
+# <model> is the model filename without the ".traineddata" suffix
+ocr \
+  --input-dir input \
+  --output-dir output \
+  --model-file models/<model>.traineddata \
+  -m <model>
+# More than one model
+ocr \
+  --input-dir input \
+  --output-dir output \
+  --model-file models/<model1>.traineddata \
+  --model-file models/<model2>.traineddata \
+  -m <model1>
+# Instead of multiple --model-file statements, you can also use
+ocr \
+  --input-dir input \
+  --output-dir output \
+  --model-file models/* \
+  -m <model1>
 ```
 4. Check your results in the `/<my_data_location>/output` directory.
diff --git a/hocr-combine b/hocr-combine
index 4008890..d4b606c 100755
--- a/hocr-combine
+++ b/hocr-combine
@@ -1,33 +1,41 @@
 #!/usr/bin/env python3.7
 # coding=utf-8
 
-""""Combine multiple hOCR files."""
+''' Combine multiple hOCR files. '''
 
 from argparse import ArgumentParser
 from lxml import html
 
 
 parser = ArgumentParser(description='Combine multiple hOCR files.')
-parser.add_argument('file', help='Input file(s)', nargs='+')
-parser.add_argument('-o', '--output-file', help='Output file', required=True)
+parser.add_argument(
+    '-i', '--input-file',
+    help='Input file',
+    nargs='+',
+    required=True
+)
+parser.add_argument(
+    '-o', '--output-file',
+    help='Output file',
+    required=True
+)
 args = parser.parse_args()
 
-
-for file in args.file:
-    files = []
-    if file.startswith('@'):
-        with open(file[1:], 'r') as f:
-            files += [x for x in f.read().split("\n") if x != '']
+input_files = []
+for input_file in args.input_file:
+    if input_file.startswith('@'):
+        with open(input_file[1:], 'r') as f:
+            input_files += [x for x in f.read().split("\n") if x != '']
     else:
-        files.append(file)
-if len(files) == 0:
+        input_files.append(input_file)
+if len(input_files) == 0:
     exit(1)
 
-hocr = html.parse(files[0])
+hocr = html.parse(input_files[0])
 hocr_body = hocr.find('body')
-for file in files[1:]:
-    for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
+for input_file in input_files[1:]:
+    for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
         hocr_body.append(ocr_page)
diff --git a/hocr2tei b/hocr2tei
index 04a3db7..0350efb 100755
--- a/hocr2tei
+++ b/hocr2tei
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3.7
 # coding=utf-8
 
-""""Convert hOCR to TEI XML."""
+''' Convert hOCR to TEI XML. '''
 
 from argparse import ArgumentParser
 from lxml import html
@@ -10,8 +10,16 @@
 import re
 
 parser = ArgumentParser(description='Convert hOCR to TEI XML.')
-parser.add_argument('file', help='Input file')
-parser.add_argument('-o', '--output-file', help='Output file', required=True)
+parser.add_argument(
+    '-i', '--input-file',
+    help='Input file',
+    required=True
+)
+parser.add_argument(
+    '-o', '--output-file',
+    help='Output file',
+    required=True
+)
 
 args = parser.parse_args()
 
@@ -32,7 +39,7 @@
 tei += '  </teiHeader>\n'
 tei += '  <text>\n'
 tei += '    <body>\n'
-hocr = html.parse(args.file)
+hocr = html.parse(args.input_file)
 for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
     ocr_page_title_attrib = ocr_page.attrib.get('title')
     facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
@@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
         tei += '      <p>\n'
         for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
             tei += '        <lb/>'
-            indent = ''
+            is_first_word_in_line = True
             for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
                 if ocrx_word.text is not None:
-                    tei += indent + escape(ocrx_word.text)
-                    indent = ' '
+                    if not is_first_word_in_line:
+                        tei += ' '
+                    tei += escape(ocrx_word.text)
+                    is_first_word_in_line = False
             tei += '\n'
         tei += '      </p>\n'
 tei += '    </body>\n'
 tei += '  </text>\n'
diff --git a/ocr b/ocr
index 5e13c5f..f8a31fe 100755
--- a/ocr
+++ b/ocr
@@ -8,7 +8,6 @@ __version__ = '0.1.0'
 
 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
 import json
-import multiprocessing
 import os
 import sys
@@ -52,8 +51,8 @@ class SplitInputWorkflow(WorkflowRunner):
         cmd += ' -dQUIET'
         cmd += ' -r300'
         cmd += ' -sDEVICE=png16m'
-        cmd += ' -sOutputFile="{}/page-%d.png"'.format(
-            os.path.join(self.job.tmp_dir, 'images')
+        cmd += ' -sOutputFile="{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images', 'page-%d.png')
         )
         cmd += ' "{}"'.format(self.job.file)
         self.addTask(
@@ -82,13 +81,18 @@ class BinarizationWorkflow(WorkflowRunner):
             os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
         )
         cmd += ' && '
-        cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa
+        cmd += 'ocropus-nlbin "@{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
+        )
         cmd += ' --nocheck'
-        cmd += ' --output "{}"'.format(
-            os.path.join(self.job.tmp_dir, 'images'))
+        cmd += ' --output "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images')
+        )
         cmd += ' --parallel "{}"'.format(n_cores)
         cmd += ' && '
-        cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa
+        cmd += 'rm "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
+        )
         ocropus_nlbin_task = self.addTask(
             'ocropus_nlbin',
             command=cmd,
@@ -130,9 +134,9 @@
 
 
 class OCRWorkflow(WorkflowRunner):
-    def __init__(self, job, lang):
+    def __init__(self, job, model):
         self.job = job
-        self.lang = lang
+        self.model = model
 
     def workflow(self):
         '''
@@ -148,7 +152,7 @@ class OCRWorkflow(WorkflowRunner):
                 os.path.join(self.job.tmp_dir, 'images', file),
                 os.path.join(self.job.tmp_dir, file[:-4])
             )
-            cmd += ' -l "{}"'.format(self.lang)
+            cmd += ' -l "{}"'.format(self.model)
             cmd += ' hocr pdf txt'
             cmd += ' || '
             cmd += 'echo "${?}"'
@@ -166,6 +170,7 @@ class OCRWorkflow(WorkflowRunner):
         ' # move_files #
         ' ##################################################
         '''
+        move_files_tasks = []
         n_cores = 1
         mem_mb = min(128, self.getMemMb())
         for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
@@ -172,26 +177,28 @@ class OCRWorkflow(WorkflowRunner):
             cmd = 'mv "{}/"*.{} "{}"'.format(
                 self.job.tmp_dir,
                 file_extension,
                 os.path.join(self.job.tmp_dir, file_extension)
             )
-            self.addTask(
+            task = self.addTask(
                 'move_{}_files'.format(file_extension),
                 command=cmd,
                 dependencies=tesseract_tasks,
                 memMb=mem_mb,
                 nCores=n_cores
             )
+            move_files_tasks.append(task)
         cmd = 'mv "{}" "{}"'.format(
             os.path.join(self.job.tmp_dir, 'images'),
             os.path.join(self.job.output_dir)
         )
-        self.addTask(
+        task = self.addTask(
             'move_image_files',
             command=cmd,
             dependencies=tesseract_tasks,
             memMb=mem_mb,
             nCores=n_cores
         )
+        move_files_tasks.append(task)
 
 
 class CreateHOCRWorkflow(WorkflowRunner):
@@ -256,13 +263,14 @@ class CreateHOCRWorkflow(WorkflowRunner):
         ' ##################################################
         '''
         n_cores = 1
-        mem_mb = min(512, self.getMemMb())
+        mem_mb = min(256, self.getMemMb())
         cmd = 'ls -dv "{}/"* > "{}"'.format(
             os.path.join(self.job.tmp_dir, 'hocr'),
             os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
         )
         cmd += ' && '
-        cmd += 'hocr-combine "@{}"'.format(
+        cmd += 'hocr-combine'
+        cmd += ' --input-file "@{}"'.format(
             os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
         )
         cmd += ' --output-file "{}.hocr"'.format(
@@ -301,12 +309,17 @@ class CreatePDFWorkflow(WorkflowRunner):
         cmd += ' -dPDFSETTINGS=/ebook'
         cmd += ' -dQUIET'
         cmd += ' -sDEVICE=pdfwrite'
-        cmd += ' -sOutputFile="{}.pdf"'.format(
-            os.path.join(self.job.output_dir, self.job.name)
+        cmd += ' -sOutputFile="{}"'.format(
+            os.path.join(self.job.output_dir, '{}.pdf'.format(self.job.name))
         )
         cmd += ' && '
         cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
-        self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
+        self.addTask(
+            'pdf_combine',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
 
 
 class CreateTEIWorkflow(WorkflowRunner):
@@ -320,14 +333,23 @@ class CreateTEIWorkflow(WorkflowRunner):
         ' ##################################################
         '''
         n_cores = 1
-        mem_mb = min(512, self.getMemMb())
-        cmd = 'hocr2tei "{}.hocr"'.format(
-            os.path.join(self.job.output_dir, self.job.name)
+        mem_mb = min(256, self.getMemMb())
+        cmd = 'hocr2tei'
+        cmd += ' --input-file "{}"'.format(
+            os.path.join(self.job.output_dir, '{}.hocr'.format(self.job.name))
         )
-        cmd += ' --output-file "{}.tei.xml"'.format(
-            os.path.join(self.job.output_dir, self.job.name)
+        cmd += ' --output-file "{}"'.format(
+            os.path.join(
+                self.job.output_dir,
+                '{}.tei.xml'.format(self.job.name)
+            )
+        )
+        self.addTask(
+            'hocr2tei',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
         )
-        self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
 
 
 class CreatePoCoZipWorkflow(WorkflowRunner):
@@ -354,7 +376,12 @@ class CreatePoCoZipWorkflow(WorkflowRunner):
         cmd += 'rm -r images'
         cmd += ' && '
         cmd += 'cd -'
-        task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores)
+        task = self.addTask(
+            'zip',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
         zip_tasks.append(task)
 
 
@@ -377,13 +404,18 @@ class CreateTxtWorkflow(WorkflowRunner):
         cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name))  # noqa
         cmd += ' && '
         cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
-        self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
+        self.addTask(
+            'txt_combine',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
 
 
 class MainWorkflow(WorkflowRunner):
-    def __init__(self, input_dir, lang, output_dir, binarize):
+    def __init__(self, input_dir, model, output_dir, binarize):
         self.input_dir = input_dir
-        self.lang = lang
+        self.model = model
         self.output_dir = output_dir
         self.binarize = binarize
         self.jobs = []
@@ -419,11 +451,13 @@ class MainWorkflow(WorkflowRunner):
         ' ##################################################
         ' # split-input #
         ' ##################################################
         '''
+        split_input_tasks = []
         for i, job in enumerate(self.jobs):
-            self.addWorkflowTask(
+            task = self.addWorkflowTask(
                 'split_input_-_{}'.format(i),
                 SplitInputWorkflow(job)
             )
+            split_input_tasks.append(task)
 
         if self.binarize:
@@ -431,12 +465,14 @@ class MainWorkflow(WorkflowRunner):
             ' # binarization #
             ' ##################################################
             '''
+            binarization_tasks = []
            for i, job in enumerate(self.jobs):
-                self.addWorkflowTask(
+                task = self.addWorkflowTask(
                     'binarization_-_{}'.format(i),
                     BinarizationWorkflow(job),
                     dependencies='split_input_-_{}'.format(i)
                 )
+                binarization_tasks.append(task)
 
         '''
         ' ##################################################
@@ -451,7 +487,7 @@ class MainWorkflow(WorkflowRunner):
             deps = 'split_input_-_{}'.format(i)
             task = self.addWorkflowTask(
                 'ocr_-_{}'.format(i),
-                OCRWorkflow(job, self.lang),
+                OCRWorkflow(job, self.model),
                 dependencies=deps
             )
             ocr_tasks.append(task)
@@ -527,55 +563,80 @@ class MainWorkflow(WorkflowRunner):
             create_txt_tasks.append(task)
 
         self.waitForTasks()
-        output_files = []
+        outputs = []
         for job in self.jobs:
             # Remove temporary directory
             os.rmdir(job.tmp_dir)
             # Track output files
-            relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir)  # noqa
-            output_files.append(
+            relative_output_dir = os.path.relpath(
+                job.output_dir,
+                start=self.output_dir
+            )
+            outputs.append(
                 {
                     'description': 'Post correction package (.png and .hocr).',
-                    'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)),  # noqa
+                    'file': os.path.join(
+                        relative_output_dir,
+                        '{}.poco.zip'.format(job.name)
+                    ),
                     'mimetype': 'application/zip'
                 }
             )
-            output_files.append(
+            outputs.append(
                 {
                     'description': 'PDF file with text layer.',
-                    'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)),  # noqa
+                    'file': os.path.join(
+                        relative_output_dir,
+                        '{}.pdf'.format(job.name)
+                    ),
                     'mimetype': 'application/pdf'
                 }
             )
-            output_files.append(
+            outputs.append(
                 {
                     'description': 'Plain text file.',
-                    'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)),  # noqa
+                    'file': os.path.join(
+                        relative_output_dir,
+                        '{}.txt'.format(job.name)
+                    ),
                     'mimetype': 'text/plain'
                 }
             )
-            output_files.append(
+            outputs.append(
                 {
                     'description': 'TEI compliant XML file.',
-                    'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)),  # noqa
+                    'file': os.path.join(
+                        relative_output_dir,
+                        '{}.tei.xml'.format(job.name)
+                    ),
                     'mimetype': 'application/tei+xml'
                 }
             )
-        with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f:  # noqa
-            json.dump(output_files, f, indent=4)
+        with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
+            json.dump(outputs, f, indent=4)
 
 
 def parse_args():
-    parser = ArgumentParser(description='Pipeline for PDF file OCR processing')
+    parser = ArgumentParser(
+        description='Pipeline for PDF file OCR processing'
+    )
     parser.add_argument(
-        '-i', '--input-dir', help='Input directory', required=True)
+        '-i', '--input-dir',
+        help='Input directory',
+        required=True
+    )
     parser.add_argument(
-        '-o', '--output-dir', help='Output directory', required=True)
+        '-o', '--output-dir',
+        help='Output directory',
+        required=True
+    )
     parser.add_argument(
-        '-l', '--language',
-        choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata')
-                 if x.endswith('.traineddata') and len(x) > 12],
-        help='Language of the input (3-character ISO 639-2 language codes)',
+        '-m', '--model',
+        choices=[
+            x[:-12] for x in os.listdir('/usr/local/share/tessdata')
+            if x.endswith('.traineddata') and len(x) > 12
+        ],
+        help='Name of the model to be used',
         required=True
     )
     parser.add_argument(
@@ -584,16 +645,19 @@ def parse_args():
         '-b', '--binarize',
         action='store_true',
         help='Add binarization as a preprocessing step'
     )
     parser.add_argument(
-        '--log-dir', help='Logging directory (Default: --output-dir)')
+        '--log-dir',
+        help='Logging directory (Default: --output-dir)'
+    )
     parser.add_argument(
         '--mem-mb',
-        help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))',  # noqa
+        help='Amount of system memory to be used '
+             '(Default: min(--n-cores * 512, available system memory))',
         type=int
     )
     parser.add_argument(
         '--n-cores',
-        default=min(4, multiprocessing.cpu_count()),
-        help='Number of CPU threads to be used (Default: min(4, CPU count))',
+        default=1,
+        help='Number of CPU threads to be used',
         type=int
     )
     parser.add_argument(
@@ -620,10 +684,17 @@ def parse_args():
 
 
 def main():
     args = parse_args()
     main_workflow = MainWorkflow(
-        args.input_dir, args.language, args.output_dir, args.binarize)
+        args.input_dir,
+        args.model,
+        args.output_dir,
+        args.binarize
+    )
     main_workflow.collect_jobs()
     retval = main_workflow.run(
-        dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
+        dataDirRoot=args.log_dir,
+        memMb=args.mem_mb,
+        nCores=args.n_cores
+    )
     sys.exit(retval)
diff --git a/wrapper/ocr b/wrapper/ocr
index 58a0bca..d36b129 100755
--- a/wrapper/ocr
+++ b/wrapper/ocr
@@ -17,7 +17,7 @@ GID = str(os.getgid())
 parser = ArgumentParser(add_help=False)
 parser.add_argument('-i', '--input-dir')
 parser.add_argument('-o', '--output-dir')
-parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+')
+parser.add_argument('-t', '--model-file', action='extend', nargs='+')
 parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()
@@ -30,9 +30,9 @@ if args.output_dir is not None:
     mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
-if args.models is not None:
-    for model in args.models:
-        mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}'  # noqa
+if args.model_file is not None:
+    for model_file in args.model_file:
+        mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}'  # noqa
         cmd += ['-v', mapping]
 if args.log_dir is not None:
     mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'