Codestyle enhancements

2026-06-19 00:35:45 +00:00 · 2022-01-27 13:40:23 +01:00
parent aeab9b7802
commit 4518ca1c83
5 changed files with 187 additions and 82 deletions
@@ -26,8 +26,24 @@ This software implements a heavily parallelized pipeline to recognize text in PD
 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
 ```bash
 cd /<my_data_location>
-ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments>
-# or
-ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments>
+# <model_code> is the model filename without the ".traineddata" suffix
+ocr \
+  --input-dir input \
+  --output-dir output \
+  --model-file models/<model> \
+  -m <model_code> <optional_pipeline_arguments>
+# More than one model
+ocr \
+  --input-dir input \
+  --output-dir output \
+  --model-file models/<model1> \
+  --model-file models/<model2> \
+  -m <model1_code>+<model2_code> <optional_pipeline_arguments>
+# Instead of multiple --model-file statements, you can also use
+ocr \
+  --input-dir input \
+  --output-dir output \
+  --model-file models/* \
+  -m <model1_code>+<model2_code> <optional_pipeline_arguments>
 ```
 4. Check your results in the `/<my_data_location>/output` directory.
@@ -1,33 +1,42 @@
 #!/usr/bin/env python3.7
 # coding=utf-8

-""""Combine multiple hOCR files."""
+''' Combine multiple hOCR files. '''

 from argparse import ArgumentParser
 from lxml import html


 parser = ArgumentParser(description='Combine multiple hOCR files.')
-parser.add_argument('file', help='Input file(s)', nargs='+')
-parser.add_argument('-o', '--output-file', help='Output file', required=True)
+parser.add_argument(
+    '-i', '--input-file',
+    help='Input file',
+    nargs='+',
+    required=True
+)
+parser.add_argument(
+    '-o', '--output-file',
+    help='Output file',
+    required=True
+)
 args = parser.parse_args()
+print(args)

-
-for file in args.file:
-    files = []
-    if file.startswith('@'):
-        with open(file[1:], 'r') as f:
-            files += [x for x in f.read().split("\n") if x != '']
+for input_file in args.input_file:
+    input_files = []
+    if input_file.startswith('@'):
+        with open(input_file[1:], 'r') as f:
+            input_files += [x for x in f.read().split("\n") if x != '']
    else:
-        files.append(file)
-if len(files) == 0:
+        input_files.append(input_file)
+if len(input_files) == 0:
    exit(1)


-hocr = html.parse(files[0])
+hocr = html.parse(input_files[0])
 hocr_body = hocr.find('body')
-for file in files[1:]:
-    for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
+for input_file in input_files[1:]:
+    for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
        hocr_body.append(ocr_page)


@@ -1,7 +1,7 @@
 #!/usr/bin/env python3.7
 # coding=utf-8

-""""Convert hOCR to TEI XML."""
+''' Convert hOCR to TEI XML. '''

 from argparse import ArgumentParser
 from lxml import html
@@ -10,8 +10,15 @@ import re


 parser = ArgumentParser(description='Convert hOCR to TEI XML.')
-parser.add_argument('file', help='Input file')
-parser.add_argument('-o', '--output-file', help='Output file', required=True)
+parser.add_argument(
+    '-i', '--input-file',
+    help='Input file'
+)
+parser.add_argument(
+    '-o', '--output-file',
+    help='Output file',
+    required=True
+)
 args = parser.parse_args()


@@ -32,7 +39,7 @@ tei += '    </fileDesc>\n'
 tei += '  </teiHeader>\n'
 tei += '  <text>\n'
 tei += '    <body>\n'
-hocr = html.parse(args.file)
+hocr = html.parse(args.input_file)
 for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
    ocr_page_title_attrib = ocr_page.attrib.get('title')
    facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
@@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
        tei += '      <p>\n'
        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
            tei += '        <lb/>'
-            indent = ''
+            is_first_word_in_line = True
            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
                if ocrx_word.text is not None:
-                    tei += indent + escape(ocrx_word.text)
-                    indent = ' '
+                    if not is_first_word_in_line:
+                        tei += ' '
+                    tei += escape(ocrx_word.text)
+                    is_first_word_in_line = False
            tei += '\n'
        tei += '      </p>\n'
 tei += '    </body>\n'
@@ -8,7 +8,6 @@ __version__ = '0.1.0'
 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
 import json
-import multiprocessing
 import os
 import sys

@@ -52,8 +51,8 @@ class SplitInputWorkflow(WorkflowRunner):
        cmd += ' -dQUIET'
        cmd += ' -r300'
        cmd += ' -sDEVICE=png16m'
-        cmd += ' -sOutputFile="{}/page-%d.png"'.format(
-            os.path.join(self.job.tmp_dir, 'images')
+        cmd += ' -sOutputFile="{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images', 'page-%d.png')
        )
        cmd += ' "{}"'.format(self.job.file)
        self.addTask(
@@ -82,13 +81,18 @@ class BinarizationWorkflow(WorkflowRunner):
            os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
        )
        cmd += ' && '
-        cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa
+        cmd += 'ocropus-nlbin "@{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
+        )
        cmd += ' --nocheck'
        cmd += ' --output "{}"'.format(
-            os.path.join(self.job.tmp_dir, 'images'))
+            os.path.join(self.job.tmp_dir, 'images')
+        )
        cmd += ' --parallel "{}"'.format(n_cores)
        cmd += ' && '
-        cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa
+        cmd += 'rm "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
+        )
        ocropus_nlbin_task = self.addTask(
            'ocropus_nlbin',
            command=cmd,
@@ -130,9 +134,9 @@ class BinarizationWorkflow(WorkflowRunner):


 class OCRWorkflow(WorkflowRunner):
-    def __init__(self, job, lang):
+    def __init__(self, job, model):
        self.job = job
-        self.lang = lang
+        self.model = model

    def workflow(self):
        '''
@@ -148,7 +152,7 @@ class OCRWorkflow(WorkflowRunner):
                os.path.join(self.job.tmp_dir, 'images', file),
                os.path.join(self.job.tmp_dir, file[:-4])
            )
-            cmd += ' -l "{}"'.format(self.lang)
+            cmd += ' -l "{}"'.format(self.model)
            cmd += ' hocr pdf txt'
            cmd += ' || '
            cmd += 'echo "${?}"'
@@ -166,6 +170,7 @@ class OCRWorkflow(WorkflowRunner):
        ' # move_files                                     #
        ' ##################################################
        '''
+        move_files_tasks = []
        n_cores = 1
        mem_mb = min(128, self.getMemMb())
        for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
@@ -174,24 +179,26 @@ class OCRWorkflow(WorkflowRunner):
                file_extension,
                os.path.join(self.job.tmp_dir, file_extension)
            )
-            self.addTask(
+            task = self.addTask(
                'move_{}_files'.format(file_extension),
                command=cmd,
                dependencies=tesseract_tasks,
                memMb=mem_mb,
                nCores=n_cores
            )
+            move_files_tasks.append(task)
        cmd = 'mv "{}" "{}"'.format(
            os.path.join(self.job.tmp_dir, 'images'),
            os.path.join(self.job.output_dir)
        )
-        self.addTask(
+        task = self.addTask(
            'move_image_files',
            command=cmd,
            dependencies=tesseract_tasks,
            memMb=mem_mb,
            nCores=n_cores
        )
+        move_files_tasks.append(task)


 class CreateHOCRWorkflow(WorkflowRunner):
@@ -256,13 +263,14 @@ class CreateHOCRWorkflow(WorkflowRunner):
        ' ##################################################
        '''
        n_cores = 1
-        mem_mb = min(512, self.getMemMb())
+        mem_mb = min(256, self.getMemMb())
        cmd = 'ls -dv "{}/"* > "{}"'.format(
            os.path.join(self.job.tmp_dir, 'hocr'),
            os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
        )
        cmd += ' && '
-        cmd += 'hocr-combine "@{}"'.format(
+        cmd += 'hocr-combine'
+        cmd += ' --input-file "@{}"'.format(
            os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
        )
        cmd += ' --output-file "{}.hocr"'.format(
@@ -301,12 +309,17 @@ class CreatePDFWorkflow(WorkflowRunner):
        cmd += ' -dPDFSETTINGS=/ebook'
        cmd += ' -dQUIET'
        cmd += ' -sDEVICE=pdfwrite'
-        cmd += ' -sOutputFile="{}.pdf"'.format(
-            os.path.join(self.job.output_dir, self.job.name)
+        cmd += ' -sOutputFile="{}"'.format(
+            os.path.join(self.job.output_dir, '{}.pdf'.format(self.job.name))
        )
        cmd += ' && '
        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
-        self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
+        self.addTask(
+            'pdf_combine',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )


 class CreateTEIWorkflow(WorkflowRunner):
@@ -320,14 +333,23 @@ class CreateTEIWorkflow(WorkflowRunner):
        ' ##################################################
        '''
        n_cores = 1
-        mem_mb = min(512, self.getMemMb())
-        cmd = 'hocr2tei "{}.hocr"'.format(
-            os.path.join(self.job.output_dir, self.job.name)
+        mem_mb = min(256, self.getMemMb())
+        cmd = 'hocr2tei'
+        cmd += ' --input-file "{}"'.format(
+            os.path.join(self.job.output_dir, '{}.hocr'.format(self.job.name))
        )
-        cmd += ' --output-file "{}.tei.xml"'.format(
-            os.path.join(self.job.output_dir, self.job.name)
+        cmd += ' --output-file "{}"'.format(
+            os.path.join(
+                self.job.output_dir,
+                '{}.tei.xml'.format(self.job.name)
+            )
+        )
+        self.addTask(
+            'hocr2tei',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
        )
-        self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)


 class CreatePoCoZipWorkflow(WorkflowRunner):
@@ -354,7 +376,12 @@ class CreatePoCoZipWorkflow(WorkflowRunner):
        cmd += 'rm -r images'
        cmd += ' && '
        cmd += 'cd -'
-        task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores)
+        task = self.addTask(
+            'zip',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
        zip_tasks.append(task)


@@ -377,13 +404,18 @@ class CreateTxtWorkflow(WorkflowRunner):
        cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name))  # noqa
        cmd += ' && '
        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
-        self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
+        self.addTask(
+            'txt_combine',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )


 class MainWorkflow(WorkflowRunner):
-    def __init__(self, input_dir, lang, output_dir, binarize):
+    def __init__(self, input_dir, model, output_dir, binarize):
        self.input_dir = input_dir
-        self.lang = lang
+        self.model = model
        self.output_dir = output_dir
        self.binarize = binarize
        self.jobs = []
@@ -419,11 +451,13 @@ class MainWorkflow(WorkflowRunner):
        ' # split-input                                    #
        ' ##################################################
        '''
+        split_input_tasks = []
        for i, job in enumerate(self.jobs):
-            self.addWorkflowTask(
+            task = self.addWorkflowTask(
                'split_input_-_{}'.format(i),
                SplitInputWorkflow(job)
            )
+            split_input_tasks.append(task)

        if self.binarize:
            '''
@@ -431,12 +465,14 @@ class MainWorkflow(WorkflowRunner):
            ' # binarization                                   #
            ' ##################################################
            '''
+            binarization_tasks = []
            for i, job in enumerate(self.jobs):
-                self.addWorkflowTask(
+                task = self.addWorkflowTask(
                    'binarization_-_{}'.format(i),
                    BinarizationWorkflow(job),
                    dependencies='split_input_-_{}'.format(i)
                )
+                binarization_tasks.append(task)

        '''
        ' ##################################################
@@ -451,7 +487,7 @@ class MainWorkflow(WorkflowRunner):
                deps = 'split_input_-_{}'.format(i)
            task = self.addWorkflowTask(
                'ocr_-_{}'.format(i),
-                OCRWorkflow(job, self.lang),
+                OCRWorkflow(job, self.model),
                dependencies=deps
            )
            ocr_tasks.append(task)
@@ -527,55 +563,80 @@ class MainWorkflow(WorkflowRunner):
            create_txt_tasks.append(task)

        self.waitForTasks()
-        output_files = []
+        outputs = []
        for job in self.jobs:
            # Remove temporary directory
            os.rmdir(job.tmp_dir)
            # Track output files
-            relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir)  # noqa
-            output_files.append(
+            relative_output_dir = os.path.relpath(
+                job.output_dir,
+                start=self.output_dir
+            )
+            outputs.append(
                {
                    'description': 'Post correction package (.png and .hocr).',
-                    'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)),  # noqa
+                    'file': os.path.join(
+                        relative_output_dir,
+                        '{}.poco.zip'.format(job.name)
+                    ),
                    'mimetype': 'application/zip'
                }
            )
-            output_files.append(
+            outputs.append(
                {
                    'description': 'PDF file with text layer.',
-                    'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)),  # noqa
+                    'file': os.path.join(
+                        relative_output_dir,
+                        '{}.pdf'.format(job.name)
+                    ),
                    'mimetype': 'application/pdf'
                }
            )
-            output_files.append(
+            outputs.append(
                {
                    'description': 'Plain text file.',
-                    'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)),  # noqa
+                    'file': os.path.join(
+                        relative_output_dir,
+                        '{}.txt'.format(job.name)
+                    ),
                    'mimetype': 'text/plain'
                }
            )
-            output_files.append(
+            outputs.append(
                {
                    'description': 'TEI compliant XML file.',
-                    'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)),  # noqa
+                    'file': os.path.join(
+                        relative_output_dir,
+                        '{}.tei.xml'.format(job.name)
+                    ),
                    'mimetype': 'application/tei+xml'
                }
            )
-        with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f:  # noqa
-            json.dump(output_files, f, indent=4)
+        with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
+            json.dump(outputs, f, indent=4)


 def parse_args():
-    parser = ArgumentParser(description='Pipeline for PDF file OCR processing')
+    parser = ArgumentParser(
+        description='Pipeline for PDF file OCR processing'
+    )
    parser.add_argument(
-        '-i', '--input-dir', help='Input directory', required=True)
+        '-i', '--input-dir',
+        help='Input directory',
+        required=True
+    )
    parser.add_argument(
-        '-o', '--output-dir', help='Output directory', required=True)
+        '-o', '--output-dir',
+        help='Output directory',
+        required=True
+    )
    parser.add_argument(
-        '-l', '--language',
-        choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata')
-                 if x.endswith('.traineddata') and len(x) > 12],
-        help='Language of the input (3-character ISO 639-2 language codes)',
+        '-m', '--model',
+        choices=[
+            x[:-12] for x in os.listdir('/usr/local/share/tessdata')
+            if x.endswith('.traineddata') and len(x) > 12
+        ],
+        help='Name of the model to be used',
        required=True
    )
    parser.add_argument(
@@ -584,16 +645,19 @@ def parse_args():
        help='Add binarization as a preprocessing step'
    )
    parser.add_argument(
-        '--log-dir', help='Logging directory (Default: --output-dir)')
+        '--log-dir',
+        help='Logging directory (Default: --output-dir)'
+    )
    parser.add_argument(
        '--mem-mb',
-        help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))',  # noqa
+        help='Amount of system memory to be used '
+             '(Default: min(--n-cores * 512, available system memory))',
        type=int
    )
    parser.add_argument(
        '--n-cores',
-        default=min(4, multiprocessing.cpu_count()),
-        help='Number of CPU threads to be used (Default: min(4, CPU count))',
+        default=1,
+        help='Number of CPU threads to be used',
        type=int
    )
    parser.add_argument(
@@ -620,10 +684,17 @@ def parse_args():
 def main():
    args = parse_args()
    main_workflow = MainWorkflow(
-        args.input_dir, args.language, args.output_dir, args.binarize)
+        args.input_dir,
+        args.model,
+        args.output_dir,
+        args.binarize
+    )
    main_workflow.collect_jobs()
    retval = main_workflow.run(
-        dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
+        dataDirRoot=args.log_dir,
+        memMb=args.mem_mb,
+        nCores=args.n_cores
+    )
    sys.exit(retval)


@@ -17,7 +17,7 @@ GID = str(os.getgid())
 parser = ArgumentParser(add_help=False)
 parser.add_argument('-i', '--input-dir')
 parser.add_argument('-o', '--output-dir')
-parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+')
+parser.add_argument('-t', '--model-file', action='extend', nargs='+')
 parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()

@@ -30,9 +30,9 @@ if args.output_dir is not None:
    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
    cmd += ['-v', mapping]
    remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
-if args.models is not None:
-    for model in args.models:
-        mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}'  # noqa
+if args.model_file is not None:
+    for model_file in args.model_file:
+        mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}'  # noqa
        cmd += ['-v', mapping]
 if args.log_dir is not None:
    mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'