Sort all lists before processing

2026-06-12 07:45:44 +00:00 · 2019-05-15 14:55:36 +02:00
parent b9dba80d7f
commit 03b1054560
1 changed file with 25 additions and 18 deletions
@@ -13,6 +13,7 @@ Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
 import argparse
 import multiprocessing
 import os
+import re
 import sys
 from pyflow import WorkflowRunner

@@ -35,7 +36,6 @@ def parse_arguments():
        requirements: imagemagick, ocropus, pdftoppm, pdfunite, \
        poppler-utils, pyflow, python2.7, python3.5, tesseract'
    )
-
    parser.add_argument(
        '-l',
        dest='lang',
@@ -92,12 +92,6 @@ class OCRWorkflow(WorkflowRunner):
        self.nCores = args.nCores

    def workflow(self):
-        '''
-        ' Starting workflow...
-        '''
-        for index, job in enumerate(self.jobs):
-            print('%i: %s' % (index, job))
-
        '''
        ' Creating output directories...
        '''
@@ -114,9 +108,8 @@ class OCRWorkflow(WorkflowRunner):
                    os.path.join(job['output_dir'], 'tmp', 'txt')
                )
            if not self.skipBinarisation:
-                cmd += ' "%s" "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp', 'bin.png'),
-                    os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
+                cmd += ' "%s"' % (
+                    os.path.join(job['output_dir'], 'tmp', 'bin.png')
                )
            create_output_directories_jobs.append(
                self.addTask(
@@ -186,7 +179,13 @@ class OCRWorkflow(WorkflowRunner):
            post_binarisation_jobs = []
            for index, job in enumerate(self.jobs):
                number = 0
-                for file in filter(lambda x: x.endswith(('.bin.png', '.nrm.png')), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
+                files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
+                files = filter(lambda x: x.endswith('.bin.png'), files)
+                files = sorted(
+                    files,
+                    key=lambda x: int(re.search(r'\d+', x).group(0))
+                )
+                for file in files:
                    cmd = 'mv "%s" "%s"' % (
                        os.path.join(job['output_dir'], 'tmp', file),
                        os.path.join(job['output_dir'], 'tmp', 'page-%i.%s' % (
@@ -210,7 +209,6 @@ class OCRWorkflow(WorkflowRunner):
        ' Performing OCR...
        '''
        self.waitForTasks()
-        print(self)
        ocr_jobs = []
        '''
        ' Tesseract runs fastest with four cores. So we run it with either four
@@ -226,7 +224,16 @@ class OCRWorkflow(WorkflowRunner):
            ocr_job_nCores = 1
        for index, job in enumerate(self.jobs):
            number = 0
-            for file in filter(lambda x: x.endswith('.tif') if self.skipBinarisation else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
+            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
+            if self.skipBinarisation:
+                files = filter(lambda x: x.endswith('.tif'), files)
+            else:
+                files = filter(lambda x: x.endswith('.bin.png'), files)
+            files = sorted(
+                files,
+                key=lambda x: int(re.search(r'\d+', x).group(0))
+            )
+            for file in files:
                cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
                    os.path.join(job['output_dir'], 'tmp', file),
                    os.path.join(
@@ -240,8 +247,9 @@ class OCRWorkflow(WorkflowRunner):
                    ocr_job_dependencies = 'split_job_-_%i' % (index)
                else:
                    ocr_job_dependencies = filter(
-                        lambda x: x.startswith(
-                            'post_binarisation_job_-_%i' % (index)
+                        lambda x: x == 'post_binarisation_job_-_%i-%i' % (
+                            index,
+                            number
                        ),
                        post_binarisation_jobs
                    )
@@ -357,9 +365,8 @@ class OCRWorkflow(WorkflowRunner):
                        os.path.join(job['output_dir'], 'tmp'),
                        os.path.join(job['output_dir'], 'tmp', 'bin.png'),
                    )
-                    cmd += ' && mv "%s"/*.nrm.png "%s"' % (
-                        os.path.join(job['output_dir'], 'tmp'),
-                        os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
+                    cmd += ' && rm "%s"/*.nrm.png' % (
+                        os.path.join(job['output_dir'], 'tmp')
                    )
                cleanup_jobs.append(
                    self.addTask(