From 03b10545607dbbf7376b582f49746d6d43b92fec Mon Sep 17 00:00:00 2001
From: Patrick Jentsch <pjentsch@pjentsch-Laptop.local>
Date: Wed, 15 May 2019 14:55:36 +0200
Subject: [PATCH] Sort all lists before processing

---
 ocr | 43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/ocr b/ocr
index 46a8dba..3ff4a0f 100755
--- a/ocr
+++ b/ocr
@@ -13,6 +13,7 @@ Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
 import argparse
 import multiprocessing
 import os
+import re
 import sys
 from pyflow import WorkflowRunner
 
@@ -35,7 +36,6 @@ def parse_arguments():
         requirements: imagemagick, ocropus, pdftoppm, pdfunite, \
         poppler-utils, pyflow, python2.7, python3.5, tesseract'
     )
-
     parser.add_argument(
         '-l',
         dest='lang',
@@ -92,12 +92,6 @@ class OCRWorkflow(WorkflowRunner):
         self.nCores = args.nCores
 
     def workflow(self):
-        '''
-        ' Starting workflow...
-        '''
-        for index, job in enumerate(self.jobs):
-            print('%i: %s' % (index, job))
-
         '''
         ' Creating output directories...
         '''
@@ -114,9 +108,8 @@ class OCRWorkflow(WorkflowRunner):
                     os.path.join(job['output_dir'], 'tmp', 'txt')
                 )
             if not self.skipBinarisation:
-                cmd += ' "%s" "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp', 'bin.png'),
-                    os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
+                cmd += ' "%s"' % (
+                    os.path.join(job['output_dir'], 'tmp', 'bin.png')
                 )
             create_output_directories_jobs.append(
                 self.addTask(
@@ -186,7 +179,13 @@ class OCRWorkflow(WorkflowRunner):
             post_binarisation_jobs = []
             for index, job in enumerate(self.jobs):
                 number = 0
-                for file in filter(lambda x: x.endswith(('.bin.png', '.nrm.png')), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
+                files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
+                files = filter(lambda x: x.endswith('.bin.png'), files)
+                files = sorted(
+                    files,
+                    key=lambda x: int(re.search(r'\d+', x).group(0))
+                )
+                for file in files:
                     cmd = 'mv "%s" "%s"' % (
                         os.path.join(job['output_dir'], 'tmp', file),
                         os.path.join(job['output_dir'], 'tmp', 'page-%i.%s' % (
@@ -210,7 +209,6 @@ class OCRWorkflow(WorkflowRunner):
         ' Performing OCR...
         '''
         self.waitForTasks()
-        print(self)
         ocr_jobs = []
         '''
         ' Tesseract runs fastest with four cores. So we run it with either four
@@ -226,7 +224,16 @@ class OCRWorkflow(WorkflowRunner):
             ocr_job_nCores = 1
         for index, job in enumerate(self.jobs):
             number = 0
-            for file in filter(lambda x: x.endswith('.tif') if self.skipBinarisation else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
+            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
+            if self.skipBinarisation:
+                files = filter(lambda x: x.endswith('.tif'), files)
+            else:
+                files = filter(lambda x: x.endswith('.bin.png'), files)
+            files = sorted(
+                files,
+                key=lambda x: int(re.search(r'\d+', x).group(0))
+            )
+            for file in files:
                 cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
                     os.path.join(job['output_dir'], 'tmp', file),
                     os.path.join(
@@ -240,8 +247,9 @@ class OCRWorkflow(WorkflowRunner):
                     ocr_job_dependencies = 'split_job_-_%i' % (index)
                 else:
                     ocr_job_dependencies = filter(
-                        lambda x: x.startswith(
-                            'post_binarisation_job_-_%i' % (index)
+                        lambda x: x == 'post_binarisation_job_-_%i-%i' % (
+                            index,
+                            number
                         ),
                         post_binarisation_jobs
                     )
@@ -357,9 +365,8 @@ class OCRWorkflow(WorkflowRunner):
                         os.path.join(job['output_dir'], 'tmp'),
                         os.path.join(job['output_dir'], 'tmp', 'bin.png'),
                     )
-                    cmd += ' && mv "%s"/*.nrm.png "%s"' % (
-                        os.path.join(job['output_dir'], 'tmp'),
-                        os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
+                    cmd += ' && rm "%s"/*.nrm.png' % (
+                        os.path.join(job['output_dir'], 'tmp')
                     )
                 cleanup_jobs.append(
                     self.addTask(