Sort all lists before processing

2026-01-01 04:10:55 +00:00 · 2019-05-15 14:55:36 +02:00
parent b9dba80d7f
commit 03b1054560
1 changed file with 25 additions and 18 deletions
--- a/43
+++ b/43
@@ -13,6 +13,7 @@ Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
 import argparse
 import multiprocessing
 import os
+import re
 import sys
 from pyflow import WorkflowRunner
@@ -35,7 +36,6 @@ def parse_arguments():
        requirements: imagemagick, ocropus, pdftoppm, pdfunite, \
        poppler-utils, pyflow, python2.7, python3.5, tesseract'
    )
    parser.add_argument(
        '-l',
        dest='lang',
@@ -92,12 +92,6 @@ class OCRWorkflow(WorkflowRunner):
        self.nCores = args.nCores
    def workflow(self):
-        '''
-        ' Starting workflow...
-        '''
-        for index, job in enumerate(self.jobs):
-            print('%i: %s' % (index, job))
        '''
        ' Creating output directories...
        '''
@@ -114,9 +108,8 @@ class OCRWorkflow(WorkflowRunner):
                    os.path.join(job['output_dir'], 'tmp', 'txt')
                )
            if not self.skipBinarisation:
-                cmd += ' "%s" "%s"' % (
-                    os.path.join(job['output_dir'], 'tmp', 'bin.png'),
-                    os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
+                cmd += ' "%s"' % (
+                    os.path.join(job['output_dir'], 'tmp', 'bin.png')
                )
            create_output_directories_jobs.append(
                self.addTask(
@@ -186,7 +179,13 @@ class OCRWorkflow(WorkflowRunner):
            post_binarisation_jobs = []
            for index, job in enumerate(self.jobs):
                number = 0
-                for file in filter(lambda x: x.endswith(('.bin.png', '.nrm.png')), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
+                files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
+                files = filter(lambda x: x.endswith('.bin.png'), files)
+                files = sorted(
+                    files,
+                    key=lambda x: int(re.search(r'\d+', x).group(0))
+                )
+                for file in files:
                    cmd = 'mv "%s" "%s"' % (
                        os.path.join(job['output_dir'], 'tmp', file),
                        os.path.join(job['output_dir'], 'tmp', 'page-%i.%s' % (
@@ -210,7 +209,6 @@ class OCRWorkflow(WorkflowRunner):
        ' Performing OCR...
        '''
        self.waitForTasks()
-        print(self)
        ocr_jobs = []
        '''
        ' Tesseract runs fastest with four cores. So we run it with either four
@@ -226,7 +224,16 @@ class OCRWorkflow(WorkflowRunner):
            ocr_job_nCores = 1
        for index, job in enumerate(self.jobs):
            number = 0
-            for file in filter(lambda x: x.endswith('.tif') if self.skipBinarisation else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
+            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
+            if self.skipBinarisation:
+                files = filter(lambda x: x.endswith('.tif'), files)
+            else:
+                files = filter(lambda x: x.endswith('.bin.png'), files)
+            files = sorted(
+                files,
+                key=lambda x: int(re.search(r'\d+', x).group(0))
+            )
+            for file in files:
                cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
                    os.path.join(job['output_dir'], 'tmp', file),
                    os.path.join(
@@ -240,8 +247,9 @@ class OCRWorkflow(WorkflowRunner):
                    ocr_job_dependencies = 'split_job_-_%i' % (index)
                else:
                    ocr_job_dependencies = filter(
-                        lambda x: x.startswith(
-                            'post_binarisation_job_-_%i' % (index)
+                        lambda x: x == 'post_binarisation_job_-_%i-%i' % (
+                            index,
+                            number
                        ),
                        post_binarisation_jobs
                    )
@@ -357,9 +365,8 @@ class OCRWorkflow(WorkflowRunner):
                        os.path.join(job['output_dir'], 'tmp'),
                        os.path.join(job['output_dir'], 'tmp', 'bin.png'),
                    )
-                    cmd += ' && mv "%s"/*.nrm.png "%s"' % (
-                        os.path.join(job['output_dir'], 'tmp'),
-                        os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
+                    cmd += ' && rm "%s"/*.nrm.png' % (
+                        os.path.join(job['output_dir'], 'tmp')
                    )
                cleanup_jobs.append(
                    self.addTask(