From b9dba80d7ff372db346d5dda94c6ce656d3cfc52 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch <pjentsch@pjentsch-Laptop.local>
Date: Wed, 15 May 2019 13:54:08 +0200
Subject: [PATCH] Chain tasks via dependencies for a better task graph

---
 ocr | 125 ++++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 80 insertions(+), 45 deletions(-)

diff --git a/ocr b/ocr
index 59bce29..46a8dba 100755
--- a/ocr
+++ b/ocr
@@ -57,11 +57,11 @@ def parse_arguments():
         required=False
     )
     parser.add_argument(
-        '--skip-binarization',
+        '--skip-binarisation',
         action='store_true',
         default=False,
-        dest='skipBinarization',
-        help='Skip binarization.',
+        dest='skipBinarisation',
+        help='Skip binarisation.',
         required=False
     )
     parser.add_argument(
@@ -86,21 +86,21 @@ def parse_arguments():
 class OCRWorkflow(WorkflowRunner):
     def __init__(self, args):
         self.jobs = analyze_jobs(args.inputDirectory, args.outputDirectory)
-        self.skipBinarization = args.skipBinarization
+        self.skipBinarisation = args.skipBinarisation
         self.keepIntermediates = args.keepIntermediates
         self.lang = args.lang
         self.nCores = args.nCores
 
     def workflow(self):
-        print('##########################################################')
-        print('# Starting workflow...                                   #')
-        print('##########################################################')
+        '''
+        ' Starting workflow...
+        '''
         for index, job in enumerate(self.jobs):
             print('%i: %s' % (index, job))
 
-        print('##########################################################')
-        print('# Creating output directories...                         #')
-        print('##########################################################')
+        '''
+        ' Creating output directories...
+        '''
         create_output_directories_jobs = []
         for index, job in enumerate(self.jobs):
             cmd = 'mkdir -p "%s"' % (
@@ -113,7 +113,7 @@ class OCRWorkflow(WorkflowRunner):
                     os.path.join(job['output_dir'], 'tmp', 'tiff'),
                     os.path.join(job['output_dir'], 'tmp', 'txt')
                 )
-            if not self.skipBinarization:
+            if not self.skipBinarisation:
                 cmd += ' "%s" "%s"' % (
                     os.path.join(job['output_dir'], 'tmp', 'bin.png'),
                     os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
@@ -124,11 +124,10 @@ class OCRWorkflow(WorkflowRunner):
                     label='create_output_directories_job_-_%i' % (index)
                 )
             )
-        self.waitForTasks()
 
-        print('##########################################################')
-        print('# Splitting...                                           #')
-        print('##########################################################')
+        '''
+        ' Splitting...
+        '''
         split_jobs = []
         split_job_nCores = min(
             self.nCores,
@@ -148,16 +147,16 @@ class OCRWorkflow(WorkflowRunner):
             split_jobs.append(
                 self.addTask(
                     command=cmd,
+                    dependencies='create_output_directories_job_-_%i' % (index),
                     label='split_job_-_%i' % (index),
                     nCores=split_job_nCores
                 )
             )
-        self.waitForTasks()
 
-        if not self.skipBinarization:
-            print('##########################################################')
-            print('# Binarising...                                          #')
-            print('##########################################################')
+        if not self.skipBinarisation:
+            '''
+            ' Binarising...
+            '''
             binarisation_jobs = []
             '''
             ' We run ocropus-nlbin with either four or, if there are less then
@@ -174,15 +173,16 @@ class OCRWorkflow(WorkflowRunner):
                 binarisation_jobs.append(
                     self.addTask(
                         command=cmd,
+                        dependencies='split_job_-_%i' % (index),
                         label='binarisation_job_-_%i' % (index),
                         nCores=binarisation_job_nCores
                     )
                 )
-            self.waitForTasks()
 
-            print('##########################################################')
-            print('# Normalising file names from binarisation...            #')
-            print('##########################################################')
+            '''
+            ' Normalising file names from binarisation...
+            '''
+            self.waitForTasks()
             post_binarisation_jobs = []
             for index, job in enumerate(self.jobs):
                 number = 0
@@ -197,6 +197,7 @@ class OCRWorkflow(WorkflowRunner):
                     post_binarisation_jobs.append(
                         self.addTask(
                             command=cmd,
+                            dependencies='binarisation_job_-_%i' % (index),
                             label='post_binarisation_job_-_%i-%i' % (
                                 index,
                                 number
@@ -204,11 +205,12 @@ class OCRWorkflow(WorkflowRunner):
                         )
                     )
                     number += 1
-            self.waitForTasks()
 
-        print('##########################################################')
-        print('# Performing OCR...                                      #')
-        print('##########################################################')
+        '''
+        ' Performing OCR...
+        '''
+        # OCR inputs are found via os.listdir() below: finish earlier tasks.
+        self.waitForTasks()
         ocr_jobs = []
         '''
         ' Tesseract runs fastest with four cores. So we run it with either four
@@ -224,29 +226,39 @@ class OCRWorkflow(WorkflowRunner):
             ocr_job_nCores = 1
         for index, job in enumerate(self.jobs):
             number = 0
-            for file in filter(lambda x: x.endswith('.tif') if self.skipBinarization else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
+            for file in filter(lambda x: x.endswith('.tif') if self.skipBinarisation else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))):
                 cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
                     os.path.join(job['output_dir'], 'tmp', file),
                     os.path.join(
                         job['output_dir'],
                         'tmp',
-                        file.rsplit('.', 1 if self.skipBinarization else 2)[0]
+                        file.rsplit('.', 1 if self.skipBinarisation else 2)[0]
                     ),
                     self.lang
                 )
+                if self.skipBinarisation:
+                    ocr_job_dependencies = 'split_job_-_%i' % (index)
+                else:
+                    # Trailing '-' keeps job 1 from matching job 10, 11, ...
+                    ocr_job_dependencies = filter(
+                        lambda x: x.startswith(
+                            'post_binarisation_job_-_%i-' % (index)
+                        ),
+                        post_binarisation_jobs
+                    )
                 ocr_jobs.append(
                     self.addTask(
                         command=cmd,
+                        dependencies=ocr_job_dependencies,
                         label='ocr_job_-_%i-%i' % (index, number),
                         nCores=ocr_job_nCores
                     )
                 )
                 number += 1
-        self.waitForTasks()
 
-        print('##########################################################')
-        print('# Creating TEI P5 files...                               #')
-        print('##########################################################')
+        '''
+        ' Creating TEI P5 files...
+        '''
         hocr_to_tei_jobs = []
         for index, job in enumerate(self.jobs):
             cmd = 'hocrtotei "%s" "%s"' % (
@@ -259,13 +271,17 @@ class OCRWorkflow(WorkflowRunner):
             hocr_to_tei_jobs.append(
                 self.addTask(
                     command=cmd,
+                    dependencies=filter(  # '-' keeps job 1 from matching 10+
+                        lambda x: x.startswith('ocr_job_-_%i-' % (index)),
+                        ocr_jobs
+                    ),
                     label='hocr_to_tei_job_-_%i' % (index)
                 )
             )
 
-        print('##########################################################')
-        print('# Merging PDF files...                                   #')
-        print('##########################################################')
+        '''
+        ' Merging PDF files...
+        '''
         pdf_merge_jobs = []
         for index, job in enumerate(self.jobs):
             cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % (
@@ -278,13 +294,17 @@ class OCRWorkflow(WorkflowRunner):
             pdf_merge_jobs.append(
                 self.addTask(
                     command=cmd,
+                    dependencies=filter(  # '-' keeps job 1 from matching 10+
+                        lambda x: x.startswith('ocr_job_-_%i-' % (index)),
+                        ocr_jobs
+                    ),
                     label='pdf_merge_job_-_%i' % (index)
                 )
             )
 
-        print('##########################################################')
-        print('# Merging text files...                                  #')
-        print('##########################################################')
+        '''
+        ' Merging text files...
+        '''
         txt_merge_jobs = []
         for index, job in enumerate(self.jobs):
             cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % (
@@ -297,17 +317,25 @@ class OCRWorkflow(WorkflowRunner):
             txt_merge_jobs.append(
                 self.addTask(
                     command=cmd,
+                    dependencies=filter(  # '-' keeps job 1 from matching 10+
+                        lambda x: x.startswith('ocr_job_-_%i-' % (index)),
+                        ocr_jobs
+                    ),
                     label='txt_merge_job_-_%i' % (index)
                 )
             )
-        self.waitForTasks()
 
-        print('##########################################################')
-        print('# Cleanup...                                             #')
-        print('##########################################################')
+        '''
+        ' Cleanup...
+        '''
         cleanup_jobs = []
         if self.keepIntermediates:
             for index, job in enumerate(self.jobs):
+                cleanup_job_dependencies = [
+                    'hocr_to_tei_job_-_%i' % (index),
+                    'pdf_merge_job_-_%i' % (index),
+                    'txt_merge_job_-_%i' % (index)
+                ]
                 cmd = 'mv "%s"/*.hocr "%s"' % (
                     os.path.join(job['output_dir'], 'tmp'),
                     os.path.join(job['output_dir'], 'tmp', 'hocr'),
@@ -324,7 +352,7 @@ class OCRWorkflow(WorkflowRunner):
                     os.path.join(job['output_dir'], 'tmp'),
                     os.path.join(job['output_dir'], 'tmp', 'txt'),
                 )
-                if not self.skipBinarization:
+                if not self.skipBinarisation:
                     cmd += ' && mv "%s"/*.bin.png "%s"' % (
                         os.path.join(job['output_dir'], 'tmp'),
                         os.path.join(job['output_dir'], 'tmp', 'bin.png'),
@@ -336,17 +364,24 @@ class OCRWorkflow(WorkflowRunner):
                 cleanup_jobs.append(
                     self.addTask(
                         command=cmd,
+                        dependencies=cleanup_job_dependencies,
                         label='cleanup_job_-_%i' % (index)
                     )
                 )
         else:
             for index, job in enumerate(self.jobs):
+                cleanup_job_dependencies = [
+                    'hocr_to_tei_job_-_%i' % (index),
+                    'pdf_merge_job_-_%i' % (index),
+                    'txt_merge_job_-_%i' % (index)
+                ]
                 cmd = 'rm -r "%s"' % (
                     os.path.join(job['output_dir'], 'tmp')
                 )
                 cleanup_jobs.append(
                     self.addTask(
                         command=cmd,
+                        dependencies=cleanup_job_dependencies,
                         label='cleanup_job_-_%i' % (index)
                     )
                 )