From 03b10545607dbbf7376b582f49746d6d43b92fec Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Wed, 15 May 2019 14:55:36 +0200 Subject: [PATCH] Sort all lists before processing --- ocr | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/ocr b/ocr index 46a8dba..3ff4a0f 100755 --- a/ocr +++ b/ocr @@ -13,6 +13,7 @@ Author: Patrick Jentsch import argparse import multiprocessing import os +import re import sys from pyflow import WorkflowRunner @@ -35,7 +36,6 @@ def parse_arguments(): requirements: imagemagick, ocropus, pdftoppm, pdfunite, \ poppler-utils, pyflow, python2.7, python3.5, tesseract' ) - parser.add_argument( '-l', dest='lang', @@ -92,12 +92,6 @@ class OCRWorkflow(WorkflowRunner): self.nCores = args.nCores def workflow(self): - ''' - ' Starting workflow... - ''' - for index, job in enumerate(self.jobs): - print('%i: %s' % (index, job)) - ''' ' Creating output directories... ''' @@ -114,9 +108,8 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job['output_dir'], 'tmp', 'txt') ) if not self.skipBinarisation: - cmd += ' "%s" "%s"' % ( - os.path.join(job['output_dir'], 'tmp', 'bin.png'), - os.path.join(job['output_dir'], 'tmp', 'nrm.png'), + cmd += ' "%s"' % ( + os.path.join(job['output_dir'], 'tmp', 'bin.png') ) create_output_directories_jobs.append( self.addTask( @@ -186,7 +179,13 @@ class OCRWorkflow(WorkflowRunner): post_binarisation_jobs = [] for index, job in enumerate(self.jobs): number = 0 - for file in filter(lambda x: x.endswith(('.bin.png', '.nrm.png')), os.listdir(os.path.join(job['output_dir'], 'tmp'))): + files = os.listdir(os.path.join(job['output_dir'], 'tmp')) + files = filter(lambda x: x.endswith('.bin.png'), files) + files = sorted( + files, + key=lambda x: int(re.search(r'\d+', x).group(0)) + ) + for file in files: cmd = 'mv "%s" "%s"' % ( os.path.join(job['output_dir'], 'tmp', file), os.path.join(job['output_dir'], 'tmp', 'page-%i.%s' % ( @@ -210,7 +209,6 @@ class OCRWorkflow(WorkflowRunner): ' Performing OCR... ''' self.waitForTasks() - print(self) ocr_jobs = [] ''' ' Tesseract runs fastest with four cores. So we run it with either four @@ -226,7 +224,16 @@ class OCRWorkflow(WorkflowRunner): ocr_job_nCores = 1 for index, job in enumerate(self.jobs): number = 0 - for file in filter(lambda x: x.endswith('.tif') if self.skipBinarisation else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))): + files = os.listdir(os.path.join(job['output_dir'], 'tmp')) + if self.skipBinarisation: + files = filter(lambda x: x.endswith('.tif'), files) + else: + files = filter(lambda x: x.endswith('.bin.png'), files) + files = sorted( + files, + key=lambda x: int(re.search(r'\d+', x).group(0)) + ) + for file in files: cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % ( os.path.join(job['output_dir'], 'tmp', file), os.path.join( @@ -240,8 +247,9 @@ class OCRWorkflow(WorkflowRunner): ocr_job_dependencies = 'split_job_-_%i' % (index) else: ocr_job_dependencies = filter( - lambda x: x.startswith( - 'post_binarisation_job_-_%i' % (index) + lambda x: x == 'post_binarisation_job_-_%i-%i' % ( + index, + number ), post_binarisation_jobs ) @@ -357,9 +365,8 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job['output_dir'], 'tmp'), os.path.join(job['output_dir'], 'tmp', 'bin.png'), ) - cmd += ' && mv "%s"/*.nrm.png "%s"' % ( - os.path.join(job['output_dir'], 'tmp'), - os.path.join(job['output_dir'], 'tmp', 'nrm.png'), + cmd += ' && rm "%s"/*.nrm.png' % ( + os.path.join(job['output_dir'], 'tmp') ) cleanup_jobs.append( self.addTask(