diff --git a/ocr b/ocr index 59bce29..46a8dba 100755 --- a/ocr +++ b/ocr @@ -57,11 +57,11 @@ def parse_arguments(): required=False ) parser.add_argument( - '--skip-binarization', + '--skip-binarisation', action='store_true', default=False, - dest='skipBinarization', - help='Skip binarization.', + dest='skipBinarisation', + help='Skip binarisation.', required=False ) parser.add_argument( @@ -86,21 +86,21 @@ def parse_arguments(): class OCRWorkflow(WorkflowRunner): def __init__(self, args): self.jobs = analyze_jobs(args.inputDirectory, args.outputDirectory) - self.skipBinarization = args.skipBinarization + self.skipBinarisation = args.skipBinarisation self.keepIntermediates = args.keepIntermediates self.lang = args.lang self.nCores = args.nCores def workflow(self): - print('##########################################################') - print('# Starting workflow... #') - print('##########################################################') + ''' + ' Starting workflow... + ''' for index, job in enumerate(self.jobs): print('%i: %s' % (index, job)) - print('##########################################################') - print('# Creating output directories... #') - print('##########################################################') + ''' + ' Creating output directories... + ''' create_output_directories_jobs = [] for index, job in enumerate(self.jobs): cmd = 'mkdir -p "%s"' % ( @@ -113,7 +113,7 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job['output_dir'], 'tmp', 'tiff'), os.path.join(job['output_dir'], 'tmp', 'txt') ) - if not self.skipBinarization: + if not self.skipBinarisation: cmd += ' "%s" "%s"' % ( os.path.join(job['output_dir'], 'tmp', 'bin.png'), os.path.join(job['output_dir'], 'tmp', 'nrm.png'), @@ -124,11 +124,10 @@ class OCRWorkflow(WorkflowRunner): label='create_output_directories_job_-_%i' % (index) ) ) - self.waitForTasks() - print('##########################################################') - print('# Splitting... #') - print('##########################################################') + ''' + ' Splitting... + ''' split_jobs = [] split_job_nCores = min( self.nCores, @@ -148,16 +147,16 @@ class OCRWorkflow(WorkflowRunner): split_jobs.append( self.addTask( command=cmd, + dependencies='create_output_directories_job_-_%i' % (index), label='split_job_-_%i' % (index), nCores=split_job_nCores ) ) - self.waitForTasks() - if not self.skipBinarization: - print('##########################################################') - print('# Binarising... #') - print('##########################################################') + if not self.skipBinarisation: + ''' + ' Binarising... + ''' binarisation_jobs = [] ''' ' We run ocropus-nlbin with either four or, if there are less then @@ -174,15 +173,16 @@ class OCRWorkflow(WorkflowRunner): binarisation_jobs.append( self.addTask( command=cmd, + dependencies='split_job_-_%i' % (index), label='binarisation_job_-_%i' % (index), nCores=binarisation_job_nCores ) ) - self.waitForTasks() - print('##########################################################') - print('# Normalising file names from binarisation... #') - print('##########################################################') + ''' + ' Normalising file names from binarisation... + ''' + self.waitForTasks() post_binarisation_jobs = [] for index, job in enumerate(self.jobs): number = 0 @@ -197,6 +197,7 @@ class OCRWorkflow(WorkflowRunner): post_binarisation_jobs.append( self.addTask( command=cmd, + dependencies='binarisation_job_-_%i' % (index), label='post_binarisation_job_-_%i-%i' % ( index, number @@ -204,11 +205,12 @@ class OCRWorkflow(WorkflowRunner): ) ) number += 1 - self.waitForTasks() - print('##########################################################') - print('# Performing OCR... #') - print('##########################################################') + ''' + ' Performing OCR... + ''' + self.waitForTasks() + print(self) ocr_jobs = [] ''' ' Tesseract runs fastest with four cores. So we run it with either four @@ -224,29 +226,39 @@ class OCRWorkflow(WorkflowRunner): ocr_job_nCores = 1 for index, job in enumerate(self.jobs): number = 0 - for file in filter(lambda x: x.endswith('.tif') if self.skipBinarization else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))): + for file in filter(lambda x: x.endswith('.tif') if self.skipBinarisation else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))): cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % ( os.path.join(job['output_dir'], 'tmp', file), os.path.join( job['output_dir'], 'tmp', - file.rsplit('.', 1 if self.skipBinarization else 2)[0] + file.rsplit('.', 1 if self.skipBinarisation else 2)[0] ), self.lang ) + if self.skipBinarisation: + ocr_job_dependencies = 'split_job_-_%i' % (index) + else: + ocr_job_dependencies = filter( + lambda x: x.startswith( + 'post_binarisation_job_-_%i' % (index) + ), + post_binarisation_jobs + ) + print(ocr_job_dependencies) ocr_jobs.append( self.addTask( command=cmd, + dependencies=ocr_job_dependencies, label='ocr_job_-_%i-%i' % (index, number), nCores=ocr_job_nCores ) ) number += 1 - self.waitForTasks() - print('##########################################################') - print('# Creating TEI P5 files... #') - print('##########################################################') + ''' + ' Creating TEI P5 files... + ''' hocr_to_tei_jobs = [] for index, job in enumerate(self.jobs): cmd = 'hocrtotei "%s" "%s"' % ( @@ -259,13 +271,17 @@ class OCRWorkflow(WorkflowRunner): hocr_to_tei_jobs.append( self.addTask( command=cmd, + dependencies=filter( + lambda x: x.startswith('ocr_job_-_%i' % (index)), + ocr_jobs + ), label='hocr_to_tei_job_-_%i' % (index) ) ) - print('##########################################################') - print('# Merging PDF files... #') - print('##########################################################') + ''' + ' Merging PDF files... + ''' pdf_merge_jobs = [] for index, job in enumerate(self.jobs): cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % ( @@ -278,13 +294,17 @@ class OCRWorkflow(WorkflowRunner): pdf_merge_jobs.append( self.addTask( command=cmd, + dependencies=filter( + lambda x: x.startswith('ocr_job_-_%i' % (index)), + ocr_jobs + ), label='pdf_merge_job_-_%i' % (index) ) ) - print('##########################################################') - print('# Merging text files... #') - print('##########################################################') + ''' + ' Merging text files... + ''' txt_merge_jobs = [] for index, job in enumerate(self.jobs): cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % ( @@ -297,17 +317,25 @@ class OCRWorkflow(WorkflowRunner): txt_merge_jobs.append( self.addTask( command=cmd, + dependencies=filter( + lambda x: x.startswith('ocr_job_-_%i' % (index)), + ocr_jobs + ), label='txt_merge_job_-_%i' % (index) ) ) - self.waitForTasks() - print('##########################################################') - print('# Cleanup... #') - print('##########################################################') + ''' + ' Cleanup... + ''' cleanup_jobs = [] if self.keepIntermediates: for index, job in enumerate(self.jobs): + cleanup_job_dependencies = [ + 'hocr_to_tei_job_-_%i' % (index), + 'pdf_merge_job_-_%i' % (index), + 'txt_merge_job_-_%i' % (index) + ] cmd = 'mv "%s"/*.hocr "%s"' % ( os.path.join(job['output_dir'], 'tmp'), os.path.join(job['output_dir'], 'tmp', 'hocr'), @@ -324,7 +352,7 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job['output_dir'], 'tmp'), os.path.join(job['output_dir'], 'tmp', 'txt'), ) - if not self.skipBinarization: + if not self.skipBinarisation: cmd += ' && mv "%s"/*.bin.png "%s"' % ( os.path.join(job['output_dir'], 'tmp'), os.path.join(job['output_dir'], 'tmp', 'bin.png'), @@ -336,17 +364,24 @@ class OCRWorkflow(WorkflowRunner): cleanup_jobs.append( self.addTask( command=cmd, + dependencies=cleanup_job_dependencies, label='cleanup_job_-_%i' % (index) ) ) else: for index, job in enumerate(self.jobs): + cleanup_job_dependencies = [ + 'hocr_to_tei_job_-_%i' % (index), + 'pdf_merge_job_-_%i' % (index), + 'txt_merge_job_-_%i' % (index) + ] cmd = 'rm -r "%s"' % ( os.path.join(job['output_dir'], 'tmp') ) cleanup_jobs.append( self.addTask( command=cmd, + dependencies=cleanup_job_dependencies, label='cleanup_job_-_%i' % (index) ) )