Sort all lists before processing

This commit is contained in:
Patrick Jentsch 2019-05-15 14:55:36 +02:00
parent b9dba80d7f
commit 03b1054560

43
ocr
View File

@@ -13,6 +13,7 @@ Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
import argparse import argparse
import multiprocessing import multiprocessing
import os import os
import re
import sys import sys
from pyflow import WorkflowRunner from pyflow import WorkflowRunner
@@ -35,7 +36,6 @@ def parse_arguments():
requirements: imagemagick, ocropus, pdftoppm, pdfunite, \ requirements: imagemagick, ocropus, pdftoppm, pdfunite, \
poppler-utils, pyflow, python2.7, python3.5, tesseract' poppler-utils, pyflow, python2.7, python3.5, tesseract'
) )
parser.add_argument( parser.add_argument(
'-l', '-l',
dest='lang', dest='lang',
@@ -92,12 +92,6 @@ class OCRWorkflow(WorkflowRunner):
self.nCores = args.nCores self.nCores = args.nCores
def workflow(self): def workflow(self):
'''
' Starting workflow...
'''
for index, job in enumerate(self.jobs):
print('%i: %s' % (index, job))
''' '''
' Creating output directories... ' Creating output directories...
''' '''
@@ -114,9 +108,8 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(job['output_dir'], 'tmp', 'txt') os.path.join(job['output_dir'], 'tmp', 'txt')
) )
if not self.skipBinarisation: if not self.skipBinarisation:
cmd += ' "%s" "%s"' % ( cmd += ' "%s"' % (
os.path.join(job['output_dir'], 'tmp', 'bin.png'), os.path.join(job['output_dir'], 'tmp', 'bin.png')
os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
) )
create_output_directories_jobs.append( create_output_directories_jobs.append(
self.addTask( self.addTask(
@@ -186,7 +179,13 @@ class OCRWorkflow(WorkflowRunner):
post_binarisation_jobs = [] post_binarisation_jobs = []
for index, job in enumerate(self.jobs): for index, job in enumerate(self.jobs):
number = 0 number = 0
for file in filter(lambda x: x.endswith(('.bin.png', '.nrm.png')), os.listdir(os.path.join(job['output_dir'], 'tmp'))): files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
files = filter(lambda x: x.endswith('.bin.png'), files)
files = sorted(
files,
key=lambda x: int(re.search(r'\d+', x).group(0))
)
for file in files:
cmd = 'mv "%s" "%s"' % ( cmd = 'mv "%s" "%s"' % (
os.path.join(job['output_dir'], 'tmp', file), os.path.join(job['output_dir'], 'tmp', file),
os.path.join(job['output_dir'], 'tmp', 'page-%i.%s' % ( os.path.join(job['output_dir'], 'tmp', 'page-%i.%s' % (
@@ -210,7 +209,6 @@ class OCRWorkflow(WorkflowRunner):
' Performing OCR... ' Performing OCR...
''' '''
self.waitForTasks() self.waitForTasks()
print(self)
ocr_jobs = [] ocr_jobs = []
''' '''
' Tesseract runs fastest with four cores. So we run it with either four ' Tesseract runs fastest with four cores. So we run it with either four
@@ -226,7 +224,16 @@ class OCRWorkflow(WorkflowRunner):
ocr_job_nCores = 1 ocr_job_nCores = 1
for index, job in enumerate(self.jobs): for index, job in enumerate(self.jobs):
number = 0 number = 0
for file in filter(lambda x: x.endswith('.tif') if self.skipBinarisation else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))): files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
if self.skipBinarisation:
files = filter(lambda x: x.endswith('.tif'), files)
else:
files = filter(lambda x: x.endswith('.bin.png'), files)
files = sorted(
files,
key=lambda x: int(re.search(r'\d+', x).group(0))
)
for file in files:
cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % ( cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
os.path.join(job['output_dir'], 'tmp', file), os.path.join(job['output_dir'], 'tmp', file),
os.path.join( os.path.join(
@@ -240,8 +247,9 @@ class OCRWorkflow(WorkflowRunner):
ocr_job_dependencies = 'split_job_-_%i' % (index) ocr_job_dependencies = 'split_job_-_%i' % (index)
else: else:
ocr_job_dependencies = filter( ocr_job_dependencies = filter(
lambda x: x.startswith( lambda x: x == 'post_binarisation_job_-_%i-%i' % (
'post_binarisation_job_-_%i' % (index) index,
number
), ),
post_binarisation_jobs post_binarisation_jobs
) )
@@ -357,9 +365,8 @@ class OCRWorkflow(WorkflowRunner):
os.path.join(job['output_dir'], 'tmp'), os.path.join(job['output_dir'], 'tmp'),
os.path.join(job['output_dir'], 'tmp', 'bin.png'), os.path.join(job['output_dir'], 'tmp', 'bin.png'),
) )
cmd += ' && mv "%s"/*.nrm.png "%s"' % ( cmd += ' && rm "%s"/*.nrm.png' % (
os.path.join(job['output_dir'], 'tmp'), os.path.join(job['output_dir'], 'tmp')
os.path.join(job['output_dir'], 'tmp', 'nrm.png'),
) )
cleanup_jobs.append( cleanup_jobs.append(
self.addTask( self.addTask(