diff --git a/ocr b/ocr index 9bf65a1..5783263 100755 --- a/ocr +++ b/ocr @@ -29,46 +29,48 @@ from pyflow import WorkflowRunner def parse_arguments(): - parser = argparse.ArgumentParser(description='Performs OCR of (historical) documents utilizing OCRopus for preprocessing and Tesseract OCR for OCR. Available outputs are hOCR, PDF, TEI compliant XML and raw text. Software requirements: imagemagick, ocropus, pdftoppm, pdfunite, poppler-utils, pyflow, python2.7, python3.5, tesseract') + parser = argparse.ArgumentParser( + description='Performs OCR of (historical) documents utilizing OCRopus for preprocessing and Tesseract OCR for OCR. The results are served as hOCR, PDF, raw text and TEI compliant XML files.\nSoftware requirements: imagemagick, ocropus, pdftoppm, pdfunite, poppler-utils, pyflow, python2.7, python3.5, tesseract' + ) parser.add_argument( '-i', - dest='inputDirectory', - help='The input directory.', + dest='input_dir', required=True ) parser.add_argument( '-l', + choices=[ + 'deu', 'deu_frak', 'eng', 'enm', 'fra', 'frm', 'ita', 'por', 'spa' + ], dest='lang', - help='Language for OCR.', required=True ) parser.add_argument( '-o', - dest='outputDirectory', - help='The output directory.', + dest='output_dir', required=True ) parser.add_argument( '--skip-binarisation', action='store_true', default=False, - dest='skipBinarisation', - help='Skip binarisation.', + dest='skip_binarisation', + help='skip ocropy binarisation', required=False ) parser.add_argument( '--keep-intermediates', action='store_true', default=False, - dest='keepIntermediates', - help='Keep intermediate files.', + dest='keep_intermediates', + help='keep intermediate files', required=False ) parser.add_argument( '--nCores', default=min(4, multiprocessing.cpu_count()), - dest='nCores', - help='Total number of cores available.', + dest='n_cores', + help='total number of cores available', required=False, type=int ) @@ -77,13 +79,16 @@ def parse_arguments(): class OCRWorkflow(WorkflowRunner): def __init__(self, args): - self.jobs = analyze_jobs(args.inputDirectory, args.outputDirectory) - self.skipBinarisation = args.skipBinarisation - self.keepIntermediates = args.keepIntermediates + self.jobs = analyze_jobs(args.input_dir, args.output_dir) + self.skip_binarisation = args.skip_binarisation + self.keep_intermediates = args.keep_intermediates self.lang = args.lang - self.nCores = args.nCores + self.n_cores = args.n_cores def workflow(self): + if len(self.jobs) == 0: + return + ''' ' ################################################## ' # Create output directories # @@ -94,14 +99,14 @@ class OCRWorkflow(WorkflowRunner): cmd = 'mkdir -p "%s"' % ( os.path.join(job['output_dir'], 'tmp') ) - if self.keepIntermediates: + if self.keep_intermediates: cmd += ' "%s" "%s" "%s" "%s"' % ( os.path.join(job['output_dir'], 'tmp', 'hocr'), os.path.join(job['output_dir'], 'tmp', 'pdf'), os.path.join(job['output_dir'], 'tmp', 'tiff'), os.path.join(job['output_dir'], 'tmp', 'txt') ) - if not self.skipBinarisation: + if not self.skip_binarisation: cmd += ' "%s"' % ( os.path.join(job['output_dir'], 'tmp', 'bin.png') ) @@ -119,12 +124,16 @@ class OCRWorkflow(WorkflowRunner): ''' split_jobs = [] split_job_nCores = min( - self.nCores, - max(1, int(self.nCores / len(self.jobs))) + self.n_cores, + max(1, int(self.n_cores / len(self.jobs))) ) for index, job in enumerate(self.jobs): if job['filename'].endswith(('.tif', '.tiff')): - cmd = 'convert "%s" -compress LZW -density 300 -scene 1 "%s/page-%%d.tif"' % ( + ''' + ' This command also works for PDF input but ocropus-nlbin + ' is not able to handle the TIFF output of it. + ''' + cmd = 'convert -density 300 "%s" -compress LZW -scene 1 "%s/page-%%d.tif"' % ( job['path'], os.path.join(job['output_dir'], 'tmp') ) @@ -133,6 +142,7 @@ class OCRWorkflow(WorkflowRunner): job['path'], os.path.join(job['output_dir'], 'tmp', 'page') ) + split_jobs.append( self.addTask( command=cmd, @@ -142,7 +152,7 @@ class OCRWorkflow(WorkflowRunner): ) ) - if not self.skipBinarisation: + if not self.skip_binarisation: ''' ' The binarisation_jobs are created based of the output files of ' the split_jobs. So wait until they are finished. @@ -160,7 +170,7 @@ class OCRWorkflow(WorkflowRunner): ' four cores available for this workflow, the available core ' number. ''' - binarisation_job_nCores = min(4, self.nCores) + binarisation_job_nCores = min(4, self.n_cores) for index, job in enumerate(self.jobs): files = os.listdir(os.path.join(job['output_dir'], 'tmp')) files = filter(lambda x: x.endswith('.tif'), files) @@ -239,7 +249,7 @@ class OCRWorkflow(WorkflowRunner): ' or, if there are less then four cores available for this workflow, ' the available core number. ''' - ocr_job_nCores = min(4, self.nCores) + ocr_job_nCores = min(4, self.n_cores) ''' ' WORKAROUND: Tesseract only uses one core for the deu_frak language ' model, so the workflow will also only reserve one in this case. @@ -248,7 +258,7 @@ class OCRWorkflow(WorkflowRunner): ocr_job_nCores = 1 for index, job in enumerate(self.jobs): files = os.listdir(os.path.join(job['output_dir'], 'tmp')) - if self.skipBinarisation: + if self.skip_binarisation: files = filter(lambda x: x.endswith('.tif'), files) else: files = filter(lambda x: x.endswith('.bin.png'), files) @@ -264,11 +274,11 @@ class OCRWorkflow(WorkflowRunner): os.path.join( job['output_dir'], 'tmp', - file.rsplit('.', 1 if self.skipBinarisation else 2)[0] + file.rsplit('.', 1 if self.skip_binarisation else 2)[0] ), self.lang ) - if self.skipBinarisation: + if self.skip_binarisation: ocr_job_dependencies = 'split_job_-_%i' % (index) else: ocr_job_dependencies = filter( @@ -396,7 +406,7 @@ class OCRWorkflow(WorkflowRunner): ' ################################################## ''' cleanup_jobs = [] - if self.keepIntermediates: + if self.keep_intermediates: for index, job in enumerate(self.jobs): cleanup_job_dependencies = [ 'hocr_to_tei_job_-_%i' % (index), @@ -419,7 +429,7 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job['output_dir'], 'tmp'), os.path.join(job['output_dir'], 'tmp', 'txt'), ) - if not self.skipBinarisation: + if not self.skip_binarisation: cmd += ' && mv "%s"/*.bin.png "%s"' % ( os.path.join(job['output_dir'], 'tmp'), os.path.join(job['output_dir'], 'tmp', 'bin.png'), @@ -453,22 +463,22 @@ class OCRWorkflow(WorkflowRunner): ) -def analyze_jobs(inputDirectory, outputDirectory): +def analyze_jobs(input_dir, output_dir): jobs = [] - for file in os.listdir(inputDirectory): - if os.path.isdir(os.path.join(inputDirectory, file)): + for file in os.listdir(input_dir): + if os.path.isdir(os.path.join(input_dir, file)): jobs += analyze_jobs( - os.path.join(inputDirectory, file), - os.path.join(outputDirectory, file) + os.path.join(input_dir, file), + os.path.join(output_dir, file) ) elif file.endswith(('.pdf', '.tif', '.tiff')): jobs.append( { 'filename': file, 'name': file.rsplit('.', 1)[0], - 'output_dir': os.path.join(outputDirectory, file), - 'path': os.path.join(inputDirectory, file) + 'output_dir': os.path.join(output_dir, file), + 'path': os.path.join(input_dir, file) } ) @@ -480,7 +490,7 @@ def main(): wflow = OCRWorkflow(args) - retval = wflow.run(dataDirRoot=args.outputDirectory, nCores=args.nCores) + retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores) sys.exit(retval)