From 4c0ba270db55ff131aa0db8132bf106e98e1f02b Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Thu, 16 May 2019 00:09:19 +0200 Subject: [PATCH] Update --- ocr | 147 ++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 103 insertions(+), 44 deletions(-) diff --git a/ocr b/ocr index 3ff4a0f..312ac22 100755 --- a/ocr +++ b/ocr @@ -36,6 +36,12 @@ def parse_arguments(): requirements: imagemagick, ocropus, pdftoppm, pdfunite, \ poppler-utils, pyflow, python2.7, python3.5, tesseract' ) + parser.add_argument( + '-i', + dest='inputDirectory', + help='The input directory.', + required=True + ) parser.add_argument( '-l', dest='lang', @@ -43,18 +49,10 @@ def parse_arguments(): required=True ) parser.add_argument( - '--i', - default=os.path.normpath('/files_for_ocr'), - dest='inputDirectory', - help='The input directory.', - required=False - ) - parser.add_argument( - '--o', - default=os.path.normpath('/files_from_ocr'), + '-o', dest='outputDirectory', help='The output directory.', - required=False + required=True ) parser.add_argument( '--skip-binarisation', @@ -93,7 +91,9 @@ class OCRWorkflow(WorkflowRunner): def workflow(self): ''' - ' Creating output directories... + ' ################################################## + ' # Create output directories # + ' ################################################## ''' create_output_directories_jobs = [] for index, job in enumerate(self.jobs): @@ -119,7 +119,9 @@ class OCRWorkflow(WorkflowRunner): ) ''' - ' Splitting... + ' ################################################## + ' # Split # + ' ################################################## ''' split_jobs = [] split_job_nCores = min( @@ -128,7 +130,7 @@ class OCRWorkflow(WorkflowRunner): ) for index, job in enumerate(self.jobs): if job['filename'].endswith(('.tif', '.tiff')): - cmd = 'convert "%s" -compress LZW -density 300 -scene 1 "%s"/page-%%d.tif' % ( + cmd = 'convert "%s" -compress LZW -density 300 -scene 1 "%s/page-%%d.tif"' % ( job['path'], os.path.join(job['output_dir'], 'tmp') ) @@ -148,7 +150,15 @@ class OCRWorkflow(WorkflowRunner): if not self.skipBinarisation: ''' - ' Binarising... + ' The binarisation_jobs are created based of the output files of + ' the split_jobs. So wait until they are finished. + ''' + self.waitForTasks() + + ''' + ' ################################################## + ' # Binarise # + ' ################################################## ''' binarisation_jobs = [] ''' @@ -158,10 +168,17 @@ class OCRWorkflow(WorkflowRunner): ''' binarisation_job_nCores = min(4, self.nCores) for index, job in enumerate(self.jobs): - cmd = 'ls --quoting-style=shell-escape -v "%s"/*.tif | xargs ocropus-nlbin --output "%s" --parallel "%i"' % ( + files = os.listdir(os.path.join(job['output_dir'], 'tmp')) + files = filter(lambda x: x.endswith('.tif'), files) + files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) + files = map( + lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"', + files + ) + cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % ( os.path.join(job['output_dir'], 'tmp'), - os.path.join(job['output_dir'], 'tmp'), - binarisation_job_nCores + binarisation_job_nCores, + ' '.join(files) ) binarisation_jobs.append( self.addTask( @@ -173,25 +190,30 @@ class OCRWorkflow(WorkflowRunner): ) ''' - ' Normalising file names from binarisation... + ' The post_binarisation_jobs are created based of the output files + ' of the binarisation_jobs. So wait until they are finished. ''' self.waitForTasks() + + ''' + ' ################################################## + ' # Normalise file names from binarisation # + ' ################################################## + ''' post_binarisation_jobs = [] for index, job in enumerate(self.jobs): number = 0 files = os.listdir(os.path.join(job['output_dir'], 'tmp')) files = filter(lambda x: x.endswith('.bin.png'), files) - files = sorted( - files, - key=lambda x: int(re.search(r'\d+', x).group(0)) - ) + files.sort() for file in files: cmd = 'mv "%s" "%s"' % ( os.path.join(job['output_dir'], 'tmp', file), - os.path.join(job['output_dir'], 'tmp', 'page-%i.%s' % ( - int(file.split('.', 1)[0]), - file.split('.', 1)[1]) - ), + os.path.join( + job['output_dir'], + 'tmp', + 'page-%i.bin.png' % (int(file.split('.', 1)[0])) + ) ) post_binarisation_jobs.append( self.addTask( @@ -206,9 +228,17 @@ class OCRWorkflow(WorkflowRunner): number += 1 ''' - ' Performing OCR... + ' The ocr_jobs are created based of the output files of either the + ' split_jobs or post_binarisation_jobs. So wait until they are + ' finished. ''' self.waitForTasks() + + ''' + ' ################################################## + ' # Optical Character Recognition # + ' ################################################## + ''' ocr_jobs = [] ''' ' Tesseract runs fastest with four cores. So we run it with either four @@ -223,19 +253,20 @@ class OCRWorkflow(WorkflowRunner): if self.lang == "deu_frak": ocr_job_nCores = 1 for index, job in enumerate(self.jobs): - number = 0 files = os.listdir(os.path.join(job['output_dir'], 'tmp')) if self.skipBinarisation: files = filter(lambda x: x.endswith('.tif'), files) else: files = filter(lambda x: x.endswith('.bin.png'), files) - files = sorted( - files, - key=lambda x: int(re.search(r'\d+', x).group(0)) + files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) + files = map( + lambda x: os.path.join(job['output_dir'], 'tmp', x), + files ) + number = 0 for file in files: cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % ( - os.path.join(job['output_dir'], 'tmp', file), + file, os.path.join( job['output_dir'], 'tmp', @@ -253,7 +284,6 @@ class OCRWorkflow(WorkflowRunner): ), post_binarisation_jobs ) - print(ocr_job_dependencies) ocr_jobs.append( self.addTask( command=cmd, @@ -265,7 +295,15 @@ class OCRWorkflow(WorkflowRunner): number += 1 ''' - ' Creating TEI P5 files... + ' The following jobs are created based of the output files of the + ' ocr_jobs. So wait until they are finished. + ''' + self.waitForTasks() + + ''' + ' ################################################## + ' # Create TEI P5 files # + ' ################################################## ''' hocr_to_tei_jobs = [] for index, job in enumerate(self.jobs): @@ -273,7 +311,7 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job['output_dir'], 'tmp'), os.path.join( job['output_dir'], - job['filename'].rsplit('.', 1)[0] + '.xml' + os.path.join(job['output_dir'], job['name'] + '.xml') ) ) hocr_to_tei_jobs.append( @@ -288,15 +326,24 @@ class OCRWorkflow(WorkflowRunner): ) ''' - ' Merging PDF files... + ' ################################################## + ' # Merge PDF files # + ' ################################################## ''' pdf_merge_jobs = [] for index, job in enumerate(self.jobs): - cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % ( - os.path.join(job['output_dir'], 'tmp'), + files = os.listdir(os.path.join(job['output_dir'], 'tmp')) + files = filter(lambda x: x.endswith('.pdf'), files) + files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) + files = map( + lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"', + files + ) + cmd = 'pdfunite %s "%s"' % ( + ' '.join(files), os.path.join( job['output_dir'], - job['filename'].rsplit('.', 1)[0] + '.pdf' + os.path.join(job['output_dir'], job['name'] + '.pdf') ) ) pdf_merge_jobs.append( @@ -311,15 +358,24 @@ class OCRWorkflow(WorkflowRunner): ) ''' - ' Merging text files... + ' ################################################## + ' # Merge text files # + ' ################################################## ''' txt_merge_jobs = [] for index, job in enumerate(self.jobs): - cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % ( - os.path.join(job['output_dir'], 'tmp'), + files = os.listdir(os.path.join(job['output_dir'], 'tmp')) + files = filter(lambda x: x.endswith('.txt'), files) + files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) + files = map( + lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"', + files + ) + cmd = 'cat %s > "%s"' % ( + ' '.join(files), os.path.join( job['output_dir'], - job['filename'].rsplit('.', 1)[0] + '.txt' + os.path.join(job['output_dir'], job['name'] + '.txt') ) ) txt_merge_jobs.append( @@ -334,7 +390,9 @@ class OCRWorkflow(WorkflowRunner): ) ''' - ' Cleanup... + ' ################################################## + ' # Cleanup # + ' ################################################## ''' cleanup_jobs = [] if self.keepIntermediates: @@ -407,6 +465,7 @@ def analyze_jobs(inputDirectory, outputDirectory): jobs.append( { 'filename': file, + 'name': file.rsplit('.', 1)[0], 'output_dir': os.path.join(outputDirectory, file), 'path': os.path.join(inputDirectory, file) }