From e5c0d53a030c7c2d8be698233180c8bbccabab07 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Wed, 15 May 2019 11:56:24 +0200 Subject: [PATCH] Add some output messages and code formatting. --- ocr | 407 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 206 insertions(+), 201 deletions(-) diff --git a/ocr b/ocr index 34a9c7e..59bce29 100755 --- a/ocr +++ b/ocr @@ -29,172 +29,187 @@ from pyflow import WorkflowRunner def parse_arguments(): parser = argparse.ArgumentParser( - "Performs OCR of (historical) documents utilizing OCRopus for \ + 'Performs OCR of (historical) documents utilizing OCRopus for \ preprocessing and Tesseract OCR for OCR. Available outputs are HOCR, \ PDF, shrinked PDF, and simple DTAbf (TEI P5 compliant). Software \ - requirements: imagemagick, ocropus, pdftk, pdftoppm, poppler-utils, \ - pyflow, python2.7, python3.5, tesseract" + requirements: imagemagick, ocropus, pdftoppm, pdfunite, \ + poppler-utils, pyflow, python2.7, python3.5, tesseract' ) - parser.add_argument("-l", - dest='lang', - help="Language for OCR", - required=True) - parser.add_argument("--skip-binarization", - action='store_true', - default=False, - dest="skipBinarization", - help="Skip binarization.", - required=False) - parser.add_argument("--keep-intermediates", - action='store_true', - default=False, - dest="keepIntermediates", - help="Keep intermediate files.", - required=False) - parser.add_argument("--nCores", - default=min(4, multiprocessing.cpu_count()), - dest="nCores", - help="Total number of cores available.", - required=False, - type=int) + parser.add_argument( + '-l', + dest='lang', + help='Language for OCR.', + required=True + ) + parser.add_argument( + '--i', + default=os.path.normpath('/files_for_ocr'), + dest='inputDirectory', + help='The input directory.', + required=False + ) + parser.add_argument( + '--o', + default=os.path.normpath('/files_from_ocr'), + dest='outputDirectory', + help='The output directory.', + required=False + ) + parser.add_argument( + '--skip-binarization', + action='store_true', + default=False, + dest='skipBinarization', + help='Skip binarization.', + required=False + ) + parser.add_argument( + '--keep-intermediates', + action='store_true', + default=False, + dest='keepIntermediates', + help='Keep intermediate files.', + required=False + ) + parser.add_argument( + '--nCores', + default=min(4, multiprocessing.cpu_count()), + dest='nCores', + help='Total number of cores available.', + required=False, + type=int + ) return parser.parse_args() class OCRWorkflow(WorkflowRunner): def __init__(self, args): - self.jobs = analyze_jobs() + self.jobs = analyze_jobs(args.inputDirectory, args.outputDirectory) self.skipBinarization = args.skipBinarization self.keepIntermediates = args.keepIntermediates self.lang = args.lang self.nCores = args.nCores - self.defaultNCores = min( - self.nCores, - max(1, int(self.nCores / len(self.jobs))) - ) def workflow(self): - ### - # Task "create_output_directories_job": create output directories - # Dependencies: None - ### + print('##########################################################') + print('# Starting workflow... #') + print('##########################################################') + for index, job in enumerate(self.jobs): + print('%i: %s' % (index, job)) + + print('##########################################################') + print('# Creating output directories... #') + print('##########################################################') create_output_directories_jobs = [] - create_output_directories_job_number = 0 - for job in self.jobs: - create_output_directories_job_number += 1 - cmd = 'mkdir -p "%s" "%s" "%s" "%s"' % ( - os.path.join(job["output_dir"], "tmp", "hocr"), - os.path.join(job["output_dir"], "tmp", "pdf"), - os.path.join(job["output_dir"], "tmp", "tiff"), - os.path.join(job["output_dir"], "tmp", "txt") + for index, job in enumerate(self.jobs): + cmd = 'mkdir -p "%s"' % ( + os.path.join(job['output_dir'], 'tmp') ) + if self.keepIntermediates: + cmd += ' "%s" "%s" "%s" "%s"' % ( + os.path.join(job['output_dir'], 'tmp', 'hocr'), + os.path.join(job['output_dir'], 'tmp', 'pdf'), + os.path.join(job['output_dir'], 'tmp', 'tiff'), + os.path.join(job['output_dir'], 'tmp', 'txt') + ) if not self.skipBinarization: cmd += ' "%s" "%s"' % ( - os.path.join(job["output_dir"], "tmp", "bin.png"), - os.path.join(job["output_dir"], "tmp", "nrm.png"), + os.path.join(job['output_dir'], 'tmp', 'bin.png'), + os.path.join(job['output_dir'], 'tmp', 'nrm.png'), ) create_output_directories_jobs.append( self.addTask( command=cmd, - label="create_output_directories_job_-_%i" % ( - create_output_directories_job_number - ), - nCores=self.defaultNCores) + label='create_output_directories_job_-_%i' % (index) ) + ) + self.waitForTasks() - ### - # Task "split_job": split input file into one tiff file per page - # Dependencies: create_output_directories_jobs - ### + print('##########################################################') + print('# Splitting... #') + print('##########################################################') split_jobs = [] - split_job_number = 0 - for job in self.jobs: - split_job_number += 1 - if job["filename"].endswith((".tif", ".tiff")): + split_job_nCores = min( + self.nCores, + max(1, int(self.nCores / len(self.jobs))) + ) + for index, job in enumerate(self.jobs): + if job['filename'].endswith(('.tif', '.tiff')): cmd = 'convert "%s" -compress LZW -density 300 -scene 1 "%s"/page-%%d.tif' % ( - job["path"], - os.path.join(job["output_dir"], "tmp") + job['path'], + os.path.join(job['output_dir'], 'tmp') ) else: cmd = 'pdftoppm -r 300 -tiff -tiffcompression lzw "%s" "%s"' % ( - job["path"], - os.path.join(job["output_dir"], "tmp", "page") + job['path'], + os.path.join(job['output_dir'], 'tmp', 'page') ) split_jobs.append( self.addTask( command=cmd, - dependencies=create_output_directories_jobs, - label="split_job_-_%i" % (split_job_number), - nCores=self.defaultNCores + label='split_job_-_%i' % (index), + nCores=split_job_nCores ) ) + self.waitForTasks() - ### - # Task "ocropus_nlbin_job": binarize tiff files from previous split - # Dependencies: split_jobs - ### - binarization_jobs = [] - binarization_job_number = 0 - ''' - ' We run ocropus-nlbin with either four or, if there are less then four - ' cores available for this workflow, the available core number. - ''' - binarization_job_nCores = min(4, self.nCores) if not self.skipBinarization: - for job in self.jobs: - binarization_job_number += 1 - cmd = 'ocropus-nlbin --output "%s" --parallel "%i" $(ls --quoting-style=shell-escape -v "%s"/*.tif)' % ( - os.path.join(job["output_dir"], "tmp"), - binarization_job_nCores, - os.path.join(job["output_dir"], "tmp") + print('##########################################################') + print('# Binarising... #') + print('##########################################################') + binarisation_jobs = [] + ''' + ' We run ocropus-nlbin with either four or, if there are less then + ' four cores available for this workflow, the available core + ' number. + ''' + binarisation_job_nCores = min(4, self.nCores) + for index, job in enumerate(self.jobs): + cmd = 'ls --quoting-style=shell-escape -v "%s"/*.tif | xargs ocropus-nlbin --output "%s" --parallel "%i"' % ( + os.path.join(job['output_dir'], 'tmp'), + os.path.join(job['output_dir'], 'tmp'), + binarisation_job_nCores ) - binarization_jobs.append( + binarisation_jobs.append( self.addTask( command=cmd, - dependencies=split_jobs, - label="binarization_job_-_%i" % ( - binarization_job_number - ), - nCores=binarization_job_nCores + label='binarisation_job_-_%i' % (index), + nCores=binarisation_job_nCores ) ) + self.waitForTasks() - ### - # Task "post_binarization_job": Normalize file names from binarization - # Dependencies: binarization_jobs - ### - self.waitForTasks() - post_binarization_jobs = [] - post_binarization_job_number = 0 - if not self.skipBinarization: - for job in self.jobs: - for file in filter(lambda x: x.endswith((".bin.png", ".nrm.png")), os.listdir(os.path.join(job["output_dir"], "tmp"))): - post_binarization_job_number += 1 + print('##########################################################') + print('# Normalising file names from binarisation... #') + print('##########################################################') + post_binarisation_jobs = [] + for index, job in enumerate(self.jobs): + number = 0 + for file in filter(lambda x: x.endswith(('.bin.png', '.nrm.png')), os.listdir(os.path.join(job['output_dir'], 'tmp'))): cmd = 'mv "%s" "%s"' % ( - os.path.join(job["output_dir"], "tmp", file), - os.path.join(job["output_dir"], "tmp", "page-%i.%s" % ( - int(file.split(".", 1)[0]), - file.split(".", 1)[1]) + os.path.join(job['output_dir'], 'tmp', file), + os.path.join(job['output_dir'], 'tmp', 'page-%i.%s' % ( + int(file.split('.', 1)[0]), + file.split('.', 1)[1]) ), ) - post_binarization_jobs.append( + post_binarisation_jobs.append( self.addTask( command=cmd, - dependencies=binarization_jobs, - label="post_binarization_job_-_%i" % ( - post_binarization_job_number - ), - nCores=self.defaultNCores + label='post_binarisation_job_-_%i-%i' % ( + index, + number + ) ) ) + number += 1 + self.waitForTasks() - ### - # Task "ocr_job": perform OCR - # Dependencies: waitForTasks - ### - self.waitForTasks() + print('##########################################################') + print('# Performing OCR... #') + print('##########################################################') ocr_jobs = [] - ocr_job_number = 0 ''' ' Tesseract runs fastest with four cores. So we run it with either four ' or, if there are less then four cores available for this workflow, @@ -207,161 +222,151 @@ class OCRWorkflow(WorkflowRunner): ''' if self.lang == "deu_frak": ocr_job_nCores = 1 - for job in self.jobs: - for file in filter(lambda x: x.endswith(".tif") if self.skipBinarization else x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp"))): - ocr_job_number += 1 + for index, job in enumerate(self.jobs): + number = 0 + for file in filter(lambda x: x.endswith('.tif') if self.skipBinarization else x.endswith('.bin.png'), os.listdir(os.path.join(job['output_dir'], 'tmp'))): cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % ( - os.path.join(job["output_dir"], "tmp", file), - os.path.join(job["output_dir"], "tmp", file.rsplit(".", 1 if self.skipBinarization else 2)[0]), + os.path.join(job['output_dir'], 'tmp', file), + os.path.join( + job['output_dir'], + 'tmp', + file.rsplit('.', 1 if self.skipBinarization else 2)[0] + ), self.lang ) ocr_jobs.append( self.addTask( command=cmd, - dependencies=post_binarization_jobs, - label="ocr_job_-_%i" % (ocr_job_number), + label='ocr_job_-_%i-%i' % (index, number), nCores=ocr_job_nCores ) ) + number += 1 + self.waitForTasks() - ### - # Task "hocr_to_tei_job": create TEI P5 file from hocr files - # Dependencies: ocr_jobs - ### + print('##########################################################') + print('# Creating TEI P5 files... #') + print('##########################################################') hocr_to_tei_jobs = [] - hocr_to_tei_job_number = 0 - for job in self.jobs: - hocr_to_tei_job_number += 1 + for index, job in enumerate(self.jobs): cmd = 'hocrtotei "%s" "%s"' % ( - os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".xml") + os.path.join(job['output_dir'], 'tmp'), + os.path.join( + job['output_dir'], + job['filename'].rsplit('.', 1)[0] + '.xml' + ) ) hocr_to_tei_jobs.append( self.addTask( command=cmd, - dependencies=ocr_jobs, - label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), - nCores=self.defaultNCores + label='hocr_to_tei_job_-_%i' % (index) ) ) - ### - # Task "pdf_merge_job": Merge PDF files - # Dependencies: ocr_jobs - ### + print('##########################################################') + print('# Merging PDF files... #') + print('##########################################################') pdf_merge_jobs = [] - pdf_merge_job_number = 0 - for job in self.jobs: - pdf_merge_job_number += 1 + for index, job in enumerate(self.jobs): cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % ( - os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".pdf") + os.path.join(job['output_dir'], 'tmp'), + os.path.join( + job['output_dir'], + job['filename'].rsplit('.', 1)[0] + '.pdf' + ) ) pdf_merge_jobs.append( self.addTask( command=cmd, - dependencies=ocr_jobs, - label="pdf_merge_job_-_%i" % (pdf_merge_job_number), - nCores=self.defaultNCores + label='pdf_merge_job_-_%i' % (index) ) ) - ### - # Task "txt_merge_job": Merge .txt files - # Dependencies: ocr_jobs - ### + print('##########################################################') + print('# Merging text files... #') + print('##########################################################') txt_merge_jobs = [] - txt_merge_job_number = 0 - for job in self.jobs: - txt_merge_job_number += 1 + for index, job in enumerate(self.jobs): cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % ( - os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".txt") + os.path.join(job['output_dir'], 'tmp'), + os.path.join( + job['output_dir'], + job['filename'].rsplit('.', 1)[0] + '.txt' + ) ) txt_merge_jobs.append( self.addTask( command=cmd, - dependencies=ocr_jobs, - label="txt_merge_job_-_%i" % (txt_merge_job_number), - nCores=self.defaultNCores + label='txt_merge_job_-_%i' % (index) ) ) + self.waitForTasks() - ### - # Task "cleanup_job": remove temporary files - # Dependencies: hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs - ### + print('##########################################################') + print('# Cleanup... #') + print('##########################################################') cleanup_jobs = [] - cleanup_job_counter = 0 if self.keepIntermediates: - for job in self.jobs: - cleanup_job_counter += 1 + for index, job in enumerate(self.jobs): cmd = 'mv "%s"/*.hocr "%s"' % ( - os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], "tmp", "hocr"), + os.path.join(job['output_dir'], 'tmp'), + os.path.join(job['output_dir'], 'tmp', 'hocr'), ) cmd += ' && mv "%s"/*.pdf "%s"' % ( - os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], "tmp", "pdf"), + os.path.join(job['output_dir'], 'tmp'), + os.path.join(job['output_dir'], 'tmp', 'pdf'), ) cmd += ' && mv "%s"/*.tif "%s"' % ( - os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], "tmp", "tiff"), + os.path.join(job['output_dir'], 'tmp'), + os.path.join(job['output_dir'], 'tmp', 'tiff'), ) cmd += ' && mv "%s"/*.txt "%s"' % ( - os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], "tmp", "txt"), + os.path.join(job['output_dir'], 'tmp'), + os.path.join(job['output_dir'], 'tmp', 'txt'), ) if not self.skipBinarization: cmd += ' && mv "%s"/*.bin.png "%s"' % ( - os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], "tmp", "bin.png"), + os.path.join(job['output_dir'], 'tmp'), + os.path.join(job['output_dir'], 'tmp', 'bin.png'), ) cmd += ' && mv "%s"/*.nrm.png "%s"' % ( - os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], "tmp", "nrm.png"), + os.path.join(job['output_dir'], 'tmp'), + os.path.join(job['output_dir'], 'tmp', 'nrm.png'), + ) + cleanup_jobs.append( + self.addTask( + command=cmd, + label='cleanup_job_-_%i' % (index) ) - cleanup_jobs.append( - self.addTask( - command=cmd, - dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs, - label="cleanup_job_-_%i" % (cleanup_job_counter), - nCores=self.defaultNCores ) - ) else: - for job in self.jobs: - cleanup_job_counter += 1 + for index, job in enumerate(self.jobs): cmd = 'rm -r "%s"' % ( - os.path.join(job["output_dir"], "tmp") + os.path.join(job['output_dir'], 'tmp') ) cleanup_jobs.append( self.addTask( - label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, - dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs, - nCores=self.defaultNCores + label='cleanup_job_-_%i' % (index) ) ) -def analyze_jobs(): - inputDir = "/files_for_ocr" +def analyze_jobs(inputDirectory, outputDirectory): jobs = [] - outputDir = "/files_from_ocr" - for file in os.listdir(inputDir): - if os.path.isdir(os.path.join(inputDir, file)): + for file in os.listdir(inputDirectory): + if os.path.isdir(os.path.join(inputDirectory, file)): jobs += analyze_jobs( - os.path.join(inputDir, file), - os.path.join(outputDir, file) + os.path.join(inputDirectory, file), + os.path.join(outputDirectory, file) ) - elif file.endswith((".pdf", ".tif", ".tiff")): + elif file.endswith(('.pdf', '.tif', '.tiff')): jobs.append( { - "filename": file, - "output_dir": os.path.join(outputDir, file), - "path": os.path.join(inputDir, file) + 'filename': file, + 'output_dir': os.path.join(outputDirectory, file), + 'path': os.path.join(inputDirectory, file) } ) @@ -373,10 +378,10 @@ def main(): wflow = OCRWorkflow(args) - retval = wflow.run(dataDirRoot="/files_from_ocr", nCores=args.nCores) + retval = wflow.run(dataDirRoot=args.outputDirectory, nCores=args.nCores) sys.exit(retval) -if __name__ == "__main__": +if __name__ == '__main__': main()