#!/usr/bin/env python2.7
# coding=utf-8

"""
ocr_pyflow.py

Usage: For usage instructions run with option --help
Author: Patrick Jentsch
"""

import argparse
import multiprocessing
import os
import sys
from pyflow import WorkflowRunner


'''
TODO:
' Implement --end-page: Last page to ocr
' Implement --memMb: Total amount of memory (RAM) available for this workflow. Default: 2048 * nCores
' Implement --rotate: Rotate pages from input (90, 180, 270)
' Implement --split-pages: Split pages in half after possible rotation
' Implement --start-page: First page to ocr
'''


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Performs OCR of (historical) documents utilizing OCRopus "
                    "for preprocessing and Tesseract OCR for OCR. Available "
                    "outputs are HOCR, PDF, shrunken PDF, and simple DTAbf "
                    "(TEI P5 compliant). Software requirements: imagemagick, "
                    "ocropus, pdftk, pdftoppm, poppler-utils, pyflow, "
                    "python2.7, tesseract"
    )
    parser.add_argument("-i",
                        dest="inputDir",
                        help="Input directory.",
                        required=True)
    parser.add_argument("-l",
                        dest="lang",
                        help="Language for OCR.",
                        required=True)
    parser.add_argument("-o",
                        dest="outputDir",
                        help="Output directory.",
                        required=True)
    parser.add_argument("--keep-intermediates",
                        action="store_true",
                        default=False,
                        dest="keepIntermediates",
                        help="Keep intermediate files.",
                        required=False)
    parser.add_argument("--nCores",
                        default=multiprocessing.cpu_count(),
                        dest="nCores",
                        help="Total number of cores available.",
                        required=False,
                        type=int)
    return parser.parse_args()


class OCRWorkflow(WorkflowRunner):
    def __init__(self, jobs, keepIntermediates, lang, nCores):
        self.jobs = jobs
        self.keepIntermediates = keepIntermediates
        self.lang = lang
        self.nCores = nCores

    def workflow(self):
        ###
        # Task "mkdir_job": create output directories
        # Dependencies: None
        ###
        mkdir_jobs = []
        mkdir_job_number = 0
        for job in self.jobs:
            mkdir_job_number += 1
            cmd = 'mkdir -p "%s" "%s" "%s" "%s"' % (
                os.path.join(job["output_dir"], "hocr_files"),
                os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
                os.path.join(job["output_dir"], "tmp", "tesseract"),
                os.path.join(job["output_dir"], "tmp", "tiff_files")
            )
            mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))

        ###
        # Task "split_job": split input file into one tiff file per page
        # Dependencies: mkdir_jobs
        ###
        split_jobs = []
        split_job_number = 0
        for job in self.jobs:
            split_job_number += 1
            if job["basename"].endswith(".tif") or job["basename"].endswith(".tiff"):
                # TODO: Make the following command work
                '''
                cmd = 'convert "%s" "%s"' % (
                    job["path"],
                    os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-%sd.tif" % ("%"))
                )
                '''
                # WORKAROUND: convert the multipage TIFF to a PDF first, split
                # that PDF into one TIFF per page, then remove the temporary PDF.
                cmd = 'tiff2pdf -o "%s" "%s" && pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox && rm "%s"' % (
                    os.path.join(job["output_dir"], "tmp", "tiff_files", job["basename"].rsplit(".", 1)[0] + ".pdf"),
                    job["path"],
                    os.path.join(job["output_dir"], "tmp", "tiff_files", job["basename"].rsplit(".", 1)[0] + ".pdf"),
                    os.path.join(job["output_dir"], "tmp", "tiff_files", job["basename"].rsplit(".", 1)[0]),
                    os.path.join(job["output_dir"], "tmp", "tiff_files", job["basename"].rsplit(".", 1)[0] + ".pdf")
                )
            else:
                cmd = 'pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox' % (
                    job["path"],
                    os.path.join(job["output_dir"], "tmp", "tiff_files", "page")
                )
            split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs))

        ###
        # Task "ocropus_nlbin_job": binarize tiff files from previous split
        # Dependencies: split_jobs
        ###
        ocropusnlbin_jobs = []
        ocropusnlbin_job_number = 0
        for job in self.jobs:
            ocropusnlbin_job_number += 1
            cmd = 'ocropus-nlbin -o "%s" "%s"/*' % (
                os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"),
                os.path.join(job["output_dir"], "tmp", "tiff_files")
            )
            ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs))

        ###
        # Task "tesseract_job": perform OCR on binarized images
        # Dependencies: ocropusnlbin_jobs
        ###
        # The binarized images must exist on disk before they can be listed
        # below, so wait for the ocropus-nlbin tasks to finish first.
        self.waitForTasks()
        tesseract_jobs = []
        tesseract_job_number = 0
        for job in self.jobs:
            # This list is empty if you don't wait for ocropusnlbin_jobs to complete
            for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))):
                tesseract_job_number += 1
                cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
                    os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file),
                    os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]),
                    self.lang
                )
                tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=min(4, self.nCores)))

        ###
        # Task "hocr_to_teip5_job": create TEI P5 file from hocr files
        # Dependencies: tesseract_jobs
        ###
        hocr_to_teip5_jobs = []
        hocr_to_teip5_job_number = 0
        for job in self.jobs:
            hocr_to_teip5_job_number += 1
            cmd = 'parse_hocr "%s" "%s"' % (
                os.path.join(job["output_dir"], "tmp", "tesseract"),
                os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".xml")
            )
            hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=tesseract_jobs))

        ###
        # Task "move_hocr_job": move hocr files from /tmp/tesseract to /hocr_files
        # Dependencies: hocr_to_teip5_jobs
        ###
        move_hocr_jobs = []
        move_hocr_job_number = 0
        for job in self.jobs:
            move_hocr_job_number += 1
            cmd = 'mv "%s"/*.hocr "%s"' % (
                os.path.join(job["output_dir"], "tmp", "tesseract"),
                os.path.join(job["output_dir"], "hocr_files")
            )
            move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=hocr_to_teip5_jobs))

        ###
        # Task "pdf_merge_job": merge per-page PDF files into one PDF
        # Dependencies: tesseract_jobs
        ###
        pdf_merge_jobs = []
        pdf_merge_job_number = 0
        for job in self.jobs:
            pdf_merge_job_number += 1
            cmd = 'pdftk "%s"/*.pdf cat output "%s"' % (
                os.path.join(job["output_dir"], "tmp", "tesseract"),
                os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".pdf")
            )
            pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs))

        ###
        # Task "pdf_to_txt_job": extract plain text from the merged PDF
        # Dependencies: pdf_merge_jobs
        ###
        pdf_to_txt_jobs = []
        pdf_to_txt_job_number = 0
        for job in self.jobs:
            pdf_to_txt_job_number += 1
            cmd = 'pdftotext -raw "%s"' % (
                os.path.join(job["output_dir"], job["basename"].rsplit(".", 1)[0] + ".pdf")
            )
            pdf_to_txt_jobs.append(self.addTask(label="pdf_to_txt_job_-_%i" % (pdf_to_txt_job_number), command=cmd, dependencies=pdf_merge_jobs))

        ###
        # Task "cleanup_job": remove temporary files
        # Dependencies: hocr_to_teip5_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs
        ###
        cleanup_jobs = []
        cleanup_job_counter = 0
        if not self.keepIntermediates:
            for job in self.jobs:
                cleanup_job_counter += 1
                cmd = 'rm -r "%s"' % (
                    os.path.join(job["output_dir"], "tmp")
                )
                cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_teip5_jobs + move_hocr_jobs + pdf_merge_jobs + pdf_to_txt_jobs))


def analyze_jobs(inputDir, outputDir, level=1):
    jobs = []
    if level > 2:
        return jobs
    for file in os.listdir(inputDir):
        if os.path.isdir(os.path.join(inputDir, file)):
            jobs += analyze_jobs(
                os.path.join(inputDir, file),
                os.path.join(outputDir, file),
                level + 1
            )
        elif file.endswith(".pdf") or file.endswith(".tif") or file.endswith(".tiff"):
            jobs.append({"basename": os.path.basename(file),
                         "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0]),
                         "path": os.path.join(inputDir, file)})
    return jobs


def main():
    args = parse_arguments()
    wflow = OCRWorkflow(
        analyze_jobs(args.inputDir, args.outputDir),
        args.keepIntermediates,
        args.lang,
        args.nCores
    )
    retval = wflow.run(nCores=args.nCores)
    sys.exit(retval)


if __name__ == "__main__":
    main()
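

# ---------------------------------------------------------------------------
# Example invocation (illustrative sketch; the paths and the language code
# below are placeholders, not values shipped with this script):
#
#   python2.7 ocr_pyflow.py \
#       -i /data/scans \
#       -o /data/ocr_output \
#       -l deu \
#       --nCores 8 \
#       --keep-intermediates
#
# For each PDF/TIFF found (up to one directory level below the input
# directory) a subdirectory is created in the output directory containing the
# HOCR files, the merged PDF, the extracted plain text and the TEI P5 XML.
# ---------------------------------------------------------------------------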