#!/usr/bin/env python2
# coding=utf-8

"""
ocr

Usage:  For usage instructions run with option --help
Author: Patrick Jentsch
"""

import argparse
import multiprocessing
import os
import sys

from pyflow import WorkflowRunner


def parse_arguments():
    """Parse the command line and return the resulting argparse namespace.

    Required options are -i (input directory), -l (OCR language) and
    -o (output directory). --keep-intermediates and --nCores are optional;
    --nCores defaults to the number of CPUs reported by multiprocessing.
    """
    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, which would put this text
    # into the usage line in place of the program name.
    parser = argparse.ArgumentParser(
        description=(
            "Performs OCR of documents utilizing Tesseract OCR. "
            "Outputs are .pdf and .txt."
        )
    )
    parser.add_argument("-i",
                        dest="inputDir",
                        help="Input directory.",
                        required=True)
    parser.add_argument("-l",
                        dest="lang",
                        help="Language for OCR",
                        required=True)
    parser.add_argument("-o",
                        dest="outputDir",
                        help="Output directory.",
                        required=True)
    parser.add_argument("--keep-intermediates",
                        action="store_true",
                        default=False,
                        dest="keepIntermediates",
                        help="Keep intermediate files.",
                        required=False)
    parser.add_argument("--nCores",
                        default=multiprocessing.cpu_count(),
                        dest="nCores",
                        help="Total number of cores available.",
                        required=False,
                        type=int)
    return parser.parse_args()


def _tmp_path(job, *parts):
    """Return a path below the job's temporary "tmp" directory."""
    return os.path.join(job["output_dir"], "tmp", *parts)


def _result_path(job, extension):
    """Return the final output path for a job: <output_dir>/<input stem><extension>."""
    stem = os.path.basename(job["path"]).rsplit(".", 1)[0]
    return os.path.join(job["output_dir"], stem + extension)


class OCRWorkflow(WorkflowRunner):
    """pyflow workflow that runs the OCR pipeline for a list of PDF jobs.

    Each job is a dict with the keys "path" (input PDF) and "output_dir"
    (directory receiving the results), as produced by analyze_jobs().
    """

    def __init__(self, jobs, keepIntermediates, lang, nCores):
        self.jobs = jobs
        self.keepIntermediates = keepIntermediates
        self.lang = lang
        self.nCores = nCores

    def workflow(self):
        """Register all tasks of the pipeline.

        Stages: mkdir -> split (pdftoppm) -> tesseract -> hocrtotei /
        pdf merge / txt merge -> cleanup (optional) -> zip.
        """
        ###
        # Task "mkdir_job": create output directories
        # Dependencies: None
        ###
        mkdir_jobs = []
        for number, job in enumerate(self.jobs, 1):
            cmd = 'mkdir -p "%s" "%s" "%s"' % (
                job["output_dir"],
                _tmp_path(job, "tesseract"),
                _tmp_path(job, "tiff_files")
            )
            mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % number,
                                           command=cmd))

        ###
        # Task "split_job": split input file into one .tif file per page
        # Dependencies: mkdir_jobs
        ###
        split_jobs = []
        for number, job in enumerate(self.jobs, 1):
            cmd = 'pdftoppm "%s" "%s" -tiff -r 300 -tiffcompression lzw -cropbox' % (
                job["path"],
                _tmp_path(job, "tiff_files", "page")
            )
            # Each split task needs its own label (the previous code reused
            # the final mkdir counter for all of them) and must not start
            # before the directories it writes into have been created.
            split_jobs.append(self.addTask(label="split_job_-_%i" % number,
                                           command=cmd,
                                           dependencies=mkdir_jobs))

        ###
        # Task "tesseract_job": perform OCR
        # Dependencies: split_jobs
        ###
        # The page images are discovered via os.listdir below, so the split
        # tasks have to be finished before the directories are listed;
        # otherwise the listing would be empty.
        self.waitForTasks()
        tesseract_jobs = []
        tesseract_job_number = 0
        for job in self.jobs:
            # Sorted so task numbering is reproducible between runs.
            for page_file in sorted(os.listdir(_tmp_path(job, "tiff_files"))):
                tesseract_job_number += 1
                cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
                    _tmp_path(job, "tiff_files", page_file),
                    _tmp_path(job, "tesseract", page_file.rsplit(".", 1)[0]),
                    self.lang
                )
                tesseract_jobs.append(
                    self.addTask(label="tesseract_job_-_%i" % tesseract_job_number,
                                 command=cmd,
                                 dependencies=split_jobs,
                                 nCores=min(4, self.nCores))
                )

        ###
        # Task "hocr_to_teip5_job": create TEI P5 file from hocr files
        # Dependencies: tesseract_jobs
        ###
        hocr_to_tei_jobs = []
        for number, job in enumerate(self.jobs, 1):
            cmd = 'hocrtotei "%s" "%s"' % (
                _tmp_path(job, "tesseract"),
                _result_path(job, ".xml")
            )
            hocr_to_tei_jobs.append(
                self.addTask(label="hocr_to_tei_job_-_%i" % number,
                             command=cmd,
                             dependencies=tesseract_jobs)
            )

        ###
        # Task "pdf_merge_job": Merge .pdf files
        # Dependencies: tesseract_jobs
        ###
        pdf_merge_jobs = []
        for number, job in enumerate(self.jobs, 1):
            cmd = 'pdftk "%s"/*.pdf cat output "%s"' % (
                _tmp_path(job, "tesseract"),
                _result_path(job, ".pdf")
            )
            pdf_merge_jobs.append(
                self.addTask(label="pdf_merge_job_-_%i" % number,
                             command=cmd,
                             dependencies=tesseract_jobs)
            )

        ###
        # Task "txt_merge_job": Merge .txt files
        # Dependencies: tesseract_jobs
        ###
        txt_merge_jobs = []
        for number, job in enumerate(self.jobs, 1):
            cmd = 'cat "%s"/*.txt > "%s"' % (
                _tmp_path(job, "tesseract"),
                _result_path(job, ".txt")
            )
            txt_merge_jobs.append(
                self.addTask(label="txt_merge_job_-_%i" % number,
                             command=cmd,
                             dependencies=tesseract_jobs)
            )

        merge_jobs = hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs

        ###
        # Task "cleanup_job": remove temporary files
        # Dependencies: hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs
        ###
        cleanup_jobs = []
        if not self.keepIntermediates:
            for number, job in enumerate(self.jobs, 1):
                cmd = 'rm -r "%s"' % _tmp_path(job)
                cleanup_jobs.append(
                    self.addTask(label="cleanup_job_-_%i" % number,
                                 command=cmd,
                                 dependencies=merge_jobs)
                )

        ###
        # Task "zip_job": compress output
        # Dependencies: cleanup_jobs (merge jobs if nothing is cleaned up)
        ###
        # With --keep-intermediates there are no cleanup tasks; depending on
        # an empty list would let zip start before any output was written,
        # so fall back to the merge tasks in that case.
        zip_dependencies = cleanup_jobs or merge_jobs
        zip_jobs = []
        for number, job in enumerate(self.jobs, 1):
            # Paths are quoted like in every other command so that
            # directories containing spaces work.
            cmd = 'zip -jqr "%s" "%s"' % (
                job["output_dir"] + "_-_ocr",
                job["output_dir"]
            )
            zip_jobs.append(self.addTask(label="zip_job_-_%i" % number,
                                         command=cmd,
                                         dependencies=zip_dependencies))


def analyze_jobs(inputDir, outputDir, level=1):
    """Collect one job per .pdf file found below inputDir.

    Directories are searched recursively, but only two levels deep
    (inputDir itself and its immediate subdirectories). The output
    directory of a job mirrors the input layout and ends in the PDF's
    file name without extension.

    Returns a list of dicts with the keys "path" and "output_dir".
    """
    jobs = []

    if level > 2:
        return jobs

    # Sorted so job numbering is reproducible between runs.
    for entry in sorted(os.listdir(inputDir)):
        entry_path = os.path.join(inputDir, entry)
        if os.path.isdir(entry_path):
            jobs += analyze_jobs(
                entry_path,
                os.path.join(outputDir, entry),
                level + 1
            )
        elif entry.endswith(".pdf"):
            jobs.append({
                "path": entry_path,
                "output_dir": os.path.join(outputDir, entry.rsplit(".", 1)[0])
            })

    return jobs


def main():
    """Build the workflow from the command line and exit with its return value."""
    args = parse_arguments()

    wflow = OCRWorkflow(
        analyze_jobs(args.inputDir, args.outputDir),
        args.keepIntermediates,
        args.lang,
        args.nCores
    )

    retval = wflow.run(nCores=args.nCores)

    sys.exit(retval)


if __name__ == "__main__":
    main()