#!/usr/bin/env python2.7
# coding=utf-8

"""
ocr_pyflow.py

Date: 01/04/2018

Usage: For usage instructions run with option --help

Author: Madis Rumming 
"""

__author__ = "Madis Rumming "
__copyright__ = ("Copyright 2018, Data Infrastructure and Digital Humanities, "
                 "SFB 1288, Bielefeld University")
__version__ = "0.6"
__maintainer__ = "Madis Rumming"
__email__ = "mrumming@uni-bielefeld.de"
__status__ = "Development"

import argparse
import os
import sys

# from string import maketrans
# from lxml import html
from pyflow import WorkflowRunner

ocropusnlbin_bin = "/usr/local/bin/ocropus-nlbin"


def parse_arguments():
    """Build the command line parser and parse ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` holding all parsed options.
    """
    # NOTE: the text must be passed as ``description``; the first positional
    # parameter of ArgumentParser is ``prog`` (the program name shown in the
    # usage line), which is not what is intended here.
    parser = argparse.ArgumentParser(
        description="Performs OCR of (historical) documents utilizing ocropy for preprocessing and tesseract "
                    "for final OCR. Available outputs are HOCR, PDF, shrinked PDF, and simple DTAbf "
                    "(TEI P5 compliant). Software requirements: python2.7, pyflow, pdftoppm, ocropus, "
                    "tesseract, ghostscript, imagick, ")

    parser.add_argument("-i", "--input-directory",
                        dest="input_dir",
                        help="Input directory with input images/PDFs. For each PDF an OCR-Run is "
                             "instantiated and output is created. If images reside in the directory, a "
                             "single OCR run is performed. For each subdirectory containing images, a "
                             "particular OCR run is instantiated.",
                        required=True)
    parser.add_argument("-o", "--output-directory",
                        dest="output_dir",
                        help="Directory, where output directories are created if necessary. "
                             "Default: %s" % (os.path.curdir),
                        required=False,
                        default=os.path.curdir)
    parser.add_argument("--image-suffix",
                        dest="suffix",
                        help="Input images suffix. Case-sensitive! tiff!=TIFF!=Tiff Default: tif.",
                        default="tif",
                        required=False)
    parser.add_argument("--skip-pdf-processing",
                        dest="skip_pdf",
                        help="Skip detection of PDFs as input.",
                        default=False,
                        action='store_true',
                        required=False)
    parser.add_argument("--skip-image-processing",
                        dest="skip_images",
                        help="Skip detection of images as input.",
                        default=False,
                        action='store_true',
                        required=False)
    # The underscore spelling is the historical one and is kept for backward
    # compatibility; the hyphenated alias matches all other options.
    parser.add_argument("--skip_binarization", "--skip-binarization",
                        dest="skip_bin",
                        help="Skip binarization pre-processing.",
                        default=False,
                        action="store_true",
                        required=False)
    parser.add_argument("--start-page",
                        dest='startp',
                        default=-1,
                        help="First page to ocr.",
                        required=False,
                        type=int)
    parser.add_argument("--end-page",
                        dest='endp',
                        default=-1,
                        help="Last page to ocr.",
                        required=False,
                        type=int)
    parser.add_argument("-r", "--rotate-pages",
                        dest='rotate',
                        default='norotation',
                        help="Rotate pages from input. Values: clockwise, counterclockwise, "
                             "upsidedown. Default: norotation",
                        required=False,
                        choices=['clockwise', 'counterclockwise', 'upsidedown', 'norotation'])
    parser.add_argument("-s", "--split-pages",
                        dest='split',
                        default=False,
                        help="Split pages in half after possible rotation. Default: Not performed.",
                        required=False,
                        action='store_true')
    parser.add_argument("--ppi-import",
                        dest="ppi_in",
                        help="Down-/Scaling for input images. Default: 300 ppi.",
                        default=300,
                        required=False,
                        type=int)
    parser.add_argument("-l", "--language",
                        dest='lang',
                        help="Language for OCR",
                        required=True,
                        type=str)
    parser.add_argument("-p", "--create-pdf",
                        dest='pdf',
                        help="Create a PDF with text layer in addition to the HOCR output. "
                             "Default: Not performed.",
                        default=False,
                        action='store_true',
                        required=False)
    parser.add_argument("-c", "--compress-pdf",
                        dest='comp',
                        help="Compress the PDF output. Default: Not performed.",
                        default=False,
                        action='store_true',
                        required=False)
    parser.add_argument("--ppi-export",
                        dest="ppi_out",
                        help="Down-/Scaling for output images in PDF. Default: 150 ppi.",
                        default=150,
                        required=False,
                        type=int)
    parser.add_argument("-t", "--temp-directory",
                        dest="temp",
                        help="Location of intermediate files. Defaults to tmp in output directory.",
                        default="",
                        required=False,
                        type=str)
    # store_true with default=False: intermediates are removed unless -k is given.
    parser.add_argument("-k", "--keep-intermediate",
                        dest="intermediate",
                        help="Keep intermediate files. Default: False",
                        default=False,
                        action='store_true')
    parser.add_argument("--cores",
                        dest='nCores',
                        help="Amount of CPUs to use for parallel jobs. Default: %i" % (4),
                        default=4,
                        required=False,
                        type=int)
    parser.add_argument("--is-continued",
                        dest='continued',
                        help="Enables continuing an erroneous or paused workflow. MUST use the "
                             "same dataDirRoot as before.",
                        default=False,
                        required=False,
                        action='store_true')
    parser.add_argument("--is-dry-run",
                        dest='dry_run',
                        help="Check workflow without execution.",
                        default=False,
                        action='store_true',
                        required=False)
    parser.add_argument("--memory",
                        dest='mem',
                        help="Total amount of memory (RAM) available for this workflow. "
                             "Default: %i" % (8192),
                        default=8192,
                        required=False,
                        type=int)

    args = parser.parse_args()
    return (args)


def _is_binarized(filename):
    """Return True for files carrying the ``.bin.`` marker in their name."""
    return ".bin." in filename


def _sorted_listing(directory, keep=None):
    """Return the sorted entries of *directory*, optionally filtered.

    Args:
        directory: Path of the directory to list.
        keep: Optional predicate; only entries for which it returns a true
            value are returned.

    Returns:
        A sorted list of entry names. A missing directory yields an empty
        list instead of raising, because the intermediate directories are
        created by workflow tasks and do not exist if those tasks were not
        actually executed (e.g. presumably in pyflow's dry-run mode).
    """
    if not os.path.isdir(directory):
        return []
    entries = os.listdir(directory)
    if keep is not None:
        entries = [entry for entry in entries if keep(entry)]
    return sorted(entries)


class PureOCR(WorkflowRunner):
    """Placeholder workflow; stores its paths but defines no tasks yet."""

    def __init__(self, input_path, output_path):
        self.input_path = input_path
        self.output_path = output_path

    def workflow(self):
        pass


class OCRWorkflow(WorkflowRunner):
    """pyflow workflow running the complete OCR pipeline.

    For every job in ``pdfImageJobs`` the workflow creates the output
    directories, extracts page images from PDFs (pdftoppm), optionally
    binarizes them (ocropus-nlbin), runs tesseract, optionally assembles a
    PDF (tiff2pdf/convert, gs, pdftk), moves the HOCR files, calls
    ``parse_hocr`` and finally removes the intermediate files unless they
    are to be kept.
    """

    def __init__(self, pdfImageJobs, inputDir, outputDir, suffix, lang, pdf, intermediate, skip_bin, nCores, memMb,
                 dry_run):
        """Store the job description and all workflow options.

        Args:
            pdfImageJobs: Dict with the keys ``"pdf"`` and ``"images"``, each
                mapping a job name to the path of the PDF file / image
                directory (see ``analyze_jobs``).
            inputDir: Input directory; relative output directories are
                resolved against it.
            outputDir: Output directory (absolute, or relative to inputDir).
            suffix: File suffix of input images (without leading dot).
            lang: Language passed to tesseract via ``-l``.
            pdf: If true, PDF output is created as well.
            intermediate: If true, the ``tmp`` directories are kept.
            skip_bin: If true, binarization is skipped.
            nCores: Cores handed to the binarization tasks.
            memMb: Memory (MB) handed to the binarization tasks.
            dry_run: Dry-run flag as given on the command line.
        """
        self.pdfImageJobs = pdfImageJobs
        self.outputDir = outputDir
        self.inputDir = inputDir
        self.suffix = suffix
        self.lang = lang
        self.pdf = pdf
        self.intermediate = intermediate
        self.skip_bin = skip_bin
        self.nCores = nCores
        self.memMb = memMb
        self.dry_run = dry_run

    def _add_pdf_merge_tasks(self, key, gs_dir, tesseract_dir, final_pdf, convert_tasks, ocr_tasks):
        """Add the gs/pdftk tasks that assemble the final PDF of one job.

        Args:
            key: Label suffix identifying the job.
            gs_dir: Directory holding the per-page image PDFs.
            tesseract_dir: Directory holding the per-page tesseract PDFs.
            final_pdf: Path of the resulting PDF.
            convert_tasks: Labels of the tasks creating the per-page image PDFs.
            ocr_tasks: Labels of the tesseract tasks of this job.

        Returns:
            The label of the final pdftk task.
        """
        image_label = "gs_image_%s" % (key)
        layer_label = "gs_layer_%s" % (key)
        pdftk_label = "pdftk_%s" % (key)
        image_pdf = os.path.join(gs_dir, "image.pdf")
        text_pdf = os.path.join(gs_dir, "text.pdf")

        cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -sDEVICE=pdfwrite -sOutputFile=%s %s" % (
            image_pdf, os.path.join(gs_dir, "*.pdf"))
        self.addTask(label=image_label, command=cmd, isForceLocal=True, dependencies=convert_tasks,
                     nCores=1, memMb=4096)

        # The text layer needs this job's tesseract output. It also waits for
        # gs_image because text.pdf is written into the directory gs_image globs.
        cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -dFILTERIMAGE -sDEVICE=pdfwrite -sOutputFile=%s %s" % (
            text_pdf, os.path.join(tesseract_dir, "*.pdf"))
        self.addTask(label=layer_label, command=cmd, isForceLocal=True,
                     dependencies=list(ocr_tasks) + [image_label], nCores=1, memMb=4096)

        cmd = "pdftk %s multistamp %s output %s" % (text_pdf, image_pdf, final_pdf)
        self.addTask(label=pdftk_label, command=cmd, isForceLocal=True, dependencies=layer_label,
                     nCores=1, memMb=4096)
        return pdftk_label

    def _add_finalize_tasks(self, key, job_dir, name, pdftk_jobs):
        """Add the tasks moving HOCR files, generating XML and cleaning up.

        Args:
            key: Label suffix identifying the job.
            job_dir: Output directory of the job.
            name: Job name, used for the ``<name>.xml`` output file.
            pdftk_jobs: Labels the ``mv`` task has to wait for.
        """
        mv_label = "mv_%s" % (key)
        hocr_dir = os.path.join(job_dir, "HOCR")

        cmd = "mv %s %s" % (os.path.join(job_dir, "tmp", "tesseract", "*.hocr"), hocr_dir)
        self.addTask(label=mv_label, command=cmd, isForceLocal=True, dependencies=pdftk_jobs,
                     nCores=1, memMb=32)

        cmd = "parse_hocr %s %s" % (hocr_dir, os.path.join(job_dir, "%s.xml" % (name)))
        self.addTask(label="generate_hocr_%s" % (key), command=cmd, isForceLocal=True,
                     dependencies=mv_label, nCores=1, memMb=250)

        if not self.intermediate:
            cmd = "rm -rf %s" % (os.path.join(job_dir, "tmp"))
            self.addTask(label="cleanup_%s" % (key), command=cmd, isForceLocal=True,
                         dependencies=mv_label, nCores=1, memMb=32)

    def workflow(self):
        """Define all tasks of the OCR pipeline (called by pyflow's ``run``)."""
        pdf_jobs = self.pdfImageJobs["pdf"]
        image_jobs = self.pdfImageJobs["images"]

        # A relative output directory is resolved against the input directory;
        # os.path.join keeps an absolute output directory untouched.
        self.outputDir = os.path.abspath(
            os.path.join(os.path.abspath(self.inputDir), self.outputDir))
        out_root = self.outputDir

        def pdf_path(name, *parts):
            return os.path.join(out_root, "pdf", name, *parts)

        def img_path(name, *parts):
            return os.path.join(out_root, "images", name, *parts)

        # A command of None keeps the label available as dependency anchor
        # even if nothing has to be created.
        cmd = None
        if not os.path.exists(out_root):
            cmd = "mkdir -p %s" % (out_root)
        self.addTask(label="mkdir_outputdir", command=cmd, isForceLocal=True)

        # "mkdir -p" so that re-running into an existing output directory
        # does not fail.
        deps = []
        if pdf_jobs:
            cmd = "mkdir -p %s" % (os.path.join(out_root, "pdf"))
            deps.append("mkdir_pdf")
            self.addTask(label="mkdir_pdf", command=cmd, isForceLocal=True, dependencies="mkdir_outputdir")
        if image_jobs:
            cmd = "mkdir -p %s" % (os.path.join(out_root, "images"))
            deps.append("mkdir_images")
            self.addTask(label="mkdir_images", command=cmd, isForceLocal=True, dependencies="mkdir_outputdir")

        ###
        # Generate directories
        ###
        deps_dirs = []
        for i, name in enumerate(pdf_jobs):
            cmd = "mkdir -p %s %s %s %s %s" % (
                pdf_path(name, "tmp", "tiff"),
                pdf_path(name, "tmp", "tesseract"),
                pdf_path(name, "tmp", "ocropy"),
                pdf_path(name, "tmp", "gs"),
                pdf_path(name, "HOCR"))
            label = "mkdir_pdf_%i" % (i)
            deps_dirs.append(label)
            self.addTask(label=label, command=cmd, isForceLocal=True, dependencies=deps)
        for i, name in enumerate(image_jobs):
            cmd = "mkdir -p %s %s %s %s" % (
                img_path(name, "tmp", "tesseract"),
                img_path(name, "tmp", "ocropy"),
                img_path(name, "tmp", "gs"),
                img_path(name, "HOCR"))
            label = "mkdir_images_%i" % (i)
            deps_dirs.append(label)
            self.addTask(label=label, command=cmd, isForceLocal=True, dependencies=deps)

        ###
        # Extract images from PDF input files
        ###
        deps_ppm = []
        for i, name in enumerate(pdf_jobs):
            cmd = "pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox" % (
                pdf_jobs[name], pdf_path(name, "tmp", "tiff", "interm"))
            label = "pdftoppm_%i" % (i)
            self.addTask(label=label, command=cmd, isForceLocal=True, dependencies=deps_dirs,
                         nCores=1, memMb=1024)
            deps_ppm.append(label)

        ###
        # Perform image binarization if not skipped
        ###
        self.waitForTasks()
        deps_ocropus = []
        if not self.skip_bin:
            # (output directory, input glob) per job; PDF jobs first.
            bin_inputs = [(pdf_path(name, "tmp", "ocropy"), pdf_path(name, "tmp", "tiff", "interm-*"))
                          for name in pdf_jobs]
            bin_inputs += [(img_path(name, "tmp", "ocropy"),
                            os.path.join(image_jobs[name], "*.%s" % (self.suffix)))
                           for name in image_jobs]
            for i, (target_dir, pattern) in enumerate(bin_inputs):
                cmd = "%s -Q %i -o %s %s" % (ocropusnlbin_bin, self.nCores, target_dir, pattern)
                label = "ocropusnlbin_%i" % (i)
                self.addTask(label=label, command=cmd, isForceLocal=True, dependencies=deps_ppm,
                             nCores=self.nCores, memMb=self.memMb)
                deps_ocropus.append(label)

        ###
        # Perform OCR, HOCR as default output, PDF only optional
        ###
        self.waitForTasks()
        tesseract_pdf = "pdf" if self.pdf else ""
        dotted_suffix = ".%s" % (self.suffix)

        # Page listings and tesseract task labels are tracked per job so that
        # the PDF assembly below waits for exactly the OCR tasks of its job.
        pdf_pages = {}
        pdf_ocr_tasks = {}
        for i, name in enumerate(pdf_jobs):
            if self.skip_bin:
                source_dir = pdf_path(name, "tmp", "tiff")
                pages = _sorted_listing(source_dir)
            else:
                source_dir = pdf_path(name, "tmp", "ocropy")
                pages = _sorted_listing(source_dir, _is_binarized)
            pdf_pages[name] = pages
            pdf_ocr_tasks[name] = []
            for page in pages:
                stem = page.split(".")[0]
                # pdftoppm names pages "interm-<number>"; the number becomes
                # the zero-padded output name.
                out_name = "%04i" % int(stem.split("-")[1]) if self.skip_bin else stem
                cmd = "tesseract %s %s -l %s hocr %s" % (
                    os.path.join(source_dir, page),
                    pdf_path(name, "tmp", "tesseract", out_name),
                    self.lang,
                    tesseract_pdf)
                label = "ocr_%i_%s" % (i, stem)
                self.addTask(label=label, command=cmd, isForceLocal=True, dependencies=deps_ocropus,
                             nCores=1, memMb=2048)
                pdf_ocr_tasks[name].append(label)

        image_pages = {}
        image_ocr_tasks = {}
        for name in image_jobs:
            if self.skip_bin:
                source_dir = image_jobs[name]
                pages = _sorted_listing(source_dir, lambda x: x.endswith(dotted_suffix))
            else:
                source_dir = img_path(name, "tmp", "ocropy")
                pages = _sorted_listing(source_dir, _is_binarized)
            image_pages[name] = pages
            image_ocr_tasks[name] = []
            for page in pages:
                stem = page.split(".")[0]
                cmd = "tesseract %s %s -l %s hocr %s" % (
                    os.path.join(source_dir, page),
                    img_path(name, "tmp", "tesseract", stem),
                    self.lang,
                    tesseract_pdf)
                label = "ocr_%s_%s" % (name, stem)
                self.addTask(label=label, command=cmd, isForceLocal=True, dependencies=deps_ocropus,
                             nCores=1, memMb=2048)
                image_ocr_tasks[name].append(label)

        ###
        # Creation of PDF output
        ###
        pdftk_jobs = []
        if self.pdf:
            for name in image_jobs:
                convert_tasks = []
                for page in image_pages[name]:
                    stem = page.split(".")[0]
                    target = img_path(name, "tmp", "gs", stem)
                    if self.skip_bin:
                        cmd = "tiff2pdf -o %s.pdf %s" % (target, os.path.join(image_jobs[name], page))
                        page_deps = deps_dirs
                    else:
                        cmd = "convert %s %s.pdf" % (img_path(name, "tmp", "ocropy", page), target)
                        page_deps = deps_ocropus
                    label = "tiffpdf_%s_%s" % (name, stem)
                    self.addTask(label=label, command=cmd, isForceLocal=True, dependencies=page_deps,
                                 nCores=1, memMb=256)
                    convert_tasks.append(label)
                pdftk_jobs.append(self._add_pdf_merge_tasks(
                    name,
                    img_path(name, "tmp", "gs"),
                    img_path(name, "tmp", "tesseract"),
                    img_path(name, "%s.pdf" % (name)),
                    convert_tasks,
                    image_ocr_tasks[name]))

            for i, name in enumerate(pdf_jobs, 1):
                convert_tasks = []
                for page in pdf_pages[name]:
                    stem = page.split(".")[0]
                    target = pdf_path(name, "tmp", "gs", stem)
                    if self.skip_bin:
                        cmd = "tiff2pdf -o %s.pdf %s" % (target, pdf_path(name, "tmp", "tiff", page))
                        page_deps = deps_dirs
                    else:
                        cmd = "convert %s %s.pdf" % (pdf_path(name, "tmp", "ocropy", page), target)
                        page_deps = deps_ocropus
                    label = "tiffpdf_pdf%i_%s" % (i, stem)
                    self.addTask(label=label, command=cmd, isForceLocal=True, dependencies=page_deps,
                                 nCores=1, memMb=256)
                    convert_tasks.append(label)
                pdftk_jobs.append(self._add_pdf_merge_tasks(
                    "pdf%i" % (i),
                    pdf_path(name, "tmp", "gs"),
                    pdf_path(name, "tmp", "tesseract"),
                    pdf_path(name, "%s.pdf" % (name)),
                    convert_tasks,
                    pdf_ocr_tasks[name]))
        else:
            # Without PDF output a command-less task collects all OCR tasks,
            # so the merge step below has a single label to depend on.
            all_ocr_tasks = []
            for name in pdf_jobs:
                all_ocr_tasks.extend(pdf_ocr_tasks[name])
            for name in image_jobs:
                all_ocr_tasks.extend(image_ocr_tasks[name])
            self.addTask(label="pdftk_", command=None, dependencies=all_ocr_tasks)
            pdftk_jobs.append("pdftk_")

        ###
        # Merging and cleanup
        ###
        self.waitForTasks()
        for name in image_jobs:
            self._add_finalize_tasks(name, img_path(name), name, pdftk_jobs)
        for i, name in enumerate(pdf_jobs, 1):
            self._add_finalize_tasks("%i" % (i), pdf_path(name), name, pdftk_jobs)


def strip_files_of_spaces(current_path, old_path):
    """Replace spaces by underscores in all entry names of a directory.

    Strips inputfiles of spaces to avoid bugs and also for better usage and
    readability.

    Args:
        current_path: Directory whose entries are renamed.
        old_path: Directory that becomes the working directory afterwards.
    """
    os.chdir(current_path)
    try:
        for f in os.listdir("."):
            r = f.replace(" ", "_")
            if r == f:
                continue
            if os.path.exists(r):
                # os.rename may silently replace an existing file; never
                # destroy input data just to normalize a name.
                sys.stderr.write("WARNING: not renaming '%s', '%s' already exists in %s\n"
                                 % (f, r, current_path))
                continue
            os.rename(f, r)
    finally:
        # Restore the working directory even if listing/renaming failed.
        os.chdir(old_path)


def analyze_jobs(input_dir, image_suffix="tiff", skip_pdf=False, skip_images=False):
    """Detect the OCR jobs contained in *input_dir* and its subdirectories.

    Spaces in the entry names of every subdirectory are replaced by
    underscores as a side effect (see ``strip_files_of_spaces``).

    Args:
        input_dir: Directory to scan (one level of subdirectories included).
        image_suffix: Suffix of input images, without leading dot.
        skip_pdf: If true, PDF files are ignored.
        skip_images: If true, images are ignored.

    Returns:
        ``{"pdf": {name: path}, "images": {name: path}}``. PDF names are the
        file names without extension; image job names are the subdirectory
        names, or ``"input_root"`` for images located directly in
        *input_dir*.
    """
    pdfs = {}
    images = {}
    root = os.path.abspath(input_dir)
    dotted_suffix = ".%s" % (image_suffix)

    files = os.listdir(root)
    dirs = [f for f in files if os.path.isdir(os.path.join(root, f))]

    # Scan root input dir
    if not skip_pdf:
        for f in files:
            if f.endswith(".pdf"):
                pdfs[f.rsplit(".", 1)[0]] = os.path.join(root, f)
    if not skip_images:
        if any(f.endswith(dotted_suffix) for f in files):
            images["input_root"] = root

    # Scan subdirectories for inputs if available
    for d in dirs:
        strip_files_of_spaces(os.path.join(input_dir, d), os.getcwd())
        files = os.listdir(os.path.join(input_dir, d))
        if not skip_pdf:
            for f in files:
                if f.endswith(".pdf"):
                    pdfs[f.rsplit(".", 1)[0]] = os.path.join(root, d, f)
        if not skip_images:
            if any(f.endswith(dotted_suffix) for f in files):
                images[d] = os.path.join(root, d)

    return {"pdf": pdfs, "images": images}


def main():
    """Parse the command line, detect the jobs and run the OCR workflow."""
    args = parse_arguments()

    # DICT{DICT}
    # 'pdf' :: {name: path}
    # 'images' :: {name: path}
    current_path = os.getcwd()
    strip_files_of_spaces(args.input_dir, current_path)
    jobs = analyze_jobs(args.input_dir, image_suffix=args.suffix, skip_pdf=args.skip_pdf,
                        skip_images=args.skip_images)

    wflow = OCRWorkflow(jobs, args.input_dir, args.output_dir, args.suffix, args.lang, args.pdf,
                        args.intermediate, args.skip_bin, args.nCores, args.mem, args.dry_run)

    retval = wflow.run(mode="local", nCores=args.nCores, memMb=args.mem, isDryRun=args.dry_run)
    sys.exit(retval)


if __name__ == "__main__":
    main()