From aa48ea6ed2f5692245938d3628a0e9c8ee33915d Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 9 Oct 2018 14:43:23 +0200 Subject: [PATCH] Initial commit --- Dockerfile | 67 +++++++ ocr_pyflow | 562 +++++++++++++++++++++++++++++++++++++++++++++++++++++ parse_hocr | 43 ++++ 3 files changed, 672 insertions(+) create mode 100644 Dockerfile create mode 100755 ocr_pyflow create mode 100755 parse_hocr diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5e048f1 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,67 @@ +FROM ubuntu:18.04 + +MAINTAINER Patrick Jentsch + +ENV DEBIAN_FRONTEND=noninteractive +ENV LANG=en_US.UTF-8 +ENV PYFLOW_VERSION 1.1.20 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + gnupg2 + +# Add PPA for pdftk +RUN echo "deb http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list && \ + echo "deb-src http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list && \ + apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BCA68C33DA36783B03E981C820D0BB61B700CE29 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ghostscript \ + git \ + imagemagick \ + libtiff-tools \ + locales \ + pdftk \ + poppler-utils \ + python2.7 \ + python3.6 \ + tesseract-ocr \ + wget + +# Configure locales +RUN locale-gen "$LANG" + +WORKDIR /root + +# Install pyFlow +RUN wget -nv http://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \ + tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \ + cd pyflow-"$PYFLOW_VERSION" && \ + python2.7 setup.py build install && \ + cd /root && \ + rm pyflow-"$PYFLOW_VERSION".tar.gz + +# Install Tesseract OCR Data Files +RUN wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ + wget -nv http://github.com/tesseract-ocr/tessdata/raw/3.04.00/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ + wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ + wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ + wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ + wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata + +# Install OCRopus +RUN git clone http://github.com/tmbdev/ocropy && \ + cd ocropy && \ + apt-get install -y --no-install-recommends $(cat PACKAGES) && \ + wget -nv http://www.tmbdev.net/en-default.pyrnn.gz && \ + mv en-default.pyrnn.gz models/ && \ + python2.7 setup.py install && \ + cd /root + +COPY ocr_pyflow /usr/local/bin +COPY parse_hocr /usr/local/bin + +VOLUME /root/files_for_ocr + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/ocr_pyflow b/ocr_pyflow new file mode 100755 index 0000000..d8f0300 --- /dev/null +++ b/ocr_pyflow @@ -0,0 +1,562 @@ +#!/usr/bin/env python2.7 +# coding=utf-8 + +""" +ocr_pyflow.py + +Date: 01/04/2018 +Usage: For usage instructions run with option --help +Author: Madis Rumming +""" + +__author__ = "Madis Rumming " +__copyright__ = "Copyright 2018, Data Infrastructure and Digital Humanities,\ +SFB 1288, Bielefeld University" + +__version__ = "0.6" +__maintainer__ = "Madis Rumming" +__email__ = "mrumming@uni-bielefeld.de" +__status__ = "Development" + +import argparse +import os +import sys +# from string import maketrans +# from lxml import html +from pyflow import WorkflowRunner + +ocropusnlbin_bin = "/usr/local/bin/ocropus-nlbin" + +def parse_arguments(): + parser = argparse.ArgumentParser( + "Performs OCR of (historical) documents utilizing ocropy for preprocessing and tesseract \ + for final OCR. Available outputs are HOCR, PDF, shrinked PDF, and simple DTAbf \ + (TEI P5 compliant). Software requirements: python2.7, pyflow, pdftoppm, ocropus, \ + tesseract, ghostscript, imagick, ") + + parser.add_argument("-i", "--input-directory", + dest="input_dir", + help="Input directory with input images/PDFs. For each PDF an OCR-Run is \ + instantiated and output is created. If images reside in the directory, a \ + single OCR run is performed. For each subdirectory containing images, a \ + particular OCR run is instantiated.", + required=True) + parser.add_argument("-o", "--output-directory", + dest="output_dir", + help="Directory, where output directories are created if necessary. \ + Default: %s" % (os.path.curdir), + required=False, + default=os.path.curdir) + parser.add_argument("--image-suffix", + dest="suffix", + help="Input images suffix. Case-sensitive! tiff!=TIFF!=Tiff Default: tif.", + default="tif", + required=False) + parser.add_argument("--skip-pdf-processing", + dest="skip_pdf", + help="Skip detection of PDFs as input.", + default=False, + action='store_true', + required=False) + parser.add_argument("--skip-image-processing", + dest="skip_images", + help="Skip detection of images as input.", + default=False, + action='store_true', + required=False) + parser.add_argument("--skip_binarization", + dest="skip_bin", + help="skip binarizaiton pre-processing.", + default=False, + action="store_true", + required=False) + parser.add_argument("--start-page", + dest='startp', + default=-1, help="First page to ocr.", + required=False, type=int) + parser.add_argument("--end-page", + dest='endp', + default=-1, + help="Last page to ocr.", + required=False, type=int) + parser.add_argument("-r", "--rotate-pages", + dest='rotate', + default='norotation', + help="Rotate pages from input. Values: clockwise, counterclockwise, \ + upsidedown. Default: norotation", + required=False, + choices=['clockwise', 'counterclockwise', 'upsidedown', 'norotation']) + parser.add_argument("-s", "--split-pages", + dest='split', + default=False, + help="Split pages in half after possible rotation. Default: Not performed.", + required=False, + action='store_true') + parser.add_argument("--ppi-import", + dest="ppi_in", help="Down-/Scaling for input images. Default: 300 ppi.", + default=300, + required=False, + type=int) + parser.add_argument("-l", "--language", + dest='lang', + help="Language for OCR", + required=True, + type=str) + parser.add_argument("-p", "--create-pdf", + dest='pdf', + default=False, + action='store_true', + required=False) + parser.add_argument("-c", "--compress-pdf", + dest='comp', + default=False, + action='store_true', + required=False) + parser.add_argument("--ppi-export", + dest="ppi_out", + help="Down-/Scaling for output images in PDF. Default: 150 ppi.", + default=150, + required=False, + type=int) + parser.add_argument("-t", "--temp-directory", + dest="temp", + help="Location of intermediate files. Defaults to tmp in output directory.", + default="", + required=False, + type=str) + parser.add_argument("-k", "--keep-intermediate", + dest="intermediate", + help="Keep intermediate files. Default: True", + default=False, + action='store_true') + parser.add_argument("--cores", + dest='nCores', + help="Amount of CPUs to use for parallel jobs. Default: %i" % (4), + default=4, + required=False, + type=int) + parser.add_argument("--is-continued", dest='continued', + help="Enables continuing an erroneous or paused workflow. MUST use the \ + same dataDirRoot as before.", + default=False, + required=False, + action='store_true') + parser.add_argument("--is-dry-run", + dest='dry_run', + help="Check workflow without execution.", + default=False, + action='store_true', + required=False) + parser.add_argument("--memory", + dest='mem', + help="Total amount of memory (RAM) available for this workflow. \ + Default: %i" % (8192), + default=8192, + required=False, + type=int) + args = parser.parse_args() + return (args) + + +class PureOCR(WorkflowRunner): + def __init__(self, input_path, output_path): + self.input_path = input_path + self.output_path = output_path + + def workflow(self): + pass + + +class OCRWorkflow(WorkflowRunner): + + def __init__(self, pdfImageJobs, inputDir, outputDir, suffix, lang, pdf, intermediate, skip_bin, + nCores, memMb, dry_run): + self.pdfImageJobs = pdfImageJobs + self.outputDir = outputDir + self.inputDir = inputDir + self.suffix = suffix + self.lang = lang + self.pdf = pdf + self.intermediate = intermediate + self.skip_bin = skip_bin + self.nCores = nCores + self.memMb = memMb + self.dry_run = dry_run + + def workflow(self): + + cmd = None + print(self.outputDir) + if os.path.isabs(self.outputDir): + if not os.path.exists(self.outputDir): + cmd = "mkdir -p %s" % (self.outputDir) + else: + self.outputDir = os.path.join(os.path.abspath(self.inputDir), self.outputDir) + if not os.path.exists(self.outputDir): + cmd = "mkdir -p %s" % (self.outputDir) + + self.addTask(label="mkdir_outputdir", command=cmd, isForceLocal=True) + + deps = [] + if self.pdfImageJobs["pdf"]: + cmd = "mkdir %s" % (os.path.join(self.outputDir, "pdf")) + deps.append("mkdir_pdf") + self.addTask(label="mkdir_pdf", command=cmd, isForceLocal=True, + dependencies="mkdir_outputdir") + if self.pdfImageJobs["images"]: + cmd = "mkdir %s" % (os.path.join(self.outputDir, "images")) + deps.append("mkdir_images") + self.addTask(label="mkdir_images", command=cmd, isForceLocal=True, + dependencies="mkdir_outputdir") + + ### + # Generate directories + ### + deps_dirs = [] + if self.pdfImageJobs["pdf"]: + i = 0 + for k in self.pdfImageJobs["pdf"]: + cmd = "mkdir -p %s %s %s %s %s" % ( + os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "tiff"), + os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "tesseract"), + os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "ocropy"), + os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "gs"), + os.path.join(os.path.abspath(self.outputDir), "pdf", k, "HOCR")) + + deps_dirs.append("mkdir_pdf_%i" % (i)) + self.addTask(label="mkdir_pdf_%i" % (i), command=cmd, isForceLocal=True, + dependencies=deps) + i += 1 + if self.pdfImageJobs["images"]: + i = 0 + for k in self.pdfImageJobs["images"]: + cmd = "mkdir -p %s %s %s %s" % ( + os.path.join(os.path.abspath(self.outputDir), "images", k, "tmp", "tesseract"), + os.path.join(os.path.abspath(self.outputDir), "images", k, "tmp", "ocropy"), + os.path.join(os.path.abspath(self.outputDir), "images", k, "tmp", "gs"), + os.path.join(os.path.abspath(self.outputDir), "images", k, "HOCR")) + deps_dirs.append("mkdir_images_%i" % (i)) + self.addTask(label="mkdir_images_%i" % (i), command=cmd, isForceLocal=True, + dependencies=deps) + i += 1 + + ### + # Extract images from PDF input files + ### + pdf_counter = 0 + deps_ppm = [] + for pdf in self.pdfImageJobs["pdf"]: + cmd = "pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox" % (self.pdfImageJobs["pdf"][pdf], + os.path.join( + os.path.abspath(self.outputDir), + "pdf", pdf, "tmp", + "tiff", "interm")) + self.addTask(label="pdftoppm_%i" % (pdf_counter), command=cmd, isForceLocal=True, + dependencies=deps_dirs, nCores=1, memMb=1024) + deps_ppm.append("pdftoppm_%i" % (pdf_counter)) + pdf_counter += 1 + + ### + # Perform image binarization if not skipped + ### + self.waitForTasks() + ocropus_counter = 0 + deps_ocropus = [] + if not self.skip_bin: + for pdf in self.pdfImageJobs["pdf"]: + cmd = "%s -Q %i -o %s %s" % (ocropusnlbin_bin, self.nCores, + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "ocropy"), + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff", + "interm-*")) + self.addTask(label="ocropusnlbin_%i" % (ocropus_counter), command=cmd, isForceLocal=True, + dependencies=deps_ppm, + nCores=self.nCores, memMb=self.memMb) + deps_ocropus.append("ocropusnlbin_%i" % (ocropus_counter)) + ocropus_counter += 1 + for img in self.pdfImageJobs["images"]: + cmd = "%s -Q %i -o %s %s" % (ocropusnlbin_bin, self.nCores, + os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", + "ocropy"), + os.path.join(self.pdfImageJobs["images"][img], "*.%s" % (self.suffix))) + self.addTask(label="ocropusnlbin_%i" % (ocropus_counter), command=cmd, isForceLocal=True, + dependencies=deps_ppm, + nCores=self.nCores, memMb=self.memMb) + deps_ocropus.append("ocropusnlbin_%i" % (ocropus_counter)) + ocropus_counter += 1 + + ### + # Perform OCR, HOCR as default output, PDF only optional + ### + self.waitForTasks() + pdf_counter = 0 + for pdf in self.pdfImageJobs["pdf"]: + if self.skip_bin: + inp_images = sorted(os.listdir( + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff"))) + else: + inp_images = sorted(filter(lambda x: ".bin." in x, os.listdir( + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", + "ocropy")))) + + deps_ocr_pdf = [] + + for image in inp_images: + cmd = "tesseract %s %s -l %s hocr %s" % ( + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", + "tiff" if self.skip_bin else "ocropy", image), + os.path.join(self.outputDir, "pdf", pdf, "tmp", "tesseract", + "%04i" % int(image.split(".")[0].split("-")[1]) if self.skip_bin else image.split(".")[ + 0]), + self.lang, "pdf" if self.pdf else "") + + self.addTask(label="ocr_%i_%s" % (pdf_counter, image.split(".")[0]), command=cmd, isForceLocal=True, + dependencies=deps_ocropus, nCores=1, memMb=2048) + deps_ocr_pdf.append("ocr_%i_%s" % (pdf_counter, image.split(".")[0])) + + pdf_counter += 1 + + for img in self.pdfImageJobs["images"]: + if self.skip_bin: + images = sorted( + list(filter(lambda x: x.endswith(self.suffix), os.listdir(self.pdfImageJobs["images"][img])))) + else: + images = sorted( + list(filter(lambda x: ".bin." in x, os.listdir( + os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy"))))) + + deps_ocr = [] + for image in images: + if self.skip_bin: + cmd = "tesseract %s %s -l %s hocr %s" % ( + os.path.join(self.pdfImageJobs["images"][img], image), + os.path.join(self.outputDir, "images", img, "tmp", "tesseract", image.split(".")[0]), + self.lang, "pdf" if self.pdf else "") + else: + cmd = "tesseract %s %s -l %s hocr %s" % ( + os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy", image), + os.path.join(self.outputDir, "images", img, "tmp", "tesseract", image.split(".")[0]), + self.lang, "pdf" if self.pdf else "") + print(cmd) + self.addTask(label="ocr_%s_%s" % (img, image.split(".")[0]), command=cmd, isForceLocal=True, + dependencies=deps_ocropus, nCores=1, memMb=2048) + + deps_ocr.append("ocr_%s_%s" % (img, image.split(".")[0])) + + pdftk_jobs = [] + ### + # Creation of PDF output + ### + if self.pdf: + pdf_counter_conv = 0 + for img in self.pdfImageJobs["images"]: + if self.skip_bin: + images = sorted( + list(filter(lambda x: x.endswith(self.suffix), os.listdir(self.pdfImageJobs["images"][img])))) + else: + images = sorted( + list(filter(lambda x: ".bin." in x, os.listdir( + os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy"))))) + + deps_tiffpdf = [] + + for image in images: + if self.skip_bin: + cmd = "tiff2pdf -o %s.pdf %s" % ( + os.path.join(self.outputDir, "images", img, "tmp", "gs", image.split(".")[0]), + os.path.join(self.pdfImageJobs["images"][img], image)) + self.addTask(label="tiffpdf_%s_%s" % (img, image.split(".")[0]), command=cmd, isForceLocal=True, + dependencies=deps_dirs, nCores=1, memMb=256) + deps_tiffpdf.append("tiffpdf_%s_%s" % (img, image.split(".")[0])) + else: + cmd = "convert %s %s.pdf" % ( + os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy", image), + os.path.join(self.outputDir, "images", img, "tmp", "gs", image.split(".")[0])) + self.addTask(label="tiffpdf_%s_%s" % (img, image.split(".")[0]), command=cmd, isForceLocal=True, + dependencies=deps_ocropus, nCores=1, memMb=256) + deps_tiffpdf.append("tiffpdf_%s_%s" % (img, image.split(".")[0])) + + cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( + os.path.join(self.outputDir, "images", img, "tmp", "gs", "image.pdf"), + os.path.join(self.outputDir, "images", img, "tmp", "gs", "*.pdf")) + self.addTask(label="gs_image_%s" % (img), command=cmd, isForceLocal=True, + dependencies=deps_tiffpdf, nCores=1, memMb=4096) + + deps_ocr.append("gs_image_%s" % (img)) + + cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -dFILTERIMAGE -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( + os.path.join(self.outputDir, "images", img, "tmp", "gs", "text.pdf"), + os.path.join(self.outputDir, "images", img, "tmp", "tesseract", "*.pdf")) + + self.addTask(label="gs_layer_%s" % (img), command=cmd, isForceLocal=True, + dependencies=deps_ocr, nCores=1, memMb=4096) + + cmd = "pdftk %s multistamp %s output %s" % ( + os.path.join(self.outputDir, "images", img, "tmp", "gs", "text.pdf"), + os.path.join(self.outputDir, "images", img, "tmp", "gs", "image.pdf"), + os.path.join(self.outputDir, "images", img, "%s.pdf" % (img))) + + self.addTask(label="pdftk_%s" % (img), command=cmd, isForceLocal=True, + dependencies="gs_layer_%s" % (img), nCores=1, memMb=4096) + pdftk_jobs.append("pdftk_%s" % (img)) + + for pdf in self.pdfImageJobs["pdf"]: + + pdf_counter_conv += 1 + if self.skip_bin: + inp_images = sorted(os.listdir( + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff"))) + else: + inp_images = sorted(filter(lambda x: ".bin." in x, os.listdir( + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", + "ocropy")))) + + deps_pdf_tiffpdf = [] + + for image in inp_images: + if self.skip_bin: + cmd = "tiff2pdf -o %s.pdf %s" % ( + os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", image.split(".")[0]), + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff", image)) + self.addTask(label="tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0]), command=cmd, + isForceLocal=True, + dependencies=deps_dirs, nCores=1, memMb=256) + deps_pdf_tiffpdf.append("tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0])) + else: + cmd = "convert %s %s.pdf" % ( + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "ocropy", image), + os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", image.split(".")[0])) + self.addTask(label="tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0]), command=cmd, + isForceLocal=True, + dependencies=deps_ocropus, nCores=1, memMb=256) + deps_pdf_tiffpdf.append("tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0])) + + cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( + os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "image.pdf"), + os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "*.pdf")) + self.addTask(label="gs_image_pdf%i" % (pdf_counter_conv), command=cmd, isForceLocal=True, + dependencies=deps_pdf_tiffpdf, nCores=1, memMb=4096) + + deps_ocr.append("gs_image_pdf%i" % (pdf_counter_conv)) + + cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -dFILTERIMAGE -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( + os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "text.pdf"), + os.path.join(self.outputDir, "pdf", pdf, "tmp", "tesseract", "*.pdf")) + + self.addTask(label="gs_layer_pdf%i" % (pdf_counter_conv), command=cmd, isForceLocal=True, + dependencies=deps_ocr, nCores=1, memMb=4096) + + cmd = "pdftk %s multistamp %s output %s" % ( + os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "text.pdf"), + os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "image.pdf"), + os.path.join(self.outputDir, "pdf", pdf, "%s.pdf" % (pdf))) + + self.addTask(label="pdftk_pdf%i" % (pdf_counter_conv), command=cmd, isForceLocal=True, + dependencies="gs_layer_pdf%i" % (pdf_counter_conv), nCores=1, memMb=4096) + + pdftk_jobs.append("pdftk_pdf%i" % (pdf_counter_conv)) + + else: + self.addTask(label="pdftk_", command=None, dependencies=deps_ocr) + pdftk_jobs.append("pdftk_") + + ### + # Merging and cleanup + ### + self.waitForTasks() + for img in self.pdfImageJobs["images"]: + cmd = "mv %s %s" % (os.path.join(self.outputDir, "images", img, "tmp", "tesseract", "*.hocr"), + os.path.join(os.path.abspath(self.outputDir), "images", img, "HOCR")) + self.addTask(label="mv_%s" % (img), command=cmd, isForceLocal=True, + dependencies=pdftk_jobs, nCores=1, memMb=32) + + cmd = "parse_hocr %s %s" % ( + os.path.join(os.path.abspath(self.outputDir), "images", img, "HOCR"), + os.path.join(os.path.abspath(self.outputDir), "images", img, "%s.xml" % (img))) + self.addTask(label="generate_hocr_%s" % (img), command=cmd, isForceLocal=True, + dependencies="mv_%s" % (img), nCores=1, memMb=250) + + if not self.intermediate: + cmd = "rm -rf %s" % (os.path.join(self.outputDir, "images", img, "tmp")) + self.addTask(label="cleanup_%s" % (img), command=cmd, isForceLocal=True, + dependencies="mv_%s" % (img), nCores=1, memMb=32) + + pdf_counter = 0 + for pdf in self.pdfImageJobs["pdf"]: + pdf_counter += 1 + cmd = "mv %s %s" % (os.path.join(self.outputDir, "pdf", pdf, "tmp", "tesseract", "*.hocr"), + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "HOCR")) + self.addTask(label="mv_%i" % (pdf_counter), command=cmd, isForceLocal=True, + dependencies=pdftk_jobs, nCores=1, memMb=32) + + cmd = "parse_hocr %s %s" % ( + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "HOCR"), + os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "%s.xml" % (pdf))) + self.addTask(label="generate_hocr_%i" % (pdf_counter), command=cmd, isForceLocal=True, + dependencies="mv_%i" % (pdf_counter), nCores=1, memMb=250) + + if not self.intermediate: + cmd = "rm -rf %s" % (os.path.join(self.outputDir, "pdf", pdf, "tmp")) + self.addTask(label="cleanup_%i" % (pdf_counter), command=cmd, isForceLocal=True, + dependencies="mv_%i" % (pdf_counter), nCores=1, memMb=32) + + +def strip_files_of_spaces(current_path, old_path): + """Strips inputfiles of spaces to avoid bugs and also for better usage and readability.""" + os.chdir(current_path) + for f in os.listdir("."): + r = f.replace(" ", "_") + if(r != f): + os.rename(f, r) + os.chdir(old_path) + + +def analyze_jobs(input_dir, image_suffix="tiff", skip_pdf=False, skip_images=False): + pdfs = {} + images = {} + files = os.listdir(os.path.abspath(input_dir)) + dirs = list(filter(lambda x: os.path.isdir(os.path.join(os.path.abspath(input_dir), x)), files)) + + # Scan root input dir + if not skip_pdf: + pdfs_ = list(filter(lambda x: x.endswith(".pdf"), files)) + pdfs = {".".join(k.split(".")[:-1]): os.path.join(os.path.abspath(input_dir), k) for k in pdfs_} + if not skip_images: + print("sdf") + if list(filter(lambda x: x.endswith(".%s" % (image_suffix)), files)): + images["input_root"] = os.path.abspath(input_dir) + + # Scan subdirectories for inputs if available + for d in dirs: + strip_files_of_spaces(os.path.join(input_dir, d), os.getcwd()) + files = os.listdir(os.path.join(input_dir, d)) + + if not skip_pdf: + for p in list(filter(lambda x: x.endswith(".pdf"), files)): + pdfs[".".join(p.split(".")[:-1])] = os.path.join(os.path.abspath(input_dir), d, p) + if not skip_images: + if sorted(list(filter(lambda x: x.endswith(".%s" % (image_suffix)), files))): + images[d] = os.path.join(os.path.abspath(input_dir), d) + + return {"pdf": pdfs, "images": images} + + +def main(): + args = parse_arguments() + + # DICT{DICT} + # 'pdf' :: {name: path} + # 'images' :: {name: path} + current_path = os.getcwd() + strip_files_of_spaces(args.input_dir, current_path) + jobs = analyze_jobs(args.input_dir, image_suffix=args.suffix, skip_pdf=args.skip_pdf, skip_images=args.skip_images) + + wflow = OCRWorkflow(jobs, args.input_dir, args.output_dir, args.suffix, args.lang, args.pdf, args.intermediate, + args.skip_bin, + args.nCores, args.mem, + args.dry_run) + retval = wflow.run(mode="local", nCores=args.nCores, memMb=args.mem, isDryRun=args.dry_run) + sys.exit(retval) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/parse_hocr b/parse_hocr new file mode 100755 index 0000000..0e1f258 --- /dev/null +++ b/parse_hocr @@ -0,0 +1,43 @@ +#!/usr/bin/env python3.6 +# coding=utf-8 + +import xml.etree.ElementTree as ET +from xml.sax.saxutils import escape +import os +import sys + +input_files = sorted(os.listdir(sys.argv[1])) +output_file = open(sys.argv[2], "w") + +output_file.write('\n' + + '\n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n') + +for input_file in input_files: + tree = ET.parse(os.path.join(sys.argv[1], input_file)) + output_file.write(' \n' % (input_file.split(".")[0], input_file.split(".")[0])) + for para in tree.findall(".//*[@class='ocr_par']"): + output_file.write('

\n') + for line in para.findall(".//*[@class='ocr_line']"): + first_word_in_line = True + for word in line.findall(".//*[@class='ocrx_word']"): + if word.text is not None: + output_file.write((" " if first_word_in_line else " ") + escape(word.text.strip())) + first_word_in_line = False + if not first_word_in_line: + output_file.write('\n') + output_file.write('

\n') +output_file.write(' \n' + + '
\n' + + '
\n') +output_file.close() \ No newline at end of file