diff --git a/Dockerfile b/Dockerfile index 8a7b0b5..83fb254 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,66 +1,57 @@ -FROM ubuntu:18.04 +FROM debian:stretch MAINTAINER Patrick Jentsch ENV DEBIAN_FRONTEND=noninteractive -ENV LANG=en_US.UTF-8 +ENV LANG=C.UTF-8 ENV PYFLOW_VERSION 1.1.20 +ENV OCROPY_VERSION 1.3.3 RUN apt-get update && \ apt-get install -y --no-install-recommends \ - gnupg2 - -# Add PPA for pdftk -RUN echo "deb http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list && \ - echo "deb-src http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list && \ - apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BCA68C33DA36783B03E981C820D0BB61B700CE29 - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - ghostscript \ - git \ + apt-transport-https \ + ca-certificates \ + gnupg2 \ imagemagick \ libtiff-tools \ - locales \ pdftk \ poppler-utils \ python2.7 \ - python3.6 \ - python-pip \ - python-tk \ - tesseract-ocr \ + python3.5 \ + python-numpy \ wget -# Configure locales -RUN locale-gen "$LANG" - WORKDIR /root # Install pyFlow -RUN wget -nv http://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \ +RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \ tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \ + rm pyflow-"$PYFLOW_VERSION".tar.gz && \ cd pyflow-"$PYFLOW_VERSION" && \ python2.7 setup.py build install && \ - cd /root && \ - rm pyflow-"$PYFLOW_VERSION".tar.gz + cd .. -# Install Tesseract OCR Data Files -RUN wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv http://github.com/tesseract-ocr/tessdata/raw/3.04.00/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata +# Install Tesseract OCR and Data Files +RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \ + wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \ + apt-get update && \ + apt-get install -y --no-install-recommends tesseract-ocr && \ + wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ + wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ + wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ + wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ + wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ + wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata -# Install OCRopus -RUN git clone http://github.com/tmbdev/ocropy && \ - cd ocropy && \ - apt-get install -y --no-install-recommends $(cat PACKAGES) && \ - pip install -r requirements.txt && \ - wget -nv http://www.tmbdev.net/en-default.pyrnn.gz && \ - mv en-default.pyrnn.gz models/ && \ +# Install ocropy +RUN wget -nv https://github.com/tmbdev/ocropy/archive/v"$OCROPY_VERSION".tar.gz && \ + tar -xzf v"$OCROPY_VERSION".tar.gz && \ + rm v"$OCROPY_VERSION".tar.gz && \ + cd ocropy-"$OCROPY_VERSION" && \ + apt-get install -y --no-install-recommends $(cat PACKAGES) python-pil python-tk && \ + wget -nv http://www.tmbdev.net/en-default.pyrnn.gz -P models/ && \ python2.7 setup.py install && \ - cd /root + cd .. COPY ocr_pyflow /usr/local/bin COPY parse_hocr /usr/local/bin diff --git a/ocr_pyflow b/ocr_pyflow index d8f0300..849c00c 100755 --- a/ocr_pyflow +++ b/ocr_pyflow @@ -1,10 +1,12 @@ #!/usr/bin/env python2.7 # coding=utf-8 + + """ ocr_pyflow.py -Date: 01/04/2018 +Date: 18/10/2018 Usage: For usage instructions run with option --help Author: Madis Rumming """ @@ -13,45 +15,43 @@ __author__ = "Madis Rumming " __copyright__ = "Copyright 2018, Data Infrastructure and Digital Humanities,\ SFB 1288, Bielefeld University" -__version__ = "0.6" -__maintainer__ = "Madis Rumming" -__email__ = "mrumming@uni-bielefeld.de" +__version__ = "0.7" +__maintainer__ = "Patrick Jentsch" +__email__ = "p.jentsch@uni-bielefeld.de" __status__ = "Development" + + import argparse +import multiprocessing import os import sys -# from string import maketrans -# from lxml import html +import unicodedata from pyflow import WorkflowRunner + + ocropusnlbin_bin = "/usr/local/bin/ocropus-nlbin" + + def parse_arguments(): parser = argparse.ArgumentParser( - "Performs OCR of (historical) documents utilizing ocropy for preprocessing and tesseract \ - for final OCR. Available outputs are HOCR, PDF, shrinked PDF, and simple DTAbf \ - (TEI P5 compliant). Software requirements: python2.7, pyflow, pdftoppm, ocropus, \ - tesseract, ghostscript, imagick, ") + "Performs OCR of (historical) documents utilizing OCRopus for preprocessing and Tesseract OCR \ + for OCR. Available outputs are HOCR, PDF, shrinked PDF, and simple DTAbf \ + (TEI P5 compliant). Software requirements: imagemagick, ocropus, pdftk, pdftoppm, poppler-utils, pyflow, python2.7, tesseract") parser.add_argument("-i", "--input-directory", dest="input_dir", - help="Input directory with input images/PDFs. For each PDF an OCR-Run is \ - instantiated and output is created. If images reside in the directory, a \ - single OCR run is performed. For each subdirectory containing images, a \ - particular OCR run is instantiated.", + help="Input directory with input (Multipage-)TIFs or PDFs. For each input file an OCR-Run is \ + instantiated and output is created.", required=True) parser.add_argument("-o", "--output-directory", dest="output_dir", help="Directory, where output directories are created if necessary. \ - Default: %s" % (os.path.curdir), + Default: %s" % (os.path.join(os.path.curdir, "ocr_pyflow")), required=False, - default=os.path.curdir) - parser.add_argument("--image-suffix", - dest="suffix", - help="Input images suffix. Case-sensitive! tiff!=TIFF!=Tiff Default: tif.", - default="tif", - required=False) + default=(os.path.join(os.path.curdir, "ocr_pyflow"))) parser.add_argument("--skip-pdf-processing", dest="skip_pdf", help="Skip detection of PDFs as input.", @@ -64,36 +64,32 @@ def parse_arguments(): default=False, action='store_true', required=False) - parser.add_argument("--skip_binarization", - dest="skip_bin", - help="skip binarizaiton pre-processing.", - default=False, - action="store_true", - required=False) parser.add_argument("--start-page", dest='startp', - default=-1, help="First page to ocr.", + help="NOT IMPLEMENTED! First page to ocr.", + default=-1, required=False, type=int) parser.add_argument("--end-page", dest='endp', default=-1, - help="Last page to ocr.", + help="NOT IMPLEMENTED! Last page to ocr.", required=False, type=int) parser.add_argument("-r", "--rotate-pages", dest='rotate', default='norotation', - help="Rotate pages from input. Values: clockwise, counterclockwise, \ + help="NOT IMPLEMENTED! Rotate pages from input. Values: clockwise, counterclockwise, \ upsidedown. Default: norotation", required=False, choices=['clockwise', 'counterclockwise', 'upsidedown', 'norotation']) parser.add_argument("-s", "--split-pages", dest='split', default=False, - help="Split pages in half after possible rotation. Default: Not performed.", + help="NOT IMPLEMENTED! Split pages in half after possible rotation. Default: Not performed.", required=False, action='store_true') parser.add_argument("--ppi-import", - dest="ppi_in", help="Down-/Scaling for input images. Default: 300 ppi.", + dest="ppi_in", + help="NOT IMPLEMENTED! Scaling for input images. Default: 300 ppi.", default=300, required=False, type=int) @@ -109,47 +105,36 @@ def parse_arguments(): required=False) parser.add_argument("-c", "--compress-pdf", dest='comp', + help="NOT IMPLEMENTED!", default=False, action='store_true', required=False) parser.add_argument("--ppi-export", dest="ppi_out", - help="Down-/Scaling for output images in PDF. Default: 150 ppi.", + help="NOT IMPLEMENTED! Scaling for output images in PDF. Default: 150 ppi.", default=150, required=False, type=int) - parser.add_argument("-t", "--temp-directory", - dest="temp", - help="Location of intermediate files. Defaults to tmp in output directory.", - default="", - required=False, - type=str) parser.add_argument("-k", "--keep-intermediate", dest="intermediate", - help="Keep intermediate files. Default: True", + help="Keep intermediate files. Default: False", default=False, action='store_true') parser.add_argument("--cores", dest='nCores', - help="Amount of CPUs to use for parallel jobs. Default: %i" % (4), - default=4, + help="Amount of CPUs to use for parallel jobs. Default: Number of available CPUs", + default=multiprocessing.cpu_count(), required=False, type=int) parser.add_argument("--is-continued", dest='continued', - help="Enables continuing an erroneous or paused workflow. MUST use the \ + help="NOT IMPLEMENTED! Enables continuing an erroneous or paused workflow. MUST use the \ same dataDirRoot as before.", default=False, required=False, action='store_true') - parser.add_argument("--is-dry-run", - dest='dry_run', - help="Check workflow without execution.", - default=False, - action='store_true', - required=False) parser.add_argument("--memory", dest='mem', - help="Total amount of memory (RAM) available for this workflow. \ + help="NOT IMPLEMENTED! Total amount of memory (RAM) available for this workflow. \ Default: %i" % (8192), default=8192, required=False, @@ -158,405 +143,206 @@ def parse_arguments(): return (args) -class PureOCR(WorkflowRunner): - def __init__(self, input_path, output_path): - self.input_path = input_path - self.output_path = output_path - - def workflow(self): - pass - class OCRWorkflow(WorkflowRunner): - - def __init__(self, pdfImageJobs, inputDir, outputDir, suffix, lang, pdf, intermediate, skip_bin, - nCores, memMb, dry_run): + def __init__(self, pdfImageJobs, inputDir, outputDir, lang, pdf, intermediate, nCores, memMb): self.pdfImageJobs = pdfImageJobs self.outputDir = outputDir self.inputDir = inputDir - self.suffix = suffix self.lang = lang self.pdf = pdf self.intermediate = intermediate - self.skip_bin = skip_bin self.nCores = nCores self.memMb = memMb - self.dry_run = dry_run + + def workflow(self): + ### + # Task "mkdir_job": create output directories + # Dependencies: None + ### + mkdir_jobs = [] + mkdir_job_number = 0 + for job in self.pdfImageJobs["images"] + self.pdfImageJobs["pdf"]: + mkdir_job_number += 1 + cmd = "mkdir -p %s %s %s %s" % ( + os.path.join(job["output_dir"], "hocr_files"), + os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), + os.path.join(job["output_dir"], "tmp", "tesseract"), + os.path.join(job["output_dir"], "tmp", "tiff_files")) + mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd)) - cmd = None - print(self.outputDir) - if os.path.isabs(self.outputDir): - if not os.path.exists(self.outputDir): - cmd = "mkdir -p %s" % (self.outputDir) - else: - self.outputDir = os.path.join(os.path.abspath(self.inputDir), self.outputDir) - if not os.path.exists(self.outputDir): - cmd = "mkdir -p %s" % (self.outputDir) - - self.addTask(label="mkdir_outputdir", command=cmd, isForceLocal=True) - - deps = [] - if self.pdfImageJobs["pdf"]: - cmd = "mkdir %s" % (os.path.join(self.outputDir, "pdf")) - deps.append("mkdir_pdf") - self.addTask(label="mkdir_pdf", command=cmd, isForceLocal=True, - dependencies="mkdir_outputdir") - if self.pdfImageJobs["images"]: - cmd = "mkdir %s" % (os.path.join(self.outputDir, "images")) - deps.append("mkdir_images") - self.addTask(label="mkdir_images", command=cmd, isForceLocal=True, - dependencies="mkdir_outputdir") ### - # Generate directories + # Task "split_job": split input file into one tiff file per page + # Dependencies: mkdir_jobs ### - deps_dirs = [] - if self.pdfImageJobs["pdf"]: - i = 0 - for k in self.pdfImageJobs["pdf"]: - cmd = "mkdir -p %s %s %s %s %s" % ( - os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "tiff"), - os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "tesseract"), - os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "ocropy"), - os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "gs"), - os.path.join(os.path.abspath(self.outputDir), "pdf", k, "HOCR")) + split_jobs = [] + split_job_number = 0 + for job in self.pdfImageJobs["images"]: + split_job_number += 1 + # TODO: Make the following command work + ''' + cmd = "convert %s %s" % ( + job["path"], + os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-%sd.tif" % ("%"))) + ''' + # WORKAROUND + cmd = "tiff2pdf -o %s %s && pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox && rm %s" % ( + os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"), + job["path"], + os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf"), + os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0]), + os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + ".pdf")) + split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs, nCores=1, memMb=1024)) + for job in self.pdfImageJobs["pdf"]: + split_job_number += 1 + cmd = "pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox" % ( + job["path"], + os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0])) + split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=mkdir_jobs, nCores=1, memMb=1024)) - deps_dirs.append("mkdir_pdf_%i" % (i)) - self.addTask(label="mkdir_pdf_%i" % (i), command=cmd, isForceLocal=True, - dependencies=deps) - i += 1 - if self.pdfImageJobs["images"]: - i = 0 - for k in self.pdfImageJobs["images"]: - cmd = "mkdir -p %s %s %s %s" % ( - os.path.join(os.path.abspath(self.outputDir), "images", k, "tmp", "tesseract"), - os.path.join(os.path.abspath(self.outputDir), "images", k, "tmp", "ocropy"), - os.path.join(os.path.abspath(self.outputDir), "images", k, "tmp", "gs"), - os.path.join(os.path.abspath(self.outputDir), "images", k, "HOCR")) - deps_dirs.append("mkdir_images_%i" % (i)) - self.addTask(label="mkdir_images_%i" % (i), command=cmd, isForceLocal=True, - dependencies=deps) - i += 1 ### - # Extract images from PDF input files + # Task "ocropus_nlbin_job": binarize tiff files from previous split + # Dependencies: split_jobs ### - pdf_counter = 0 - deps_ppm = [] - for pdf in self.pdfImageJobs["pdf"]: - cmd = "pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox" % (self.pdfImageJobs["pdf"][pdf], - os.path.join( - os.path.abspath(self.outputDir), - "pdf", pdf, "tmp", - "tiff", "interm")) - self.addTask(label="pdftoppm_%i" % (pdf_counter), command=cmd, isForceLocal=True, - dependencies=deps_dirs, nCores=1, memMb=1024) - deps_ppm.append("pdftoppm_%i" % (pdf_counter)) - pdf_counter += 1 + ocropusnlbin_jobs = [] + ocropusnlbin_job_number = 0 + for job in self.pdfImageJobs["images"] + self.pdfImageJobs["pdf"]: + ocropusnlbin_job_number += 1 + cmd = "%s -Q %i -o %s %s" % ( + ocropusnlbin_bin, + self.nCores, + os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"), + os.path.join(job["output_dir"], "tmp", "tiff_files", os.path.basename(job["path"]).rsplit(".", 1)[0] + "-*.tif")) + ocropusnlbin_jobs.append(self.addTask(label="ocropusnlbin_job_-_%i" % (ocropusnlbin_job_number), command=cmd, dependencies=split_jobs, nCores=self.nCores, memMb=self.memMb)) + ### - # Perform image binarization if not skipped + # Task "tesseract_job": perform OCR on binarized images + # Dependencies: ocropusnlbin_jobs ### self.waitForTasks() - ocropus_counter = 0 - deps_ocropus = [] - if not self.skip_bin: - for pdf in self.pdfImageJobs["pdf"]: - cmd = "%s -Q %i -o %s %s" % (ocropusnlbin_bin, self.nCores, - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "ocropy"), - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff", - "interm-*")) - self.addTask(label="ocropusnlbin_%i" % (ocropus_counter), command=cmd, isForceLocal=True, - dependencies=deps_ppm, - nCores=self.nCores, memMb=self.memMb) - deps_ocropus.append("ocropusnlbin_%i" % (ocropus_counter)) - ocropus_counter += 1 - for img in self.pdfImageJobs["images"]: - cmd = "%s -Q %i -o %s %s" % (ocropusnlbin_bin, self.nCores, - os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", - "ocropy"), - os.path.join(self.pdfImageJobs["images"][img], "*.%s" % (self.suffix))) - self.addTask(label="ocropusnlbin_%i" % (ocropus_counter), command=cmd, isForceLocal=True, - dependencies=deps_ppm, - nCores=self.nCores, memMb=self.memMb) - deps_ocropus.append("ocropusnlbin_%i" % (ocropus_counter)) - ocropus_counter += 1 - - ### - # Perform OCR, HOCR as default output, PDF only optional - ### - self.waitForTasks() - pdf_counter = 0 - for pdf in self.pdfImageJobs["pdf"]: - if self.skip_bin: - inp_images = sorted(os.listdir( - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff"))) - else: - inp_images = sorted(filter(lambda x: ".bin." in x, os.listdir( - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", - "ocropy")))) - - deps_ocr_pdf = [] - - for image in inp_images: + tesseract_jobs = [] + tesseract_job_number = 0 + for job in self.pdfImageJobs["images"] + self.pdfImageJobs["pdf"]: + # This list is empty if you don't wait for ocropus_nlbin_jobs to complete + for file in filter(lambda x: x.endswith(".bin.png"), os.listdir(os.path.join(job["output_dir"], "tmp", "ocropus-nlbin"))): + tesseract_job_number += 1 cmd = "tesseract %s %s -l %s hocr %s" % ( - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", - "tiff" if self.skip_bin else "ocropy", image), - os.path.join(self.outputDir, "pdf", pdf, "tmp", "tesseract", - "%04i" % int(image.split(".")[0].split("-")[1]) if self.skip_bin else image.split(".")[ - 0]), - self.lang, "pdf" if self.pdf else "") + os.path.join(job["output_dir"], "tmp", "ocropus-nlbin", file), + os.path.join(job["output_dir"], "tmp", "tesseract", file.rsplit(".", 2)[0]), + self.lang, + "pdf" if self.pdf else "") + tesseract_jobs.append(self.addTask(label="tesseract_job_-_%i" % (tesseract_job_number), command=cmd, dependencies=ocropusnlbin_jobs, nCores=1, memMb=2048)) - self.addTask(label="ocr_%i_%s" % (pdf_counter, image.split(".")[0]), command=cmd, isForceLocal=True, - dependencies=deps_ocropus, nCores=1, memMb=2048) - deps_ocr_pdf.append("ocr_%i_%s" % (pdf_counter, image.split(".")[0])) - pdf_counter += 1 - - for img in self.pdfImageJobs["images"]: - if self.skip_bin: - images = sorted( - list(filter(lambda x: x.endswith(self.suffix), os.listdir(self.pdfImageJobs["images"][img])))) - else: - images = sorted( - list(filter(lambda x: ".bin." in x, os.listdir( - os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy"))))) - - deps_ocr = [] - for image in images: - if self.skip_bin: - cmd = "tesseract %s %s -l %s hocr %s" % ( - os.path.join(self.pdfImageJobs["images"][img], image), - os.path.join(self.outputDir, "images", img, "tmp", "tesseract", image.split(".")[0]), - self.lang, "pdf" if self.pdf else "") - else: - cmd = "tesseract %s %s -l %s hocr %s" % ( - os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy", image), - os.path.join(self.outputDir, "images", img, "tmp", "tesseract", image.split(".")[0]), - self.lang, "pdf" if self.pdf else "") - print(cmd) - self.addTask(label="ocr_%s_%s" % (img, image.split(".")[0]), command=cmd, isForceLocal=True, - dependencies=deps_ocropus, nCores=1, memMb=2048) - - deps_ocr.append("ocr_%s_%s" % (img, image.split(".")[0])) - - pdftk_jobs = [] ### - # Creation of PDF output + # Task "pdf_merge_job": Merge PDF files + # Dependencies: tesseract_jobs ### + pdf_merge_jobs = [] + pdf_merge_job_number = 0 if self.pdf: - pdf_counter_conv = 0 - for img in self.pdfImageJobs["images"]: - if self.skip_bin: - images = sorted( - list(filter(lambda x: x.endswith(self.suffix), os.listdir(self.pdfImageJobs["images"][img])))) - else: - images = sorted( - list(filter(lambda x: ".bin." in x, os.listdir( - os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy"))))) + for job in self.pdfImageJobs["images"] + self.pdfImageJobs["pdf"]: + pdf_merge_job_number += 1 + cmd = "pdftk %s cat output %s" % ( + os.path.join(job["output_dir"], "tmp", "tesseract", "*.pdf"), + os.path.join(job["output_dir"], os.path.basename(job["path"].rsplit(".", 1)[0] + ".pdf"))) + pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=tesseract_jobs, nCores=1, memMb=4096)) - deps_tiffpdf = [] - - for image in images: - if self.skip_bin: - cmd = "tiff2pdf -o %s.pdf %s" % ( - os.path.join(self.outputDir, "images", img, "tmp", "gs", image.split(".")[0]), - os.path.join(self.pdfImageJobs["images"][img], image)) - self.addTask(label="tiffpdf_%s_%s" % (img, image.split(".")[0]), command=cmd, isForceLocal=True, - dependencies=deps_dirs, nCores=1, memMb=256) - deps_tiffpdf.append("tiffpdf_%s_%s" % (img, image.split(".")[0])) - else: - cmd = "convert %s %s.pdf" % ( - os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy", image), - os.path.join(self.outputDir, "images", img, "tmp", "gs", image.split(".")[0])) - self.addTask(label="tiffpdf_%s_%s" % (img, image.split(".")[0]), command=cmd, isForceLocal=True, - dependencies=deps_ocropus, nCores=1, memMb=256) - deps_tiffpdf.append("tiffpdf_%s_%s" % (img, image.split(".")[0])) - - cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( - os.path.join(self.outputDir, "images", img, "tmp", "gs", "image.pdf"), - os.path.join(self.outputDir, "images", img, "tmp", "gs", "*.pdf")) - self.addTask(label="gs_image_%s" % (img), command=cmd, isForceLocal=True, - dependencies=deps_tiffpdf, nCores=1, memMb=4096) - - deps_ocr.append("gs_image_%s" % (img)) - - cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -dFILTERIMAGE -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( - os.path.join(self.outputDir, "images", img, "tmp", "gs", "text.pdf"), - os.path.join(self.outputDir, "images", img, "tmp", "tesseract", "*.pdf")) - - self.addTask(label="gs_layer_%s" % (img), command=cmd, isForceLocal=True, - dependencies=deps_ocr, nCores=1, memMb=4096) - - cmd = "pdftk %s multistamp %s output %s" % ( - os.path.join(self.outputDir, "images", img, "tmp", "gs", "text.pdf"), - os.path.join(self.outputDir, "images", img, "tmp", "gs", "image.pdf"), - os.path.join(self.outputDir, "images", img, "%s.pdf" % (img))) - - self.addTask(label="pdftk_%s" % (img), command=cmd, isForceLocal=True, - dependencies="gs_layer_%s" % (img), nCores=1, memMb=4096) - pdftk_jobs.append("pdftk_%s" % (img)) - - for pdf in self.pdfImageJobs["pdf"]: - - pdf_counter_conv += 1 - if self.skip_bin: - inp_images = sorted(os.listdir( - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff"))) - else: - inp_images = sorted(filter(lambda x: ".bin." in x, os.listdir( - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", - "ocropy")))) - - deps_pdf_tiffpdf = [] - - for image in inp_images: - if self.skip_bin: - cmd = "tiff2pdf -o %s.pdf %s" % ( - os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", image.split(".")[0]), - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff", image)) - self.addTask(label="tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0]), command=cmd, - isForceLocal=True, - dependencies=deps_dirs, nCores=1, memMb=256) - deps_pdf_tiffpdf.append("tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0])) - else: - cmd = "convert %s %s.pdf" % ( - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "ocropy", image), - os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", image.split(".")[0])) - self.addTask(label="tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0]), command=cmd, - isForceLocal=True, - dependencies=deps_ocropus, nCores=1, memMb=256) - deps_pdf_tiffpdf.append("tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0])) - - cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( - os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "image.pdf"), - os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "*.pdf")) - self.addTask(label="gs_image_pdf%i" % (pdf_counter_conv), command=cmd, isForceLocal=True, - dependencies=deps_pdf_tiffpdf, nCores=1, memMb=4096) - - deps_ocr.append("gs_image_pdf%i" % (pdf_counter_conv)) - - cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -dFILTERIMAGE -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( - os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "text.pdf"), - os.path.join(self.outputDir, "pdf", pdf, "tmp", "tesseract", "*.pdf")) - - self.addTask(label="gs_layer_pdf%i" % (pdf_counter_conv), command=cmd, isForceLocal=True, - dependencies=deps_ocr, nCores=1, memMb=4096) - - cmd = "pdftk %s multistamp %s output %s" % ( - os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "text.pdf"), - os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "image.pdf"), - os.path.join(self.outputDir, "pdf", pdf, "%s.pdf" % (pdf))) - - self.addTask(label="pdftk_pdf%i" % (pdf_counter_conv), command=cmd, isForceLocal=True, - dependencies="gs_layer_pdf%i" % (pdf_counter_conv), nCores=1, memMb=4096) - - pdftk_jobs.append("pdftk_pdf%i" % (pdf_counter_conv)) - - else: - self.addTask(label="pdftk_", command=None, dependencies=deps_ocr) - pdftk_jobs.append("pdftk_") ### - # Merging and cleanup + # Task "move_hocr_job": move hocr files from /tmp/tesseract to /hocr_files + # Dependencies: tesseract_jobs + ### + move_hocr_jobs = [] + move_hocr_job_number = 0 + for job in self.pdfImageJobs["images"] + self.pdfImageJobs["pdf"]: + move_hocr_job_number += 1 + cmd = "mv %s %s" % ( + os.path.join(job["output_dir"], "tmp", "tesseract", "*.hocr"), + os.path.join(job["output_dir"], "hocr_files")) + move_hocr_jobs.append(self.addTask(label="move_hocr_job_-_%i" % (move_hocr_job_number), command=cmd, dependencies=tesseract_jobs)) + + + ### + # Task "hocr_to_teip5_job": create TEI P5 file from hocr files + # Dependencies: move_hocr_jobs + ### + hocr_to_teip5_jobs = [] + hocr_to_teip5_job_number = 0 + for job in self.pdfImageJobs["images"] + self.pdfImageJobs["pdf"]: + hocr_to_teip5_job_number += 1 + cmd = "parse_hocr %s %s" % ( + os.path.join(job["output_dir"], "hocr_files"), + os.path.join(os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".xml"))) + hocr_to_teip5_jobs.append(self.addTask(label="hocr_to_teip5_job_-_%i" % (hocr_to_teip5_job_number), command=cmd, dependencies=move_hocr_jobs, nCores=1, memMb=250)) + + + ### + # Task "cleanup_job": remove temporary files + # Dependencies: All ### self.waitForTasks() - for img in self.pdfImageJobs["images"]: - cmd = "mv %s %s" % (os.path.join(self.outputDir, "images", img, "tmp", "tesseract", "*.hocr"), - os.path.join(os.path.abspath(self.outputDir), "images", img, "HOCR")) - self.addTask(label="mv_%s" % (img), command=cmd, isForceLocal=True, - dependencies=pdftk_jobs, nCores=1, memMb=32) - - cmd = "parse_hocr %s %s" % ( - os.path.join(os.path.abspath(self.outputDir), "images", img, "HOCR"), - os.path.join(os.path.abspath(self.outputDir), "images", img, "%s.xml" % (img))) - self.addTask(label="generate_hocr_%s" % (img), command=cmd, isForceLocal=True, - dependencies="mv_%s" % (img), nCores=1, memMb=250) - - if not self.intermediate: - cmd = "rm -rf %s" % (os.path.join(self.outputDir, "images", img, "tmp")) - self.addTask(label="cleanup_%s" % (img), command=cmd, isForceLocal=True, - dependencies="mv_%s" % (img), nCores=1, memMb=32) - - pdf_counter = 0 - for pdf in self.pdfImageJobs["pdf"]: - pdf_counter += 1 - cmd = "mv %s %s" % (os.path.join(self.outputDir, "pdf", pdf, "tmp", "tesseract", "*.hocr"), - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "HOCR")) - self.addTask(label="mv_%i" % (pdf_counter), command=cmd, isForceLocal=True, - dependencies=pdftk_jobs, nCores=1, memMb=32) - - cmd = "parse_hocr %s %s" % ( - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "HOCR"), - os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "%s.xml" % (pdf))) - self.addTask(label="generate_hocr_%i" % (pdf_counter), command=cmd, isForceLocal=True, - dependencies="mv_%i" % (pdf_counter), nCores=1, memMb=250) - - if not self.intermediate: - cmd = "rm -rf %s" % (os.path.join(self.outputDir, "pdf", pdf, "tmp")) - self.addTask(label="cleanup_%i" % (pdf_counter), command=cmd, isForceLocal=True, - dependencies="mv_%i" % (pdf_counter), nCores=1, memMb=32) + cleanup_jobs = [] + cleanup_job_counter = 0 + if not self.intermediate: + for job in self.pdfImageJobs["images"] + self.pdfImageJobs["pdf"]: + cleanup_job_counter += 1 + cmd = "rm -r %s" % (os.path.join(job["output_dir"], "tmp")) + cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd)) -def strip_files_of_spaces(current_path, old_path): - """Strips inputfiles of spaces to avoid bugs and also for better usage and readability.""" - os.chdir(current_path) - for f in os.listdir("."): - r = f.replace(" ", "_") - if(r != f): - os.rename(f, r) - os.chdir(old_path) +def analyze_jobs(input_dir, output_dir, skip_pdf=False, skip_images=False): + files = os.listdir(input_dir) + images = [] + pdf = [] -def analyze_jobs(input_dir, image_suffix="tiff", skip_pdf=False, skip_images=False): - pdfs = {} - images = {} - files = os.listdir(os.path.abspath(input_dir)) - dirs = list(filter(lambda x: os.path.isdir(os.path.join(os.path.abspath(input_dir), x)), files)) - - # Scan root input dir - if not skip_pdf: - pdfs_ = list(filter(lambda x: x.endswith(".pdf"), files)) - pdfs = {".".join(k.split(".")[:-1]): os.path.join(os.path.abspath(input_dir), k) for k in pdfs_} if not skip_images: - print("sdf") - if list(filter(lambda x: x.endswith(".%s" % (image_suffix)), files)): - images["input_root"] = os.path.abspath(input_dir) - - # Scan subdirectories for inputs if available - for d in dirs: - strip_files_of_spaces(os.path.join(input_dir, d), os.getcwd()) - files = os.listdir(os.path.join(input_dir, d)) - - if not skip_pdf: - for p in list(filter(lambda x: x.endswith(".pdf"), files)): - pdfs[".".join(p.split(".")[:-1])] = os.path.join(os.path.abspath(input_dir), d, p) + for file in filter(lambda x: x.endswith(".tif") or x.endswith(".tiff"), files): + images.append({"path": os.path.join(input_dir, file), "output_dir": os.path.join(output_dir, file.rsplit(".", 1)[0])}) + if not skip_pdf: + for file in filter(lambda x: x.endswith(".pdf"), files): + pdf.append({"path": os.path.join(input_dir, file), "output_dir": os.path.join(output_dir, file.rsplit(".", 1)[0])}) + for subdir in filter(lambda x: os.path.isdir(os.path.join(input_dir, x)), files): + subdir_files = os.listdir(os.path.join(input_dir, subdir)) if not skip_images: - if sorted(list(filter(lambda x: x.endswith(".%s" % (image_suffix)), files))): - images[d] = os.path.join(os.path.abspath(input_dir), d) + for file in filter(lambda x: x.endswith(".tif") or x.endswith(".tiff"), subdir_files): + images.append({"path": os.path.join(input_dir, subdir, file), "output_dir": os.path.join(output_dir, subdir, file.rsplit(".", 1)[0])}) + if not skip_pdf: + for file in filter(lambda x: x.endswith(".pdf"), subdir_files): + pdf.append({"path": os.path.join(input_dir, subdir, file), "output_dir": os.path.join(output_dir, subdir, file.rsplit(".", 1)[0])}) + return {"pdf": pdf, "images": images} + + + +def normalize_input_filenames(path): + ### + # Normalize input filenames and directories to avoid bugs and also for better usage and readability. + ### + for file in os.listdir(path): + file_with_path = os.path.join(path, file) + if os.path.isdir(file_with_path): + normalize_input_filenames(file_with_path) + new_file_with_path = os.path.join(path, unicodedata.normalize("NFKD", file.decode("utf-8")).encode("ascii", "ignore").replace(" ", "_")) + os.rename(file_with_path, new_file_with_path) - return {"pdf": pdfs, "images": images} def main(): args = parse_arguments() - # DICT{DICT} - # 'pdf' :: {name: path} - # 'images' :: {name: path} - current_path = os.getcwd() - strip_files_of_spaces(args.input_dir, current_path) - jobs = analyze_jobs(args.input_dir, image_suffix=args.suffix, skip_pdf=args.skip_pdf, skip_images=args.skip_images) - - wflow = OCRWorkflow(jobs, args.input_dir, args.output_dir, args.suffix, args.lang, args.pdf, args.intermediate, - args.skip_bin, - args.nCores, args.mem, - args.dry_run) - retval = wflow.run(mode="local", nCores=args.nCores, memMb=args.mem, isDryRun=args.dry_run) + normalize_input_filenames(args.input_dir) + jobs = analyze_jobs(args.input_dir, args.output_dir, skip_pdf=args.skip_pdf, skip_images=args.skip_images) + wflow = OCRWorkflow(jobs, args.input_dir, args.output_dir, args.lang, args.pdf, args.intermediate, args.nCores, args.mem) + retval = wflow.run(mode="local", nCores=args.nCores, memMb=args.mem) sys.exit(retval) + if __name__ == "__main__": main() \ No newline at end of file diff --git a/parse_hocr b/parse_hocr index 5a0ad2a..0da6f89 100755 --- a/parse_hocr +++ b/parse_hocr @@ -1,4 +1,4 @@ -#!/usr/bin/env python3.6 +#!/usr/bin/env python3.5 # coding=utf-8 import xml.etree.ElementTree as ET @@ -6,7 +6,7 @@ from xml.sax.saxutils import escape import os import sys -input_files = sorted(os.listdir(sys.argv[1])) +input_files = filter(lambda x: x.endswith(".hocr"), sorted(os.listdir(sys.argv[1]))) output_file = open(sys.argv[2], "w") output_file.write('\n' + @@ -37,6 +37,7 @@ for input_file in input_files: if not first_word_in_line: output_file.write('\n') output_file.write('

\n') + output_file.write(' \n' + ' \n' + '')