mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 21:23:14 +00:00 
			
		
		
		
	Initial commit
This commit is contained in:
		
							
								
								
									
										67
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,67 @@ | ||||
# OCR image: Tesseract 4 and OCRopus (ocropy), orchestrated by a pyFlow workflow script.
FROM ubuntu:18.04

# MAINTAINER is deprecated; a LABEL carries the same information.
LABEL maintainer="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"

ENV DEBIAN_FRONTEND=noninteractive
ENV LANG=en_US.UTF-8
ENV PYFLOW_VERSION=1.1.20

# gnupg2 is required by apt-key below; ca-certificates lets wget and git verify the
# HTTPS downloads further down (it is only a "recommended" package and would otherwise
# be skipped because of --no-install-recommends).
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ca-certificates \
    gnupg2 && \
    rm -rf /var/lib/apt/lists/*

# Add PPA for pdftk
RUN echo "deb http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list && \
    echo "deb-src http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list && \
    apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BCA68C33DA36783B03E981C820D0BB61B700CE29

# Package lists are removed at the end of every apt layer to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ghostscript \
    git \
    imagemagick \
    libtiff-tools \
    locales \
    pdftk \
    poppler-utils \
    python2.7 \
    python3.6 \
    tesseract-ocr \
    wget && \
    rm -rf /var/lib/apt/lists/*

# Configure locales
RUN locale-gen "$LANG"

WORKDIR /root

# Install pyFlow (the extracted source tree is kept, only the tarball is removed)
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install && \
    cd /root && \
    rm pyflow-"$PYFLOW_VERSION".tar.gz

# Install Tesseract OCR Data Files
RUN wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata/raw/3.04.00/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
    wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata

# Install OCRopus
# apt-get update runs in the same layer as the install so that the packages named in
# ocropy's PACKAGES file never resolve against stale (or removed) package lists.
RUN git clone https://github.com/tmbdev/ocropy && \
    cd ocropy && \
    apt-get update && \
    apt-get install -y --no-install-recommends $(cat PACKAGES) && \
    rm -rf /var/lib/apt/lists/* && \
    wget -nv http://www.tmbdev.net/en-default.pyrnn.gz && \
    mv en-default.pyrnn.gz models/ && \
    python2.7 setup.py install

# Workflow entry point and the HOCR post-processing helper it calls.
COPY ocr_pyflow parse_hocr /usr/local/bin/

VOLUME /root/files_for_ocr

CMD ["/bin/bash"]
							
								
								
									
										562
									
								
								ocr_pyflow
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										562
									
								
								ocr_pyflow
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,562 @@ | ||||
| #!/usr/bin/env python2.7 | ||||
| # coding=utf-8 | ||||
|  | ||||
| """ | ||||
| ocr_pyflow.py | ||||
|  | ||||
| Date:   01/04/2018 | ||||
| Usage:  For usage instructions run with option --help | ||||
| Author: Madis Rumming <mrumming@uni-bielefeld.de> | ||||
| """ | ||||
|  | ||||
# Module metadata: authorship, copyright, version and release status.
__author__ = "Madis Rumming <mrumming@uni-bielefeld.de>"
# Adjacent string literals are concatenated by the compiler. Unlike a backslash
# continuation inside the literal, this keeps the space after "Humanities,".
__copyright__ = ("Copyright 2018, Data Infrastructure and Digital Humanities, "
                 "SFB 1288, Bielefeld University")

__version__ = "0.6"
__maintainer__ = "Madis Rumming"
__email__ = "mrumming@uni-bielefeld.de"
__status__ = "Development"
|  | ||||
| import argparse | ||||
| import os | ||||
| import sys | ||||
| # from string import maketrans | ||||
| # from lxml import html | ||||
| from pyflow import WorkflowRunner | ||||
|  | ||||
| ocropusnlbin_bin = "/usr/local/bin/ocropus-nlbin" | ||||
|  | ||||
def _build_parser():
    """Build the argparse parser describing every command-line option of the script."""
    # The text has to be passed as ``description``: the first positional parameter of
    # ArgumentParser is ``prog``, which would put the whole paragraph into the usage line.
    parser = argparse.ArgumentParser(
        description="Performs OCR of (historical) documents utilizing ocropy for "
                    "preprocessing and tesseract for final OCR. Available outputs are "
                    "HOCR, PDF, shrinked PDF, and simple DTAbf (TEI P5 compliant). "
                    "Software requirements: python2.7, pyflow, pdftoppm, ocropus, "
                    "tesseract, ghostscript, imagick.")

    # --- Input / output selection ---
    parser.add_argument("-i", "--input-directory",
                        dest="input_dir",
                        help="Input directory with input images/PDFs. For each PDF an "
                             "OCR-Run is instantiated and output is created. If images "
                             "reside in the directory, a single OCR run is performed. For "
                             "each subdirectory containing images, a particular OCR run "
                             "is instantiated.",
                        required=True)
    parser.add_argument("-o", "--output-directory",
                        dest="output_dir",
                        help="Directory, where output directories are created if "
                             "necessary. Default: %(default)s",
                        default=os.path.curdir)
    parser.add_argument("--image-suffix",
                        dest="suffix",
                        help="Input images suffix. Case-sensitive! tiff!=TIFF!=Tiff "
                             "Default: tif.",
                        default="tif")
    parser.add_argument("--skip-pdf-processing",
                        dest="skip_pdf",
                        help="Skip detection of PDFs as input.",
                        action="store_true")
    parser.add_argument("--skip-image-processing",
                        dest="skip_images",
                        help="Skip detection of images as input.",
                        action="store_true")
    # The hyphenated spelling matches every other option; the original underscore
    # spelling is kept as an alias so existing invocations keep working.
    parser.add_argument("--skip-binarization", "--skip_binarization",
                        dest="skip_bin",
                        help="Skip binarization pre-processing.",
                        action="store_true")

    # --- Page selection and geometry ---
    parser.add_argument("--start-page",
                        dest="startp",
                        default=-1,
                        help="First page to ocr.",
                        type=int)
    parser.add_argument("--end-page",
                        dest="endp",
                        default=-1,
                        help="Last page to ocr.",
                        type=int)
    parser.add_argument("-r", "--rotate-pages",
                        dest="rotate",
                        default="norotation",
                        help="Rotate pages from input. Values: clockwise, "
                             "counterclockwise, upsidedown. Default: norotation",
                        choices=["clockwise", "counterclockwise", "upsidedown",
                                 "norotation"])
    parser.add_argument("-s", "--split-pages",
                        dest="split",
                        help="Split pages in half after possible rotation. "
                             "Default: Not performed.",
                        action="store_true")
    parser.add_argument("--ppi-import",
                        dest="ppi_in",
                        help="Down-/Scaling for input images. Default: 300 ppi.",
                        default=300,
                        type=int)

    # --- OCR and output format ---
    parser.add_argument("-l", "--language",
                        dest="lang",
                        help="Language for OCR",
                        required=True)
    parser.add_argument("-p", "--create-pdf",
                        dest="pdf",
                        help="Create PDF output in addition to HOCR.",
                        action="store_true")
    parser.add_argument("-c", "--compress-pdf",
                        dest="comp",
                        help="Compress the generated PDF.",
                        action="store_true")
    parser.add_argument("--ppi-export",
                        dest="ppi_out",
                        help="Down-/Scaling for output images in PDF. Default: 150 ppi.",
                        default=150,
                        type=int)

    # --- Intermediate files ---
    parser.add_argument("-t", "--temp-directory",
                        dest="temp",
                        help="Location of intermediate files. Defaults to tmp in output "
                             "directory.",
                        default="")
    parser.add_argument("-k", "--keep-intermediate",
                        dest="intermediate",
                        # The flag is off unless given, so the documented default is False.
                        help="Keep intermediate files. Default: False",
                        action="store_true")

    # --- Workflow execution ---
    parser.add_argument("--cores",
                        dest="nCores",
                        help="Amount of CPUs to use for parallel jobs. "
                             "Default: %(default)s",
                        default=4,
                        type=int)
    parser.add_argument("--is-continued",
                        dest="continued",
                        help="Enables continuing an erroneous or paused workflow. MUST "
                             "use the same dataDirRoot as before.",
                        action="store_true")
    parser.add_argument("--is-dry-run",
                        dest="dry_run",
                        help="Check workflow without execution.",
                        action="store_true")
    parser.add_argument("--memory",
                        dest="mem",
                        help="Total amount of memory (RAM) available for this workflow. "
                             "Default: %(default)s",
                        default=8192,
                        type=int)
    return parser


def parse_arguments(argv=None):
    """Parse the command line of the OCR workflow script.

    Args:
        argv: Optional list of argument strings. When None (the default) argparse
            reads ``sys.argv[1:]``, which is what a call without arguments did before.

    Returns:
        argparse.Namespace with the attributes input_dir, output_dir, suffix, skip_pdf,
        skip_images, skip_bin, startp, endp, rotate, split, ppi_in, lang, pdf, comp,
        ppi_out, temp, intermediate, nCores, continued, dry_run and mem.

    Raises:
        SystemExit: raised by argparse on ``--help`` or on invalid/missing arguments
            (``--input-directory`` and ``--language`` are required).
    """
    return _build_parser().parse_args(argv)
|  | ||||
|  | ||||
class PureOCR(WorkflowRunner):
    """pyflow workflow stub that only remembers an input path and an output path."""

    def __init__(self, input_path, output_path):
        """Store both paths on the instance; nothing else is initialised here."""
        self.output_path = output_path
        self.input_path = input_path

    def workflow(self):
        """Schedule no tasks; the workflow body is currently empty."""
        return None
|  | ||||
|  | ||||
| class OCRWorkflow(WorkflowRunner): | ||||
|  | ||||
|     def __init__(self, pdfImageJobs, inputDir, outputDir, suffix, lang, pdf, intermediate, skip_bin, | ||||
|                  nCores, memMb, dry_run): | ||||
|         self.pdfImageJobs = pdfImageJobs | ||||
|         self.outputDir = outputDir | ||||
|         self.inputDir = inputDir | ||||
|         self.suffix = suffix | ||||
|         self.lang = lang | ||||
|         self.pdf = pdf | ||||
|         self.intermediate = intermediate | ||||
|         self.skip_bin = skip_bin | ||||
|         self.nCores = nCores | ||||
|         self.memMb = memMb | ||||
|         self.dry_run = dry_run | ||||
|  | ||||
|     def workflow(self): | ||||
|  | ||||
|         cmd = None | ||||
|         print(self.outputDir) | ||||
|         if os.path.isabs(self.outputDir): | ||||
|             if not os.path.exists(self.outputDir): | ||||
|                 cmd = "mkdir -p %s" % (self.outputDir) | ||||
|         else: | ||||
|             self.outputDir = os.path.join(os.path.abspath(self.inputDir), self.outputDir) | ||||
|             if not os.path.exists(self.outputDir): | ||||
|                 cmd = "mkdir -p %s" % (self.outputDir) | ||||
|  | ||||
|         self.addTask(label="mkdir_outputdir", command=cmd, isForceLocal=True) | ||||
|  | ||||
|         deps = [] | ||||
|         if self.pdfImageJobs["pdf"]: | ||||
|             cmd = "mkdir %s" % (os.path.join(self.outputDir, "pdf")) | ||||
|             deps.append("mkdir_pdf") | ||||
|             self.addTask(label="mkdir_pdf", command=cmd, isForceLocal=True, | ||||
|                          dependencies="mkdir_outputdir") | ||||
|         if self.pdfImageJobs["images"]: | ||||
|             cmd = "mkdir %s" % (os.path.join(self.outputDir, "images")) | ||||
|             deps.append("mkdir_images") | ||||
|             self.addTask(label="mkdir_images", command=cmd, isForceLocal=True, | ||||
|                          dependencies="mkdir_outputdir") | ||||
|  | ||||
|         ### | ||||
|         # Generate directories | ||||
|         ### | ||||
|         deps_dirs = [] | ||||
|         if self.pdfImageJobs["pdf"]: | ||||
|             i = 0 | ||||
|             for k in self.pdfImageJobs["pdf"]: | ||||
|                 cmd = "mkdir -p %s %s %s %s %s" % ( | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "tiff"), | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "tesseract"), | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "ocropy"), | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "pdf", k, "tmp", "gs"), | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "pdf", k, "HOCR")) | ||||
|  | ||||
|                 deps_dirs.append("mkdir_pdf_%i" % (i)) | ||||
|                 self.addTask(label="mkdir_pdf_%i" % (i), command=cmd, isForceLocal=True, | ||||
|                              dependencies=deps) | ||||
|                 i += 1 | ||||
|         if self.pdfImageJobs["images"]: | ||||
|             i = 0 | ||||
|             for k in self.pdfImageJobs["images"]: | ||||
|                 cmd = "mkdir -p %s %s %s %s" % ( | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "images", k, "tmp", "tesseract"), | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "images", k, "tmp", "ocropy"), | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "images", k, "tmp", "gs"), | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "images", k, "HOCR")) | ||||
|                 deps_dirs.append("mkdir_images_%i" % (i)) | ||||
|                 self.addTask(label="mkdir_images_%i" % (i), command=cmd, isForceLocal=True, | ||||
|                              dependencies=deps) | ||||
|                 i += 1 | ||||
|  | ||||
|         ### | ||||
|         # Extract images from PDF input files | ||||
|         ### | ||||
|         pdf_counter = 0 | ||||
|         deps_ppm = [] | ||||
|         for pdf in self.pdfImageJobs["pdf"]: | ||||
|             cmd = "pdftoppm %s %s -tiff -r 300 -tiffcompression lzw -cropbox" % (self.pdfImageJobs["pdf"][pdf], | ||||
|                                                                                  os.path.join( | ||||
|                                                                                  os.path.abspath(self.outputDir), | ||||
|                                                                                  "pdf", pdf, "tmp", | ||||
|                                                                                  "tiff", "interm")) | ||||
|             self.addTask(label="pdftoppm_%i" % (pdf_counter), command=cmd, isForceLocal=True, | ||||
|                          dependencies=deps_dirs, nCores=1, memMb=1024) | ||||
|             deps_ppm.append("pdftoppm_%i" % (pdf_counter)) | ||||
|             pdf_counter += 1 | ||||
|  | ||||
|         ### | ||||
|         # Perform image binarization if not skipped | ||||
|         ### | ||||
|         self.waitForTasks() | ||||
|         ocropus_counter = 0 | ||||
|         deps_ocropus = [] | ||||
|         if not self.skip_bin: | ||||
|             for pdf in self.pdfImageJobs["pdf"]: | ||||
|                 cmd = "%s -Q %i -o %s %s" % (ocropusnlbin_bin, self.nCores, | ||||
|                                              os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "ocropy"), | ||||
|                                              os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff", | ||||
|                                                           "interm-*")) | ||||
|                 self.addTask(label="ocropusnlbin_%i" % (ocropus_counter), command=cmd, isForceLocal=True, | ||||
|                              dependencies=deps_ppm, | ||||
|                              nCores=self.nCores, memMb=self.memMb) | ||||
|                 deps_ocropus.append("ocropusnlbin_%i" % (ocropus_counter)) | ||||
|                 ocropus_counter += 1 | ||||
|             for img in self.pdfImageJobs["images"]: | ||||
|                 cmd = "%s -Q %i -o %s %s" % (ocropusnlbin_bin, self.nCores, | ||||
|                                              os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", | ||||
|                                                           "ocropy"), | ||||
|                                              os.path.join(self.pdfImageJobs["images"][img], "*.%s" % (self.suffix))) | ||||
|                 self.addTask(label="ocropusnlbin_%i" % (ocropus_counter), command=cmd, isForceLocal=True, | ||||
|                              dependencies=deps_ppm, | ||||
|                              nCores=self.nCores, memMb=self.memMb) | ||||
|                 deps_ocropus.append("ocropusnlbin_%i" % (ocropus_counter)) | ||||
|                 ocropus_counter += 1 | ||||
|  | ||||
|         ### | ||||
|         # Perform OCR, HOCR as default output, PDF only optional | ||||
|         ### | ||||
|         self.waitForTasks() | ||||
|         pdf_counter = 0 | ||||
|         for pdf in self.pdfImageJobs["pdf"]: | ||||
|             if self.skip_bin: | ||||
|                 inp_images = sorted(os.listdir( | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff"))) | ||||
|             else: | ||||
|                 inp_images = sorted(filter(lambda x: ".bin." in x, os.listdir( | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", | ||||
|                                  "ocropy")))) | ||||
|  | ||||
|             deps_ocr_pdf = [] | ||||
|  | ||||
|             for image in inp_images: | ||||
|                 cmd = "tesseract %s %s -l %s hocr %s" % ( | ||||
|                     os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", | ||||
|                                  "tiff" if self.skip_bin else "ocropy", image), | ||||
|                     os.path.join(self.outputDir, "pdf", pdf, "tmp", "tesseract", | ||||
|                                  "%04i" % int(image.split(".")[0].split("-")[1]) if self.skip_bin else image.split(".")[ | ||||
|                                      0]), | ||||
|                     self.lang, "pdf" if self.pdf else "") | ||||
|  | ||||
|                 self.addTask(label="ocr_%i_%s" % (pdf_counter, image.split(".")[0]), command=cmd, isForceLocal=True, | ||||
|                              dependencies=deps_ocropus, nCores=1, memMb=2048) | ||||
|                 deps_ocr_pdf.append("ocr_%i_%s" % (pdf_counter, image.split(".")[0])) | ||||
|  | ||||
|             pdf_counter += 1 | ||||
|  | ||||
|         for img in self.pdfImageJobs["images"]: | ||||
|             if self.skip_bin: | ||||
|                 images = sorted( | ||||
|                     list(filter(lambda x: x.endswith(self.suffix), os.listdir(self.pdfImageJobs["images"][img])))) | ||||
|             else: | ||||
|                 images = sorted( | ||||
|                     list(filter(lambda x: ".bin." in x, os.listdir( | ||||
|                         os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy"))))) | ||||
|  | ||||
|             deps_ocr = [] | ||||
|             for image in images: | ||||
|                 if self.skip_bin: | ||||
|                     cmd = "tesseract %s %s -l %s hocr %s" % ( | ||||
|                         os.path.join(self.pdfImageJobs["images"][img], image), | ||||
|                         os.path.join(self.outputDir, "images", img, "tmp", "tesseract", image.split(".")[0]), | ||||
|                         self.lang, "pdf" if self.pdf else "") | ||||
|                 else: | ||||
|                     cmd = "tesseract %s %s -l %s hocr %s" % ( | ||||
|                         os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy", image), | ||||
|                         os.path.join(self.outputDir, "images", img, "tmp", "tesseract", image.split(".")[0]), | ||||
|                         self.lang, "pdf" if self.pdf else "") | ||||
|                 print(cmd) | ||||
|                 self.addTask(label="ocr_%s_%s" % (img, image.split(".")[0]), command=cmd, isForceLocal=True, | ||||
|                              dependencies=deps_ocropus, nCores=1, memMb=2048) | ||||
|  | ||||
|                 deps_ocr.append("ocr_%s_%s" % (img, image.split(".")[0])) | ||||
|  | ||||
|         pdftk_jobs = [] | ||||
|         ### | ||||
|         # Creation of PDF output | ||||
|         ### | ||||
|         if self.pdf: | ||||
|             pdf_counter_conv = 0 | ||||
|             for img in self.pdfImageJobs["images"]: | ||||
|                 if self.skip_bin: | ||||
|                     images = sorted( | ||||
|                         list(filter(lambda x: x.endswith(self.suffix), os.listdir(self.pdfImageJobs["images"][img])))) | ||||
|                 else: | ||||
|                     images = sorted( | ||||
|                         list(filter(lambda x: ".bin." in x, os.listdir( | ||||
|                             os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy"))))) | ||||
|  | ||||
|                 deps_tiffpdf = [] | ||||
|  | ||||
|                 for image in images: | ||||
|                     if self.skip_bin: | ||||
|                         cmd = "tiff2pdf -o %s.pdf %s" % ( | ||||
|                             os.path.join(self.outputDir, "images", img, "tmp", "gs", image.split(".")[0]), | ||||
|                             os.path.join(self.pdfImageJobs["images"][img], image)) | ||||
|                         self.addTask(label="tiffpdf_%s_%s" % (img, image.split(".")[0]), command=cmd, isForceLocal=True, | ||||
|                                      dependencies=deps_dirs, nCores=1, memMb=256) | ||||
|                         deps_tiffpdf.append("tiffpdf_%s_%s" % (img, image.split(".")[0])) | ||||
|                     else: | ||||
|                         cmd = "convert %s %s.pdf" % ( | ||||
|                             os.path.join(os.path.abspath(self.outputDir), "images", img, "tmp", "ocropy", image), | ||||
|                             os.path.join(self.outputDir, "images", img, "tmp", "gs", image.split(".")[0])) | ||||
|                         self.addTask(label="tiffpdf_%s_%s" % (img, image.split(".")[0]), command=cmd, isForceLocal=True, | ||||
|                                      dependencies=deps_ocropus, nCores=1, memMb=256) | ||||
|                         deps_tiffpdf.append("tiffpdf_%s_%s" % (img, image.split(".")[0])) | ||||
|  | ||||
|                 cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( | ||||
|                     os.path.join(self.outputDir, "images", img, "tmp", "gs", "image.pdf"), | ||||
|                     os.path.join(self.outputDir, "images", img, "tmp", "gs", "*.pdf")) | ||||
|                 self.addTask(label="gs_image_%s" % (img), command=cmd, isForceLocal=True, | ||||
|                              dependencies=deps_tiffpdf, nCores=1, memMb=4096) | ||||
|  | ||||
|                 deps_ocr.append("gs_image_%s" % (img)) | ||||
|  | ||||
|                 cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -dFILTERIMAGE -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( | ||||
|                     os.path.join(self.outputDir, "images", img, "tmp", "gs", "text.pdf"), | ||||
|                     os.path.join(self.outputDir, "images", img, "tmp", "tesseract", "*.pdf")) | ||||
|  | ||||
|                 self.addTask(label="gs_layer_%s" % (img), command=cmd, isForceLocal=True, | ||||
|                              dependencies=deps_ocr, nCores=1, memMb=4096) | ||||
|  | ||||
|                 cmd = "pdftk %s multistamp %s output %s" % ( | ||||
|                     os.path.join(self.outputDir, "images", img, "tmp", "gs", "text.pdf"), | ||||
|                     os.path.join(self.outputDir, "images", img, "tmp", "gs", "image.pdf"), | ||||
|                     os.path.join(self.outputDir, "images", img, "%s.pdf" % (img))) | ||||
|  | ||||
|                 self.addTask(label="pdftk_%s" % (img), command=cmd, isForceLocal=True, | ||||
|                              dependencies="gs_layer_%s" % (img), nCores=1, memMb=4096) | ||||
|                 pdftk_jobs.append("pdftk_%s" % (img)) | ||||
|  | ||||
|             for pdf in self.pdfImageJobs["pdf"]: | ||||
|  | ||||
|                 pdf_counter_conv += 1 | ||||
|                 if self.skip_bin: | ||||
|                     inp_images = sorted(os.listdir( | ||||
|                         os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff"))) | ||||
|                 else: | ||||
|                     inp_images = sorted(filter(lambda x: ".bin." in x, os.listdir( | ||||
|                         os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", | ||||
|                                      "ocropy")))) | ||||
|  | ||||
|                 deps_pdf_tiffpdf = [] | ||||
|  | ||||
|                 for image in inp_images: | ||||
|                     if self.skip_bin: | ||||
|                         cmd = "tiff2pdf -o %s.pdf %s" % ( | ||||
|                             os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", image.split(".")[0]), | ||||
|                             os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "tiff", image)) | ||||
|                         self.addTask(label="tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0]), command=cmd, | ||||
|                                      isForceLocal=True, | ||||
|                                      dependencies=deps_dirs, nCores=1, memMb=256) | ||||
|                         deps_pdf_tiffpdf.append("tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0])) | ||||
|                     else: | ||||
|                         cmd = "convert %s %s.pdf" % ( | ||||
|                             os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "tmp", "ocropy", image), | ||||
|                             os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", image.split(".")[0])) | ||||
|                         self.addTask(label="tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0]), command=cmd, | ||||
|                                      isForceLocal=True, | ||||
|                                      dependencies=deps_ocropus, nCores=1, memMb=256) | ||||
|                         deps_pdf_tiffpdf.append("tiffpdf_pdf%i_%s" % (pdf_counter_conv, image.split(".")[0])) | ||||
|  | ||||
|                 cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( | ||||
|                     os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "image.pdf"), | ||||
|                     os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "*.pdf")) | ||||
|                 self.addTask(label="gs_image_pdf%i" % (pdf_counter_conv), command=cmd, isForceLocal=True, | ||||
|                              dependencies=deps_pdf_tiffpdf, nCores=1, memMb=4096) | ||||
|  | ||||
|                 deps_ocr.append("gs_image_pdf%i" % (pdf_counter_conv)) | ||||
|  | ||||
|                 cmd = "gs -dBATCH -dNOPAUSE -q -r150 -dPDFSETTINGS=/ebook -dFILTERIMAGE -sDEVICE=pdfwrite -sOutputFile=%s %s" % ( | ||||
|                     os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "text.pdf"), | ||||
|                     os.path.join(self.outputDir, "pdf", pdf, "tmp", "tesseract", "*.pdf")) | ||||
|  | ||||
|                 self.addTask(label="gs_layer_pdf%i" % (pdf_counter_conv), command=cmd, isForceLocal=True, | ||||
|                              dependencies=deps_ocr, nCores=1, memMb=4096) | ||||
|  | ||||
|                 cmd = "pdftk %s multistamp %s output %s" % ( | ||||
|                     os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "text.pdf"), | ||||
|                     os.path.join(self.outputDir, "pdf", pdf, "tmp", "gs", "image.pdf"), | ||||
|                     os.path.join(self.outputDir, "pdf", pdf, "%s.pdf" % (pdf))) | ||||
|  | ||||
|                 self.addTask(label="pdftk_pdf%i" % (pdf_counter_conv), command=cmd, isForceLocal=True, | ||||
|                              dependencies="gs_layer_pdf%i" % (pdf_counter_conv), nCores=1, memMb=4096) | ||||
|  | ||||
|                 pdftk_jobs.append("pdftk_pdf%i" % (pdf_counter_conv)) | ||||
|  | ||||
|         else: | ||||
|             self.addTask(label="pdftk_", command=None, dependencies=deps_ocr) | ||||
|             pdftk_jobs.append("pdftk_") | ||||
|  | ||||
|         ### | ||||
|         # Merging and cleanup | ||||
|         ### | ||||
|         self.waitForTasks() | ||||
|         for img in self.pdfImageJobs["images"]: | ||||
|             cmd = "mv %s %s" % (os.path.join(self.outputDir, "images", img, "tmp", "tesseract", "*.hocr"), | ||||
|                                 os.path.join(os.path.abspath(self.outputDir), "images", img, "HOCR")) | ||||
|             self.addTask(label="mv_%s" % (img), command=cmd, isForceLocal=True, | ||||
|                          dependencies=pdftk_jobs, nCores=1, memMb=32) | ||||
|  | ||||
|             cmd = "parse_hocr %s %s" % ( | ||||
|                 os.path.join(os.path.abspath(self.outputDir), "images", img, "HOCR"), | ||||
|                 os.path.join(os.path.abspath(self.outputDir), "images", img, "%s.xml" % (img))) | ||||
|             self.addTask(label="generate_hocr_%s" % (img), command=cmd, isForceLocal=True, | ||||
|                          dependencies="mv_%s" % (img), nCores=1, memMb=250) | ||||
|  | ||||
|             if not self.intermediate: | ||||
|                 cmd = "rm -rf %s" % (os.path.join(self.outputDir, "images", img, "tmp")) | ||||
|                 self.addTask(label="cleanup_%s" % (img), command=cmd, isForceLocal=True, | ||||
|                              dependencies="mv_%s" % (img), nCores=1, memMb=32) | ||||
|  | ||||
|         pdf_counter = 0 | ||||
|         for pdf in self.pdfImageJobs["pdf"]: | ||||
|             pdf_counter += 1 | ||||
|             cmd = "mv %s %s" % (os.path.join(self.outputDir, "pdf", pdf, "tmp", "tesseract", "*.hocr"), | ||||
|                                 os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "HOCR")) | ||||
|             self.addTask(label="mv_%i" % (pdf_counter), command=cmd, isForceLocal=True, | ||||
|                          dependencies=pdftk_jobs, nCores=1, memMb=32) | ||||
|  | ||||
|             cmd = "parse_hocr %s %s" % ( | ||||
|                 os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "HOCR"), | ||||
|                 os.path.join(os.path.abspath(self.outputDir), "pdf", pdf, "%s.xml" % (pdf))) | ||||
|             self.addTask(label="generate_hocr_%i" % (pdf_counter), command=cmd, isForceLocal=True, | ||||
|                          dependencies="mv_%i" % (pdf_counter), nCores=1, memMb=250) | ||||
|  | ||||
|             if not self.intermediate: | ||||
|                 cmd = "rm -rf %s" % (os.path.join(self.outputDir, "pdf", pdf, "tmp")) | ||||
|                 self.addTask(label="cleanup_%i" % (pdf_counter), command=cmd, isForceLocal=True, | ||||
|                              dependencies="mv_%i" % (pdf_counter), nCores=1, memMb=32) | ||||
|  | ||||
|  | ||||
def strip_files_of_spaces(current_path, old_path):
    """Replace spaces with underscores in the names of all entries of a directory.

    The later workflow steps build shell command lines from these names, so
    spaces in them would break the commands.

    Args:
        current_path: Directory whose direct entries (files and directories)
            are renamed.
        old_path: Directory that becomes the working directory again once the
            renaming is done, also when renaming fails.

    An entry is left untouched (and a warning is written to stderr) when the
    underscore variant of its name already exists: os.rename would silently
    replace that existing file on POSIX systems.
    """
    os.chdir(current_path)
    try:
        for name in os.listdir("."):
            stripped = name.replace(" ", "_")
            if stripped == name:
                continue
            if os.path.lexists(stripped):
                # Never clobber another input file; the user has to resolve
                # the name clash manually.
                sys.stderr.write(
                    "WARNING: cannot rename '%s' to '%s' in '%s': target already exists\n"
                    % (name, stripped, current_path))
                continue
            os.rename(name, stripped)
    finally:
        # Always restore the working directory, even if a rename raised.
        os.chdir(old_path)
|  | ||||
|  | ||||
def analyze_jobs(input_dir, image_suffix="tiff", skip_pdf=False, skip_images=False):
    """Collect the PDF and image jobs found in ``input_dir`` and its subdirectories.

    Only the input directory itself and its direct subdirectories are scanned.
    Entries of each subdirectory get spaces in their names replaced (via
    strip_files_of_spaces) before they are inspected.

    Args:
        input_dir: Directory to scan.
        image_suffix: File extension (without the dot) that marks an image.
        skip_pdf: If true, no PDF jobs are collected.
        skip_images: If true, no image jobs are collected.

    Returns:
        A dict with two entries:
        'pdf'    :: {file name without the ".pdf" extension: absolute file path}
        'images' :: {directory name: absolute directory path}; images lying
                    directly in ``input_dir`` are registered as "input_root".
    """
    pdf_ext = ".pdf"
    image_ext = ".%s" % (image_suffix)
    root = os.path.abspath(input_dir)

    pdfs = {}
    images = {}
    entries = os.listdir(root)
    subdirs = [entry for entry in entries if os.path.isdir(os.path.join(root, entry))]

    # Scan root input dir
    if not skip_pdf:
        for name in entries:
            if name.endswith(pdf_ext):
                pdfs[name[:-len(pdf_ext)]] = os.path.join(root, name)
    if not skip_images and any(name.endswith(image_ext) for name in entries):
        images["input_root"] = root

    # Scan subdirectories for inputs if available
    for subdir in subdirs:
        subdir_path = os.path.join(root, subdir)
        strip_files_of_spaces(subdir_path, os.getcwd())
        entries = os.listdir(subdir_path)

        if not skip_pdf:
            for name in entries:
                if name.endswith(pdf_ext):
                    pdfs[name[:-len(pdf_ext)]] = os.path.join(subdir_path, name)
        if not skip_images and any(name.endswith(image_ext) for name in entries):
            images[subdir] = subdir_path

    return {"pdf": pdfs, "images": images}
|  | ||||
|  | ||||
def main():
    """Command-line entry point: collect the jobs, run the OCR workflow and exit with its return value."""
    args = parse_arguments()

    # Replace spaces in the names inside the input directory before the jobs
    # are collected.
    strip_files_of_spaces(args.input_dir, os.getcwd())

    # jobs is a dict of dicts:
    #   'pdf'    :: {name: path}
    #   'images' :: {name: path}
    jobs = analyze_jobs(
        args.input_dir,
        image_suffix=args.suffix,
        skip_pdf=args.skip_pdf,
        skip_images=args.skip_images
    )

    workflow = OCRWorkflow(
        jobs, args.input_dir, args.output_dir, args.suffix, args.lang, args.pdf,
        args.intermediate, args.skip_bin, args.nCores, args.mem, args.dry_run
    )
    exit_code = workflow.run(mode="local", nCores=args.nCores, memMb=args.mem, isDryRun=args.dry_run)
    sys.exit(exit_code)


# Run the workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()
							
								
								
									
										43
									
								
								parse_hocr
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										43
									
								
								parse_hocr
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,43 @@ | ||||
| #!/usr/bin/env python3.6 | ||||
| # coding=utf-8 | ||||
|  | ||||
| import xml.etree.ElementTree as ET | ||||
| from xml.sax.saxutils import escape | ||||
| import os | ||||
| import sys | ||||
|  | ||||
# Usage: parse_hocr <hocr_dir> <output_file>
#
# Merges all hOCR files of <hocr_dir> (processed in sorted file name order,
# one file per page) into a single TEI document written to <output_file>.

TEI_HEADER = ('<?xml version="1.0" encoding="UTF-8"?>\n'
              '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
              '    <teiHeader>\n'
              '        <fileDesc>\n'
              '            <titleStmt/>\n'
              '            <publicationStmt/>\n'
              '            <sourceDesc/>\n'
              '        </fileDesc>\n'
              '        <encodingDesc/>\n'
              '        <profileDesc/>\n'
              '    </teiHeader>\n'
              '    <text>\n'
              '        <body>\n')

TEI_FOOTER = ('        </body>\n'
              '    </text>\n'
              '</TEI>\n')


def _write_page(output_file, hocr_dir, input_file):
    """Append the page break and all paragraphs of one hOCR file to output_file."""
    tree = ET.parse(os.path.join(hocr_dir, input_file))
    # The page name is everything before the first dot of the file name. It is
    # written into attribute values, so quotes have to be escaped as well.
    page = escape(input_file.split(".")[0], {'"': "&quot;"})
    output_file.write('            <pb n="%s" facs="%s.tif"/>\n' % (page, page))
    for para in tree.findall(".//*[@class='ocr_par']"):
        output_file.write('            <p>\n')
        for line in para.findall(".//*[@class='ocr_line']"):
            first_word_in_line = True
            for word in line.findall(".//*[@class='ocrx_word']"):
                if word.text is None:
                    continue
                output_file.write(("                " if first_word_in_line else " ") + escape(word.text.strip()))
                first_word_in_line = False
            # Lines without any word text produce no output at all.
            if not first_word_in_line:
                output_file.write('<lb/>\n')
        output_file.write('            </p>\n')


def main(argv=None):
    """Convert a directory of hOCR files into one TEI file.

    Args:
        argv: Argument list in sys.argv layout (program name, hOCR directory,
            output file); defaults to sys.argv.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit("usage: parse_hocr <hocr_dir> <output_file>")
    hocr_dir, output_path = argv[1], argv[2]

    input_files = sorted(os.listdir(hocr_dir))
    # The XML declaration promises UTF-8, so do not rely on the locale's
    # default encoding; the context manager closes the file on errors too.
    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(TEI_HEADER)
        for input_file in input_files:
            _write_page(output_file, hocr_dir, input_file)
        output_file.write(TEI_FOOTER)


if __name__ == "__main__":
    main()
		Reference in New Issue
	
	Block a user