diff --git a/Dockerfile b/Dockerfile index 97734ff..ae7f5fb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM debian:stretch-slim -MAINTAINER Patrick Jentsch +LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de" ENV DEBIAN_FRONTEND=noninteractive ENV LANG=C.UTF-8 @@ -11,34 +11,31 @@ RUN apt-get update && \ ca-certificates \ gnupg2 \ imagemagick \ - pdftk \ poppler-utils \ python2.7 \ python3.5 \ python-numpy \ wget -WORKDIR /root - # Install ocropy ENV OCROPY_VERSION 1.3.3 RUN wget -nv https://github.com/tmbdev/ocropy/archive/v"$OCROPY_VERSION".tar.gz && \ tar -xzf v"$OCROPY_VERSION".tar.gz && \ - rm v"$OCROPY_VERSION".tar.gz && \ cd ocropy-"$OCROPY_VERSION" && \ apt-get install -y --no-install-recommends $(cat PACKAGES) python-pil python-tk && \ wget -nv http://www.tmbdev.net/en-default.pyrnn.gz -P models/ && \ python2.7 setup.py install && \ - cd .. + cd .. && \ + rm -r v"$OCROPY_VERSION".tar.gz ocropy-"$OCROPY_VERSION" # Install pyFlow ENV PYFLOW_VERSION 1.1.20 RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \ tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \ - rm pyflow-"$PYFLOW_VERSION".tar.gz && \ cd pyflow-"$PYFLOW_VERSION" && \ python2.7 setup.py build install && \ - cd .. + cd .. 
&& \ + rm -r pyflow-"$PYFLOW_VERSION".tar.gz pyflow-"$PYFLOW_VERSION" # Install Tesseract OCR and Data Files RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \ @@ -52,11 +49,12 @@ RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /et wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ - wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ + wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \ wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata -COPY ocr /usr/local/bin COPY hocrtotei /usr/local/bin +COPY ocr /usr/local/bin -CMD ["/bin/bash"] +ENTRYPOINT ["ocr"] +CMD ["--help"] diff --git a/hocrtotei b/hocrtotei index a6e4963..fde4b46 100755 --- a/hocrtotei +++ b/hocrtotei @@ -7,22 +7,31 @@ import os import re import sys -input_files = filter(lambda x: x.endswith(".hocr"), sorted(os.listdir(sys.argv[1]))) +input_files = sorted( + filter( + lambda x: x.endswith(".hocr"), + os.listdir(sys.argv[1]) + ), + key=lambda x: int(re.search(r'\d+', x).group(0)) +) +# "page-1.hocr" -> "1" output_file = open(sys.argv[2], "w") -output_file.write('\n' + - '\n' + - ' \n' + - ' \n' + - ' \n' + - ' \n' + - ' \n' + - ' \n' + - ' \n' + - ' \n' + - ' \n' + - ' \n' + - ' \n') +output_file.write( + '\n' + + '\n' + + ' \n' + + ' 
\n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' + + ' \n' +) for input_file in input_files: tree = ET.parse(os.path.join(sys.argv[1], input_file)) @@ -40,7 +49,9 @@ for input_file in input_files: output_file.write('\n') output_file.write('

\n') -output_file.write(' \n' + - '
\n' + - '
') +output_file.write( + ' \n' + + '
\n' + + '
') + output_file.close() diff --git a/ocr b/ocr index f672a4a..34a9c7e 100755 --- a/ocr +++ b/ocr @@ -19,32 +19,27 @@ from pyflow import WorkflowRunner ''' TODO: ' Implement --end-page: Last page to ocr -' Implement --memMb: Total amount of memory (RAM) available for this workflow. Default: 2048 * nCores +' Implement --memMb: Total amount of memory (RAM) available for this workflow. +' Default: 2048 * nCores ' Implement --rotate: Rotate pages from input (90, 180, 270) ' Implement --split-pages: Split pages in half after possible rotation ' Implement --start-page: First page to ocr ''' + + def parse_arguments(): parser = argparse.ArgumentParser( "Performs OCR of (historical) documents utilizing OCRopus for \ preprocessing and Tesseract OCR for OCR. Available outputs are HOCR, \ PDF, shrinked PDF, and simple DTAbf (TEI P5 compliant). Software \ requirements: imagemagick, ocropus, pdftk, pdftoppm, poppler-utils, \ - pyflow, python2.7, tesseract" + pyflow, python2.7, python3.5, tesseract" ) - parser.add_argument("-i", - dest="inputDir", - help="Input directory.", - required=True) parser.add_argument("-l", dest='lang', help="Language for OCR", required=True) - parser.add_argument("-o", - dest="outputDir", - help="Output directory.", - required=True) parser.add_argument("--skip-binarization", action='store_true', default=False, @@ -67,14 +62,16 @@ def parse_arguments(): class OCRWorkflow(WorkflowRunner): - def __init__(self, jobs, skipBinarization, keepIntermediates, lang, nCores): - self.jobs = jobs - self.skipBinarization = skipBinarization - self.keepIntermediates = keepIntermediates - self.lang = lang - self.nCores = nCores - self.defaultNCores = min(nCores, max(1, int(nCores / len(jobs)))) - + def __init__(self, args): + self.jobs = analyze_jobs() + self.skipBinarization = args.skipBinarization + self.keepIntermediates = args.keepIntermediates + self.lang = args.lang + self.nCores = args.nCores + self.defaultNCores = min( + self.nCores, + max(1, int(self.nCores / 
len(self.jobs))) + ) def workflow(self): ### @@ -93,10 +90,17 @@ class OCRWorkflow(WorkflowRunner): ) if not self.skipBinarization: cmd += ' "%s" "%s"' % ( - os.path.join(job["output_dir"], "tmp", "binarized_png"), - os.path.join(job["output_dir"], "tmp", "normalized_png"), + os.path.join(job["output_dir"], "tmp", "bin.png"), + os.path.join(job["output_dir"], "tmp", "nrm.png"), + ) + create_output_directories_jobs.append( + self.addTask( + command=cmd, + label="create_output_directories_job_-_%i" % ( + create_output_directories_job_number + ), + nCores=self.defaultNCores) ) - create_output_directories_jobs.append(self.addTask(label="create_output_directories_job_-_%i" % (create_output_directories_job_number), command=cmd, nCores=self.defaultNCores)) ### # Task "split_job": split input file into one tiff file per page @@ -116,7 +120,14 @@ class OCRWorkflow(WorkflowRunner): job["path"], os.path.join(job["output_dir"], "tmp", "page") ) - split_jobs.append(self.addTask(label="split_job_-_%i" % (split_job_number), command=cmd, dependencies=create_output_directories_jobs, nCores=self.defaultNCores)) + split_jobs.append( + self.addTask( + command=cmd, + dependencies=create_output_directories_jobs, + label="split_job_-_%i" % (split_job_number), + nCores=self.defaultNCores + ) + ) ### # Task "ocropus_nlbin_job": binarize tiff files from previous split @@ -132,12 +143,21 @@ class OCRWorkflow(WorkflowRunner): if not self.skipBinarization: for job in self.jobs: binarization_job_number += 1 - cmd = 'ocropus-nlbin --output "%s" --parallel "%i" $(ls "%s"/*.tif | sort -V)' % ( + cmd = 'ocropus-nlbin --output "%s" --parallel "%i" $(ls --quoting-style=shell-escape -v "%s"/*.tif)' % ( os.path.join(job["output_dir"], "tmp"), binarization_job_nCores, os.path.join(job["output_dir"], "tmp") ) - binarization_jobs.append(self.addTask(label="binarization_job_-_%i" % (binarization_job_number), command=cmd, dependencies=split_jobs, nCores=binarization_job_nCores)) + binarization_jobs.append( 
+ self.addTask( + command=cmd, + dependencies=split_jobs, + label="binarization_job_-_%i" % ( + binarization_job_number + ), + nCores=binarization_job_nCores + ) + ) ### # Task "post_binarization_job": Normalize file names from binarization @@ -152,9 +172,21 @@ class OCRWorkflow(WorkflowRunner): post_binarization_job_number += 1 cmd = 'mv "%s" "%s"' % ( os.path.join(job["output_dir"], "tmp", file), - os.path.join(job["output_dir"], "tmp", "page-%i.%s" % (int(file.split(".", 1)[0]), file.split(".", 1)[1])), + os.path.join(job["output_dir"], "tmp", "page-%i.%s" % ( + int(file.split(".", 1)[0]), + file.split(".", 1)[1]) + ), + ) + post_binarization_jobs.append( + self.addTask( + command=cmd, + dependencies=binarization_jobs, + label="post_binarization_job_-_%i" % ( + post_binarization_job_number + ), + nCores=self.defaultNCores + ) ) - post_binarization_jobs.append(self.addTask(label="post_binarization_job_-_%i" % (post_binarization_job_number), command=cmd, dependencies=binarization_jobs, nCores=self.defaultNCores)) ### # Task "ocr_job": perform OCR @@ -165,8 +197,8 @@ class OCRWorkflow(WorkflowRunner): ocr_job_number = 0 ''' ' Tesseract runs fastest with four cores. So we run it with either four - ' or, if there are less then four cores available for this workflow, the - ' available core number. + ' or, if there are less than four cores available for this workflow, + ' the available core number. 
''' ocr_job_nCores = min(4, self.nCores) ''' @@ -183,7 +215,14 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job["output_dir"], "tmp", file.rsplit(".", 1 if self.skipBinarization else 2)[0]), self.lang ) - ocr_jobs.append(self.addTask(label="ocr_job_-_%i" % (ocr_job_number), command=cmd, dependencies=post_binarization_jobs, nCores=ocr_job_nCores)) + ocr_jobs.append( + self.addTask( + command=cmd, + dependencies=post_binarization_jobs, + label="ocr_job_-_%i" % (ocr_job_number), + nCores=ocr_job_nCores + ) + ) ### # Task "hocr_to_tei_job": create TEI P5 file from hocr files @@ -197,7 +236,14 @@ class OCRWorkflow(WorkflowRunner): os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".xml") ) - hocr_to_tei_jobs.append(self.addTask(label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), command=cmd, dependencies=ocr_jobs, nCores=self.defaultNCores)) + hocr_to_tei_jobs.append( + self.addTask( + command=cmd, + dependencies=ocr_jobs, + label="hocr_to_tei_job_-_%i" % (hocr_to_tei_job_number), + nCores=self.defaultNCores + ) + ) ### # Task "pdf_merge_job": Merge PDF files @@ -207,11 +253,18 @@ class OCRWorkflow(WorkflowRunner): pdf_merge_job_number = 0 for job in self.jobs: pdf_merge_job_number += 1 - cmd = 'pdftk $(ls "%s"/*.pdf | sort -V) cat output "%s"' % ( + cmd = '(ls --quoting-style=shell-escape -v "%s"/*.pdf && echo "\'%s\'") | xargs pdfunite' % ( os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".pdf") ) - pdf_merge_jobs.append(self.addTask(label="pdf_merge_job_-_%i" % (pdf_merge_job_number), command=cmd, dependencies=ocr_jobs, nCores=self.defaultNCores)) + pdf_merge_jobs.append( + self.addTask( + command=cmd, + dependencies=ocr_jobs, + label="pdf_merge_job_-_%i" % (pdf_merge_job_number), + nCores=self.defaultNCores + ) + ) ### # Task "txt_merge_job": Merge .txt files @@ -221,11 +274,18 @@ class OCRWorkflow(WorkflowRunner): txt_merge_job_number = 0 
for job in self.jobs: txt_merge_job_number += 1 - cmd = 'cat $(ls "%s"/*.txt | sort -V) > "%s"' % ( + cmd = 'ls --quoting-style=shell-escape -v "%s"/*.txt | xargs cat > "%s"' % ( os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], job["filename"].rsplit(".", 1)[0] + ".txt") ) - txt_merge_jobs.append(self.addTask(label="txt_merge_job_-_%i" % (txt_merge_job_number), command=cmd, dependencies=ocr_jobs, nCores=self.defaultNCores)) + txt_merge_jobs.append( + self.addTask( + command=cmd, + dependencies=ocr_jobs, + label="txt_merge_job_-_%i" % (txt_merge_job_number), + nCores=self.defaultNCores + ) + ) ### # Task "cleanup_job": remove temporary files @@ -236,35 +296,59 @@ class OCRWorkflow(WorkflowRunner): if self.keepIntermediates: for job in self.jobs: cleanup_job_counter += 1 - cmd = 'mv "%s"/*.hocr "%s" && mv "%s"/*.pdf "%s" && mv "%s"/*.tif "%s" && mv "%s"/*.txt "%s"' % ( + cmd = 'mv "%s"/*.hocr "%s"' % ( os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], "tmp", "hocr"), + ) + cmd += ' && mv "%s"/*.pdf "%s"' % ( os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], "tmp", "pdf"), + ) + cmd += ' && mv "%s"/*.tif "%s"' % ( os.path.join(job["output_dir"], "tmp"), os.path.join(job["output_dir"], "tmp", "tiff"), + ) + cmd += ' && mv "%s"/*.txt "%s"' % ( os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], "tmp", "txt") + os.path.join(job["output_dir"], "tmp", "txt"), ) if not self.skipBinarization: - cmd += ' && mv "%s"/*.bin.png "%s" && mv "%s"/*.nrm.png "%s"' % ( + cmd += ' && mv "%s"/*.bin.png "%s"' % ( os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], "tmp", "binarized_png"), - os.path.join(job["output_dir"], "tmp"), - os.path.join(job["output_dir"], "tmp", "normalized_png"), + os.path.join(job["output_dir"], "tmp", "bin.png"), ) - cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + 
pdf_merge_jobs + txt_merge_jobs, nCores=self.defaultNCores)) + cmd += ' && mv "%s"/*.nrm.png "%s"' % ( + os.path.join(job["output_dir"], "tmp"), + os.path.join(job["output_dir"], "tmp", "nrm.png"), + ) + cleanup_jobs.append( + self.addTask( + command=cmd, + dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs, + label="cleanup_job_-_%i" % (cleanup_job_counter), + nCores=self.defaultNCores + ) + ) else: for job in self.jobs: cleanup_job_counter += 1 cmd = 'rm -r "%s"' % ( os.path.join(job["output_dir"], "tmp") ) - cleanup_jobs.append(self.addTask(label="cleanup_job_-_%i" % (cleanup_job_counter), command=cmd, dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs), nCores=self.defaultNCores) + cleanup_jobs.append( + self.addTask( + label="cleanup_job_-_%i" % (cleanup_job_counter), + command=cmd, + dependencies=hocr_to_tei_jobs + pdf_merge_jobs + txt_merge_jobs, + nCores=self.defaultNCores + ) + ) -def analyze_jobs(inputDir, outputDir): +def analyze_jobs(): + inputDir = "/files_for_ocr" jobs = [] + outputDir = "/files_from_ocr" for file in os.listdir(inputDir): if os.path.isdir(os.path.join(inputDir, file)): @@ -273,7 +357,13 @@ def analyze_jobs(inputDir, outputDir): os.path.join(outputDir, file) ) elif file.endswith((".pdf", ".tif", ".tiff")): - jobs.append({"filename": file, "output_dir": os.path.join(outputDir, file), "path": os.path.join(inputDir, file)}) + jobs.append( + { + "filename": file, + "output_dir": os.path.join(outputDir, file), + "path": os.path.join(inputDir, file) + } + ) return jobs @@ -281,15 +371,10 @@ def analyze_jobs(inputDir, outputDir): def main(): args = parse_arguments() - wflow = OCRWorkflow( - analyze_jobs(args.inputDir, args.outputDir), - args.skipBinarization, - args.keepIntermediates, - args.lang, - args.nCores - ) + wflow = OCRWorkflow(args) + + retval = wflow.run(dataDirRoot="/files_from_ocr", nCores=args.nCores) - retval = wflow.run(nCores=args.nCores) sys.exit(retval)