# Image for the OCR pipeline: installs ocropy, pyFlow and Tesseract OCR on top
# of Debian 9 and uses the `ocr` script as the container entrypoint.
#
# NOTE(review): Debian 9 (stretch) is end-of-life; its package repositories may
# only be reachable through archive.debian.org -- confirm this base still builds.
FROM debian:9-slim

# Define image metadata
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"

# Install prerequisites
# `update`, `install` and the list cleanup share one layer so the apt index is
# never stale and never shipped in the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      apt-transport-https \
      ca-certificates \
      gnupg2 \
      imagemagick \
      poppler-utils \
      python2.7 \
      python3.5 \
      wget \
 && rm -rf /var/lib/apt/lists/*

# Install ocropy
# The tarball is downloaded, unpacked, installed and deleted inside ONE layer;
# fetching it with a separate `ADD <url>` would keep the archive in the image
# even though a later RUN removes it.
# The distro packages listed in ocropy's PACKAGES file are installed as well;
# $(cat PACKAGES) is left unquoted on purpose so it splits into package names.
# TODO(review): verify the download against a recorded sha256 digest.
ENV OCROPY_VERSION=1.3.3
RUN wget -nv -O ocropy.tar.gz "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
 && tar -xzf ocropy.tar.gz \
 && cd "ocropy-${OCROPY_VERSION}" \
 && apt-get update \
 && apt-get install -y --no-install-recommends \
      python-pil \
      python-tk \
      $(cat PACKAGES) \
 && rm -rf /var/lib/apt/lists/* \
 && python2.7 setup.py install \
 && cd .. \
 && rm -rf \
      "ocropy-${OCROPY_VERSION}" \
      ocropy.tar.gz

# Install pyFlow
# Same single-layer download/build/cleanup pattern as for ocropy.
# TODO(review): verify the download against a recorded sha256 digest.
ENV PYFLOW_VERSION=1.1.20
RUN wget -nv -O pyflow.tar.gz "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
 && tar -xzf pyflow.tar.gz \
 && cd "pyflow-${PYFLOW_VERSION}" \
 && python2.7 setup.py build install \
 && cd .. \
 && rm -rf \
      "pyflow-${PYFLOW_VERSION}" \
      pyflow.tar.gz

# Install Tesseract OCR and data files
# The repository key is saved to a file before being imported: piping
# `wget | apt-key add -` under /bin/sh (no pipefail) reports only the exit
# status of the last command, so a failed download could go unnoticed.
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list \
 && wget -nv -O /tmp/alexp_key.asc https://notesalexp.org/debian/alexp_key.asc \
 && apt-key add /tmp/alexp_key.asc \
 && rm /tmp/alexp_key.asc \
 && apt-get update \
 && apt-get install -y --no-install-recommends \
      tesseract-ocr \
      tesseract-ocr-deu \
      tesseract-ocr-eng \
      tesseract-ocr-enm \
      tesseract-ocr-fra \
      tesseract-ocr-frk \
      tesseract-ocr-frm \
      tesseract-ocr-ita \
      tesseract-ocr-por \
      tesseract-ocr-spa \
 && rm -rf /var/lib/apt/lists/*

# Install OCR pipeline
# Trailing slash makes the destination unambiguously a directory.
COPY hocrtotei /usr/local/bin/
COPY ocr /usr/local/bin/

# `ocr` is the container's executable; `--help` is only the default argument
# and is replaced by whatever arguments are given to `docker run`.
ENTRYPOINT ["ocr"]
CMD ["--help"]