# Image for an OCR pipeline: installs ocropy, pyFlow and Tesseract OCR on top
# of Debian 9 (stretch) and runs the bundled `ocr` script as the entrypoint.
#
# NOTE(review): Debian 9 (stretch) is end-of-life; its package repositories may
# only be reachable via archive.debian.org. Moving to a newer base would also
# require revisiting the python2.7/python3.5 packages and the stretch-specific
# Tesseract repository below, so the base is intentionally left unchanged here.
FROM debian:9-slim

LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"

# Build-time only: keeps apt/debconf from prompting during `RUN` steps without
# leaking the setting into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

ENV LANG=C.UTF-8

# Base tooling. apt-transport-https, ca-certificates and gnupg2 are needed
# later to add the HTTPS Tesseract repository and its signing key.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        gnupg2 \
        imagemagick \
        poppler-utils \
        python2.7 \
        python3.5 \
        python-numpy \
        wget && \
    rm -rf /var/lib/apt/lists/*

# Install ocropy
ENV OCROPY_VERSION=1.3.3
# The package lists are refreshed inside this layer because every apt layer
# removes them again when it finishes; the ocropy PACKAGES file lists the
# distribution packages it depends on (hence the deliberately unquoted $(cat)).
# NOTE(review): the default model is fetched over plain HTTP without a
# checksum; consider pinning a sha256 once a trusted value is available.
RUN wget -nv https://github.com/tmbdev/ocropy/archive/v"$OCROPY_VERSION".tar.gz && \
    tar -xzf v"$OCROPY_VERSION".tar.gz && \
    cd ocropy-"$OCROPY_VERSION" && \
    apt-get update && \
    apt-get install -y --no-install-recommends $(cat PACKAGES) python-pil python-tk && \
    wget -nv http://www.tmbdev.net/en-default.pyrnn.gz -P models/ && \
    python2.7 setup.py install && \
    cd .. && \
    rm -r ocropy-"$OCROPY_VERSION" v"$OCROPY_VERSION".tar.gz && \
    rm -rf /var/lib/apt/lists/*

# Install pyFlow
ENV PYFLOW_VERSION=1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install && \
    cd .. && \
    rm -r pyflow-"$PYFLOW_VERSION" pyflow-"$PYFLOW_VERSION".tar.gz

# Install Tesseract OCR and Data Files
# The signing key is downloaded to a file instead of being piped into apt-key,
# so a failed download aborts the build rather than being masked by the pipe.
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
    wget -nv -O /tmp/alexp_key.asc https://notesalexp.org/debian/alexp_key.asc && \
    apt-key add /tmp/alexp_key.asc && \
    rm /tmp/alexp_key.asc && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        tesseract-ocr \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-enm \
        tesseract-ocr-fra \
        tesseract-ocr-frk \
        tesseract-ocr-frm \
        tesseract-ocr-ita \
        tesseract-ocr-por \
        tesseract-ocr-spa && \
    rm -rf /var/lib/apt/lists/*

# Pipeline scripts from the build context; both land in /usr/local/bin.
COPY hocrtotei ocr /usr/local/bin/

# NOTE(review): no USER is set, so the entrypoint runs as root. Switching to a
# non-root user may affect writes to bind-mounted output directories — confirm
# against how the image is invoked before changing.
ENTRYPOINT ["ocr"]