# NOTE(review): Debian stretch is end-of-life; confirm that its package
# repositories are still reachable from the build environment.
FROM debian:stretch-slim

# MAINTAINER is deprecated; the author is recorded as an OCI image label.
LABEL org.opencontainers.image.authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"

# Build-time only: keeps apt/debconf from prompting during RUN steps without
# leaking the setting into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
ENV LANG=C.UTF-8

# OS packages: HTTPS/apt-key tooling, image and PDF utilities,
# Python 2.7 / 3.5 with NumPy, and wget for the downloads.
RUN apt-get update \
 && apt-get install --yes --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        gnupg2 \
        imagemagick \
        libtiff-tools \
        pdftk \
        poppler-utils \
        python-numpy \
        python2.7 \
        python3.5 \
        wget

# Working directory for the following build steps; files fetched with a
# relative path are downloaded and unpacked here.
WORKDIR /root

# Install ocropy
# The release tarball is unpacked in the current working directory and
# installed with Python 2.7. The contents of its PACKAGES file are passed to
# apt-get as package names, so the package index is refreshed and removed
# again inside this same layer (no stale index, no leftover lists).
# NOTE(review): the default model is fetched over plain HTTP without a
# checksum; consider verifying it against a known digest.
ENV OCROPY_VERSION=1.3.3
RUN wget -nv https://github.com/tmbdev/ocropy/archive/v"$OCROPY_VERSION".tar.gz && \
    tar -xzf v"$OCROPY_VERSION".tar.gz && \
    rm v"$OCROPY_VERSION".tar.gz && \
    cd ocropy-"$OCROPY_VERSION" && \
    apt-get update && \
    apt-get install -y --no-install-recommends $(cat PACKAGES) python-pil python-tk && \
    rm -rf /var/lib/apt/lists/* && \
    wget -nv http://www.tmbdev.net/en-default.pyrnn.gz -P models/ && \
    python2.7 setup.py install

# Install pyFlow
# Downloads the release tarball for PYFLOW_VERSION, unpacks it in the current
# working directory, drops the archive and installs it with Python 2.7.
ENV PYFLOW_VERSION=1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    rm pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install

# Install Tesseract OCR and Data Files
# - The repository signing key is downloaded to a file before being handed to
#   apt-key, so a failed download aborts the build instead of being hidden
#   behind a pipe (the default /bin/sh has no pipefail).
# - The apt package lists are removed in the same layer that created them.
# - Each "<repo>/<language>" entry below is fetched from
#   https://github.com/tesseract-ocr/<repo>/raw/master/<language>.traineddata;
#   deu_frak comes from the tessdata repository, the rest from tessdata_best.
# NOTE(review): the models track the master branch and are not pinned or
# checksummed, so rebuilds may pick up different files.
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
    wget -nv -O /tmp/alexp_key.asc https://notesalexp.org/debian/alexp_key.asc && \
    apt-key add /tmp/alexp_key.asc && \
    rm /tmp/alexp_key.asc && \
    apt-get update && \
    apt-get install -y --no-install-recommends tesseract-ocr && \
    rm -rf /var/lib/apt/lists/* && \
    tessdata_dir=/usr/share/tesseract-ocr/4.00/tessdata && \
    for model in \
        tessdata_best/deu \
        tessdata/deu_frak \
        tessdata_best/eng \
        tessdata_best/enm \
        tessdata_best/fra \
        tessdata_best/frm \
        tessdata_best/por \
        tessdata_best/spa; \
    do \
        wget -nv "https://github.com/tesseract-ocr/${model%/*}/raw/master/${model#*/}.traineddata" -P "$tessdata_dir" || exit 1; \
    done

# Place the ocr and hocrtotei files from the build context in /usr/local/bin.
COPY ocr hocrtotei /usr/local/bin/

# Default command (exec form): start bash when `docker run` is given no
# command of its own.
CMD ["/bin/bash"]