# Base image is the slim variant of Debian 10.
FROM debian:10-slim


# Contact information of the authors, stored as image metadata.
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"


# Locale setting; as an ENV it applies to the build steps and to the running
# container.
ENV LANG=C.UTF-8


# Refresh the apt package index so that `apt-get install` in later steps can
# resolve packages.
# NOTE(review): as a standalone layer this result can be reused from the build
# cache long after it was fetched, so the index may be out of date by the time
# a later install step is rebuilt.
RUN apt-get update


## Install pyFlow ##
# PYFLOW_RELEASE selects the upstream release tarball fetched from GitHub.
ENV PYFLOW_RELEASE=1.1.20
# NOTE: the downloaded archive remains part of this ADD layer even though the
# RUN below deletes it from the resulting filesystem.
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
# The package index is refreshed in the same layer as the install so that a
# cached, outdated index can never be combined with this install step.
# Afterwards the sources are unpacked, built and installed with Python 2.7,
# and the unpacked tree and the archive are removed again.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      python2.7 \
 && tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
 && cd "pyflow-${PYFLOW_RELEASE}" \
 && python2.7 setup.py build install \
 && cd .. \
 && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"


## Install ocropy ##
# OCROPY_RELEASE selects the tagged source archive fetched from GitHub.
ENV OCROPY_RELEASE=1.3.3
# NOTE: the downloaded archive remains part of this ADD layer even though the
# RUN below deletes it from the resulting filesystem.
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
# The package index is refreshed in the same layer as the install so that a
# cached, outdated index can never be combined with this install step.
# `$(cat PACKAGES)` expands to the contents of the PACKAGES file inside the
# unpacked source tree, so the install has to run after the `cd`; it is left
# unquoted on purpose so that each entry becomes a separate argument.
RUN apt-get update \
 && tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
 && cd "ocropy-${OCROPY_RELEASE}" \
 && apt-get install -y --no-install-recommends \
      python-pil \
      python-tk \
      $(cat PACKAGES) \
 && python2.7 setup.py install \
 && cd .. \
 && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"


## Install Tesseract OCR ##
# TESSERACT_RELEASE selects the tagged source archive fetched from GitHub.
ENV TESSERACT_RELEASE=4.1.1
# NOTE: the downloaded archive remains part of this ADD layer even though the
# RUN below deletes it from the resulting filesystem.
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
# The package index is refreshed in the same layer as the install so that a
# cached, outdated index can never be combined with this install step.
# Tesseract is then built from source (autogen, configure, make) and
# installed; `ldconfig` refreshes the shared library cache after
# `make install`. Finally the source tree and the archive are removed.
RUN apt-get update \
 && tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
 && cd "tesseract-${TESSERACT_RELEASE}" \
 && apt-get install -y --no-install-recommends \
      autoconf \
      automake \
      g++ \
      libjpeg62-turbo-dev \
      libleptonica-dev \
      libpng-dev \
      libtiff5-dev \
      libtool \
      make \
      pkg-config \
      zlib1g-dev \
 && ./autogen.sh \
 && ./configure \
 && make \
 && make install \
 && ldconfig \
 && cd .. \
 && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
# Trained language models from the tessdata_best repository.
# NOTE(review): these URLs track the `master` branch rather than a fixed
# release, so the downloaded models can change between builds.
ADD "https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/frk.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata" \
    "/usr/local/share/tessdata/"
# Files fetched by ADD from a URL are created with mode 600; make the models
# readable for every user.
RUN chmod 644 /usr/local/share/tessdata/*.traineddata


## Install Pipeline ##
# The package index is refreshed in the same layer as the install so that a
# cached, outdated index can never be combined with this install step.
# `--no-cache-dir` keeps pip's download cache out of the image layer.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      ghostscript \
      python-pip \
      python3.7 \
      zip \
 && pip install --no-cache-dir natsort
# Place the pipeline executables from the build context in /usr/local/bin.
COPY "hocrtotei" "ocr" "/usr/local/bin/"


## Cleanup ##
# Remove the apt package index lists from the final filesystem. `-f` keeps
# the build from failing when the directory is already empty (an unmatched
# glob is otherwise passed to rm literally and reported as a missing file).
# NOTE: because this runs as its own layer, it does not reduce the size of
# earlier layers that already contain the lists.
RUN rm -rf /var/lib/apt/lists/*


# Start the `ocr` executable as the container's main process (exec form, so
# no shell is wrapped around it).
ENTRYPOINT ["ocr"]
# Default argument passed to the entrypoint; it is replaced by any arguments
# supplied when the container is started.
CMD ["--help"]