# Base image: the slim variant of Debian "buster".
# NOTE(review): later steps install python2.7, python-pil and python-tk from
# this release's apt repositories - confirm they are still available before
# bumping the tag.
FROM debian:buster-slim


# Image metadata: the maintainers and their contact addresses.
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"


# Select the C locale with UTF-8 encoding; as an ENV it applies to every
# following build step and to the running container.
ENV LANG=C.UTF-8


# Refresh the apt package index and install wget, which the download steps
# further down use to fetch the source archives.
RUN apt-get update && \
    apt-get install -y --no-install-recommends wget


# Install the OCR pipeline and its dependencies #
## Install pyFlow ##
# Downloads the pyFlow release tarball from GitHub, installs it with
# python2.7 and removes the unpacked sources and the tarball in the same layer.
# ca-certificates is installed together with python2.7 so that wget can verify
# GitHub's TLS certificate; the download is built and installed as root, so
# certificate checking must not be disabled with --no-check-certificate.
ENV PYFLOW_VERSION=1.1.20
RUN apt-get install --no-install-recommends --yes \
      ca-certificates \
      python2.7 \
 && wget --quiet \
      "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
 && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
 && cd "pyflow-${PYFLOW_VERSION}" \
 && python2.7 setup.py build install \
 && cd - > /dev/null \
 && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"


## Install ocropy ##
# Downloads the tagged ocropy source archive from GitHub, installs its apt
# dependencies and the package itself with python2.7, then removes the
# unpacked sources and the archive in the same layer.
# ca-certificates is installed before the download so that wget can verify
# GitHub's TLS certificate instead of running with --no-check-certificate.
ENV OCROPY_VERSION=1.3.3
RUN apt-get install --no-install-recommends --yes \
      ca-certificates \
 && wget --quiet \
      "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
 && tar -xzf "v${OCROPY_VERSION}.tar.gz" \
 && cd "ocropy-${OCROPY_VERSION}" \
 # The PACKAGES file shipped in the archive lists further apt packages; it is
 # expanded unquoted on purpose so that every entry becomes its own argument.
 && apt-get install --no-install-recommends --yes \
      python2.7 \
      python-pil \
      python-tk \
      $(cat PACKAGES) \
 && python2.7 setup.py install \
 && cd - > /dev/null \
 && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"


## Install Tesseract OCR ##
# Builds Tesseract from the tagged GitHub source archive with the autotools
# toolchain (autogen.sh, configure, make, make install), runs ldconfig so the
# freshly installed shared library is found, and removes the source tree and
# the archive in the same layer.
# ca-certificates is part of the package list so that wget can verify GitHub's
# TLS certificate instead of running with --no-check-certificate.
ENV TESSERACT_VERSION=4.1.1
RUN apt-get install --no-install-recommends --yes \
      autoconf \
      automake \
      ca-certificates \
      g++ \
      libjpeg62-turbo-dev \
      libleptonica-dev \
      libtiff5-dev \
      libtool \
      libpng-dev \
      make \
      pkg-config \
      zlib1g-dev \
 && wget --quiet \
      "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
 && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
 && cd "tesseract-${TESSERACT_VERSION}" \
 && ./autogen.sh \
 && ./configure \
 && make \
 && make install \
 && ldconfig \
 && cd - > /dev/null \
 && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"

# Install the Tesseract language models.
# TESSERACT_MODELS is a comma-separated list of model names; for each name the
# matching <name>.traineddata file is moved from the tessdata_best archive into
# /usr/local/share/tessdata/. The rest of the archive and the tarball are
# removed in the same layer.
# ca-certificates is installed first so that wget can verify GitHub's TLS
# certificate instead of running with --no-check-certificate.
ENV TESSERACT_MODELS="ara,chi_tra,dan,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
ENV TESSDATA_BEST_VERSION=4.1.0
RUN apt-get install --no-install-recommends --yes \
      ca-certificates \
 && wget --quiet \
      "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
 && tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
 && for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do \
      mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; \
    done \
 && rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"


## Further dependencies ##
# Additional apt packages, presumably used by the pipeline scripts at runtime
# (verify against the ocr / hocrtotei scripts).
RUN apt-get install -y --no-install-recommends \
      ghostscript procps python3.7 rename zip


## Install Pipeline ##
# Copy the two pipeline scripts, hocrtotei and ocr, from the build context
# into /usr/local/bin/.
# NOTE(review): assumes both files already carry the executable bit in the
# build context - confirm.
COPY hocrtotei ocr /usr/local/bin/


# Delete the apt package index that `apt-get update` downloaded.
# NOTE(review): this runs in its own layer, so the files stay stored in the
# earlier layer that created them and the image does not get smaller.
# Reclaiming the space requires doing the cleanup inside the same RUN that
# populates the index.
RUN rm -r /var/lib/apt/lists/*


# Run the `ocr` script (copied to /usr/local/bin/ above) as the container's
# main process. Exec form is used, so no shell wraps the process.
ENTRYPOINT ["ocr"]
# Default argument when the container is started without any: show the help
# text. Arguments given to `docker run` replace it.
CMD ["--help"]