# Image providing an OCR toolchain: pdftk, poppler-utils, pyFlow (run with
# python2.7), Tesseract OCR with a fixed set of language models, and the
# project's `hocrtotei` and `ocr` entries copied into /usr/local/bin.
#
# NOTE(review): Debian stretch is end-of-life; `apt-get update` may need the
# archive.debian.org mirrors to keep working. The Tesseract apt repository
# below is stretch-specific, so the base image is left unchanged here.
FROM debian:stretch-slim

# LABEL replaces the deprecated MAINTAINER instruction.
LABEL maintainer="Patrick Jentsch"

ENV LANG=C.UTF-8

# Base tooling. apt-transport-https, ca-certificates and gnupg2 are needed
# later to add the HTTPS-only third-party Tesseract repository and its key.
# The apt package lists are removed in the same layer so they do not end up
# in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        gnupg2 \
        pdftk \
        poppler-utils \
        python2.7 \
        python3 \
        wget \
        zip && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /root

# Install pyFlow
# The release tarball is unpacked below /root and installed with python2.7.
# The unpacked source directory is kept; only the tarball is deleted.
ENV PYFLOW_VERSION=1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    rm pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install

# Install Tesseract OCR and Data Files
# The repository signing key is downloaded to a file first (instead of being
# piped into apt-key) so that a failed download aborts the build rather than
# being hidden behind the exit status of the last command in a pipe.
# All models come from tessdata_best except deu_frak, which is fetched from
# the tessdata repository.
# NOTE(review): the model URLs track `master` and are therefore not pinned;
# consider pinning to a tag or commit for reproducible builds.
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" > /etc/apt/sources.list.d/notesalexp.list && \
    wget -nv -O /tmp/alexp_key.asc https://notesalexp.org/debian/alexp_key.asc && \
    apt-key add /tmp/alexp_key.asc && \
    rm /tmp/alexp_key.asc && \
    apt-get update && \
    apt-get install -y --no-install-recommends tesseract-ocr && \
    rm -rf /var/lib/apt/lists/* && \
    wget -nv -P /usr/share/tesseract-ocr/4.00/tessdata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata \
        https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata

# Working directories, created relative to WORKDIR (/root).
RUN mkdir files_for_ocr files_from_ocr

# The trailing slash marks the destination as a directory, which is required
# when copying more than one source in a single instruction.
COPY hocrtotei ocr /usr/local/bin/

CMD ["/bin/bash"]