# OCR tool image based on Ubuntu 18.04.
# Installs Ghostscript, ImageMagick, libtiff-tools, pdftk, poppler-utils,
# Tesseract (plus extra language models), pyFlow and OCRopus (ocropy), then
# copies the ocr_pyflow and parse_hocr scripts into /usr/local/bin.
FROM ubuntu:18.04

# MAINTAINER is deprecated; a label carries the same information.
LABEL maintainer="Patrick Jentsch"

# Build-time only: keeps apt-get non-interactive during the build without
# leaking DEBIAN_FRONTEND into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

ENV LANG=en_US.UTF-8 \
    PYFLOW_VERSION=1.1.20

# Add PPA for pdftk. gnupg2 is required by apt-key to import the signing key.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        gnupg2 \
 && echo "deb http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list \
 && echo "deb-src http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list \
 && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BCA68C33DA36783B03E981C820D0BB61B700CE29 \
 && rm -rf /var/lib/apt/lists/*

# Install the OS-level tools. `apt-get update` runs in the same layer as the
# install so a cached, stale package index is never used, and the index is
# removed again in that layer so it does not end up in the image.
# ca-certificates is listed explicitly because the downloads below use HTTPS
# and --no-install-recommends does not guarantee it is pulled in.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        ca-certificates \
        ghostscript \
        git \
        imagemagick \
        libtiff-tools \
        locales \
        pdftk \
        poppler-utils \
        python2.7 \
        python3.6 \
        tesseract-ocr \
        wget \
 && rm -rf /var/lib/apt/lists/*

# Configure locales
RUN locale-gen "$LANG"

WORKDIR /root

# Install pyFlow.
# The tarball is removed in the layer that downloaded it; the unpacked source
# tree is kept under /root, as before.
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz \
 && tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz \
 && rm pyflow-"$PYFLOW_VERSION".tar.gz
WORKDIR /root/pyflow-${PYFLOW_VERSION}
RUN python2.7 setup.py build install
WORKDIR /root

# Install Tesseract OCR data files.
# -O writes to the exact target path. With -P, wget never overwrites: if a
# file of the same name already exists in the directory (for example a model
# shipped by a distribution package), the download is stored as "<name>.1"
# and Tesseract would keep using the old file.
# NOTE(review): the tessdata_best URLs track the `master` branch and are
# therefore not pinned to a fixed revision.
RUN set -eu; \
    tessdata_dir=/usr/share/tesseract-ocr/4.00/tessdata; \
    tessdata_best=https://github.com/tesseract-ocr/tessdata_best/raw/master; \
    for lang in deu eng enm fra frm; do \
        wget -nv -O "$tessdata_dir/$lang.traineddata" "$tessdata_best/$lang.traineddata"; \
    done; \
    wget -nv -O "$tessdata_dir/deu_frak.traineddata" \
        https://github.com/tesseract-ocr/tessdata/raw/3.04.00/deu_frak.traineddata

# Install OCRopus.
# NOTE(review): the clone takes the default branch and is not pinned to a tag
# or commit.
RUN git clone https://github.com/tmbdev/ocropy
WORKDIR /root/ocropy
# The unquoted $(cat PACKAGES) is intentional: the file lists package names
# that must be split into separate apt-get arguments.
# The default model is placed in models/ before setup.py runs.
# NOTE(review): the model URL is left on plain HTTP because HTTPS availability
# of that host has not been verified.
RUN apt-get update \
 && apt-get install -y --no-install-recommends $(cat PACKAGES) \
 && rm -rf /var/lib/apt/lists/* \
 && wget -nv -P models/ http://www.tmbdev.net/en-default.pyrnn.gz \
 && python2.7 setup.py install
WORKDIR /root

# The trailing slash makes it explicit that the destination is a directory.
COPY ocr_pyflow parse_hocr /usr/local/bin/

# Declared after all build steps that write below /root.
VOLUME /root/files_for_ocr

# NOTE(review): the container runs as root; the working directory and the
# volume live under /root, so switching to an unprivileged USER would change
# the image's interface and is left for a separate change.
CMD ["/bin/bash"]