# OCR toolchain image: Tesseract, OCRopus and pyFlow on top of Ubuntu 18.04.
FROM ubuntu:18.04
# MAINTAINER is deprecated; the author is recorded as an OCI image label instead.
LABEL org.opencontainers.image.authors="Patrick Jentsch"
# Build-time only: keeps apt/debconf from prompting during the RUN steps of this
# build without leaking the setting into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
# LANG is exported to running containers and is the locale passed to locale-gen.
# PYFLOW_VERSION selects the pyFlow release that is downloaded and installed.
ENV LANG=en_US.UTF-8 \
    PYFLOW_VERSION=1.1.20
# Install gnupg2 ahead of the PPA step, whose `apt-key adv` call relies on GnuPG.
# The package lists are deleted in the same layer so they do not inflate the
# image; the next apt-get step refreshes them itself.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      gnupg2 \
 && rm -rf /var/lib/apt/lists/*
# Register the malteworld PPA (binary and source entries) that provides pdftk,
# then import the PPA's signing key from the Ubuntu keyserver.
RUN printf '%s\n' \
      "deb http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" \
      "deb-src http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" \
      >> /etc/apt/sources.list \
 && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BCA68C33DA36783B03E981C820D0BB61B700CE29
# Install the OCR toolchain and the helper tools used by the build steps of this
# file (git for cloning, wget for downloads, locales for locale-gen, python2.7
# for the setup.py installs). One package per line, sorted alphabetically.
# NOTE: the apt package lists are not removed in this layer; a later
# `apt-get install` step may rely on them being present.
RUN apt-get update \
 && apt-get install --yes --no-install-recommends \
      ghostscript \
      git \
      imagemagick \
      libtiff-tools \
      locales \
      pdftk \
      poppler-utils \
      python2.7 \
      python3.6 \
      tesseract-ocr \
      wget
# Generate the locale named by the LANG environment variable
RUN locale-gen "${LANG}"
# Subsequent build steps (and containers by default) start in /root
WORKDIR /root
# Install pyFlow
# The release tarball is fetched over HTTPS directly rather than through a
# plain-HTTP first hop, since its setup.py is executed as root during the build.
# The tarball is deleted right after unpacking, in the same layer; the unpacked
# source tree pyflow-<version> stays in the working directory.
RUN wget -nv "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
 && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
 && rm "pyflow-${PYFLOW_VERSION}.tar.gz" \
 && cd "pyflow-${PYFLOW_VERSION}" \
 && python2.7 setup.py build install
# Install Tesseract OCR data files
# deu, eng, enm, fra and frm come from tessdata_best (master); deu_frak comes
# from the 3.04.00 tag of the tessdata repository.
# Each file is written with -O to its exact target path: with `wget -P`, a
# model that already exists in the directory would not be replaced but saved
# next to it as "<name>.traineddata.1" and therefore never be picked up.
# Downloads use HTTPS directly instead of relying on an HTTP redirect.
RUN tessdata_dir=/usr/share/tesseract-ocr/4.00/tessdata \
 && mkdir -p "${tessdata_dir}" \
 && for model in deu eng enm fra frm; do \
      wget -nv -O "${tessdata_dir}/${model}.traineddata" \
        "https://github.com/tesseract-ocr/tessdata_best/raw/master/${model}.traineddata" \
      || exit 1; \
    done \
 && wget -nv -O "${tessdata_dir}/deu_frak.traineddata" \
      "https://github.com/tesseract-ocr/tessdata/raw/3.04.00/deu_frak.traineddata"
# Install OCRopus
# The repository is cloned over HTTPS. The apt packages named in the
# repository's PACKAGES file are installed after a fresh `apt-get update` in
# this same layer, so the step does not depend on package lists cached by an
# earlier layer; the lists are removed again afterwards.
# $(cat PACKAGES) is intentionally unquoted so each listed name becomes its own
# argument.
# NOTE(review): the default model is still fetched over plain HTTP without a
# checksum -- confirm whether an HTTPS source or a known digest is available.
RUN git clone https://github.com/tmbdev/ocropy \
 && cd ocropy \
 && apt-get update \
 && apt-get install -y --no-install-recommends $(cat PACKAGES) \
 && rm -rf /var/lib/apt/lists/* \
 && wget -nv http://www.tmbdev.net/en-default.pyrnn.gz \
 && mv en-default.pyrnn.gz models/ \
 && python2.7 setup.py install
# Place the two entry scripts from the build context into /usr/local/bin
COPY ocr_pyflow /usr/local/bin/
COPY parse_hocr /usr/local/bin/
# Mount point declared as a volume
VOLUME ["/root/files_for_ocr"]
# Default to an interactive shell
CMD ["/bin/bash"]