This commit is contained in:
Patrick Jentsch
2018-10-29 10:38:50 +01:00
parent ce864e205a
commit 132490a929
3 changed files with 219 additions and 441 deletions

View File

@@ -1,66 +1,57 @@
# Base image, maintainer metadata and build-wide environment variables.
# NOTE(review): this text looks like a rendered diff whose removed and added
# lines are interleaved without +/- markers, so some settings appear twice.
# Presumably ubuntu:18.04 is the previous base image and debian:stretch the
# current one -- confirm against the repository before building.
FROM ubuntu:18.04
FROM debian:stretch
# NOTE(review): MAINTAINER is a deprecated instruction; LABEL is the
# recommended replacement for author metadata.
MAINTAINER Patrick Jentsch <p.jentsch@uni-bielefeld.de>
# ENV values persist into containers started from the image, so this setting
# stays active at runtime as well as during the apt-get steps of the build.
ENV DEBIAN_FRONTEND=noninteractive
# LANG is assigned twice; if both lines are kept, the later value (C.UTF-8)
# overrides the earlier one. "$LANG" is later passed to locale-gen.
ENV LANG=en_US.UTF-8
ENV LANG=C.UTF-8
# Release versions substituted into the pyFlow and ocropy download URLs and
# directory names further down (legacy space-separated ENV form).
ENV PYFLOW_VERSION 1.1.20
ENV OCROPY_VERSION 1.3.3
# Install gnupg2 ahead of the apt-key call in the next RUN step
# (presumably apt-key needs it to import the PPA signing key -- TODO confirm).
RUN apt-get update && \
apt-get install -y --no-install-recommends \
gnupg2
# Add PPA for pdftk
# Appends binary (deb) and source (deb-src) entries for the malteworld
# Launchpad PPA, "bionic" series, to /etc/apt/sources.list and then fetches
# the signing key with the fingerprint below from keyserver.ubuntu.com.
# NOTE(review): "bionic" is the Ubuntu 18.04 codename, so these lines look
# like they belong to the ubuntu:18.04 variant of this file rather than the
# debian:stretch one -- confirm which revision they are part of.
RUN echo "deb http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list && \
echo "deb-src http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list && \
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BCA68C33DA36783B03E981C820D0BB61B700CE29
# Refresh the package index and install the OS-level tools used by the rest
# of the build (wget and git for downloads, python2.7 for the setup.py
# installs) plus the PDF/image utilities. --no-install-recommends keeps
# optional recommended packages out of the image.
# NOTE(review): the list appears to mix entries from two revisions of the
# file: both python3.6 and python3.5 are named, and tesseract-ocr is listed
# here although a later RUN step also installs it from a separate repository.
# Confirm which entries belong to the revision being built.
# NOTE(review): the apt package lists are not removed in this layer
# (no "rm -rf /var/lib/apt/lists/*"), so they remain in the image.
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ghostscript \
git \
apt-transport-https \
ca-certificates \
gnupg2 \
imagemagick \
libtiff-tools \
locales \
pdftk \
poppler-utils \
python2.7 \
python3.6 \
python-pip \
python-tk \
tesseract-ocr \
python3.5 \
python-numpy \
wget
# Configure locales
# Generates the locale named by the LANG environment variable set near the
# top of the file (locale-gen presumably comes from the "locales" package
# installed above -- TODO confirm).
RUN locale-gen "$LANG"
# All following RUN steps start in /root; the source archives below are
# downloaded and unpacked there.
WORKDIR /root
# Install pyFlow
# Downloads the pyflow release tarball for $PYFLOW_VERSION from GitHub,
# unpacks it in the working directory, and builds and installs it with
# python2.7 via setup.py.
# NOTE(review): two "RUN wget" lines follow that differ only in the URL scheme
# (http vs https), the tarball is removed twice, and there are two trailing
# "cd" commands. This looks like old and new diff lines interleaved; as
# written, the first line's trailing "&& \" would splice the second "RUN" into
# the same shell command. Confirm which lines belong to the revision in use.
# NOTE(review): a "cd" at the end of a RUN step does not carry over to later
# instructions; each RUN starts again in the WORKDIR.
RUN wget -nv http://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
rm pyflow-"$PYFLOW_VERSION".tar.gz && \
cd pyflow-"$PYFLOW_VERSION" && \
python2.7 setup.py build install && \
cd /root && \
rm pyflow-"$PYFLOW_VERSION".tar.gz
cd ..
# Install Tesseract OCR Data Files
# Variant 1: only downloads trained-data files (deu, deu_frak, eng, enm, fra,
# frm) over http into /usr/share/tesseract-ocr/4.00/tessdata. deu_frak is
# taken from the tessdata repository at ref 3.04.00; the others come from
# tessdata_best at "master".
# NOTE(review): this step and the one after it download the same six files to
# the same directory; presumably this is the older revision of the step --
# confirm before keeping both.
RUN wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv http://github.com/tesseract-ocr/tessdata/raw/3.04.00/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv http://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
# Install Tesseract OCR and Data Files
# Variant 2: registers the notesalexp.org "stretch" repository, imports its
# signing key, installs tesseract-ocr from it, and then downloads the same six
# trained-data files over https. Here deu_frak comes from tessdata "master"
# instead of the 3.04.00 ref.
# NOTE(review): the key is piped from wget into apt-key; with the default
# shell, a failed download on the left side of the pipe is not detected unless
# pipefail is enabled.
# NOTE(review): the "master" URLs are not pinned to a release, so the
# downloaded files can change between builds.
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - && \
apt-get update && \
apt-get install -y --no-install-recommends tesseract-ocr && \
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata && \
wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata
# Install OCRopus
# Two variants of this step appear below.
# Variant 1 (git clone): clones the ocropy repository without a pinned ref,
# installs the system packages listed in its PACKAGES file, installs the
# Python requirements with pip, and moves the downloaded default model into
# models/.
# Variant 2 (starting at "# Install ocropy"): downloads the
# v$OCROPY_VERSION source archive, unpacks and removes it, installs the
# PACKAGES entries plus python-pil and python-tk, downloads the default model
# directly into models/, and runs "python2.7 setup.py install".
# NOTE(review): the first variant ends in "&& \" and runs straight into the
# second one's comment and RUN line, which looks like old and new diff lines
# interleaved -- confirm which variant belongs to the revision in use.
# $(cat PACKAGES) is left unquoted, so each whitespace-separated name in that
# file is passed to apt-get as a separate argument.
# NOTE(review): en-default.pyrnn.gz is fetched over plain http and is not
# checked against a checksum.
RUN git clone http://github.com/tmbdev/ocropy && \
cd ocropy && \
apt-get install -y --no-install-recommends $(cat PACKAGES) && \
pip install -r requirements.txt && \
wget -nv http://www.tmbdev.net/en-default.pyrnn.gz && \
mv en-default.pyrnn.gz models/ && \
# Install ocropy
RUN wget -nv https://github.com/tmbdev/ocropy/archive/v"$OCROPY_VERSION".tar.gz && \
tar -xzf v"$OCROPY_VERSION".tar.gz && \
rm v"$OCROPY_VERSION".tar.gz && \
cd ocropy-"$OCROPY_VERSION" && \
apt-get install -y --no-install-recommends $(cat PACKAGES) python-pil python-tk && \
wget -nv http://www.tmbdev.net/en-default.pyrnn.gz -P models/ && \
python2.7 setup.py install && \
cd /root
cd ..
# Copy the two project files ocr_pyflow and parse_hocr from the build context
# to /usr/local/bin (presumably the image's entry-point scripts; their file
# modes are taken from the build context -- TODO confirm they are executable).
# NOTE(review): the destination has no trailing slash; this copies into the
# directory only because /usr/local/bin is assumed to already exist as a
# directory in the base image.
COPY ocr_pyflow /usr/local/bin
COPY parse_hocr /usr/local/bin