ocr/Dockerfile
2019-04-24 17:01:49 +02:00

62 lines
1.7 KiB
Docker

# Base image: the slim variant of Debian "stretch".
# NOTE(review): stretch is past end-of-life; confirm its package repositories
# are still reachable from the build environment before relying on this tag.
FROM debian:stretch-slim
# MAINTAINER is deprecated; the contact is recorded as image metadata instead.
LABEL maintainer="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"
# Build-time only: stops apt/debconf from prompting during the RUN steps below
# without leaking the setting into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
# Locale setting that stays in effect in running containers.
ENV LANG=C.UTF-8
# System packages: apt HTTPS/key prerequisites, image and PDF utilities, the
# Python 2.7 and 3.5 interpreters with NumPy, and wget for later downloads.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      apt-transport-https \
      ca-certificates \
      gnupg2 \
      imagemagick \
      pdftk \
      poppler-utils \
      python2.7 \
      python3.5 \
      python-numpy \
      wget
# Relative paths in the following build steps resolve against this directory.
WORKDIR /root
# Install ocropy
# Downloads the tagged release tarball, unpacks it, installs the apt packages
# named in the project's PACKAGES file (plus python-pil and python-tk), fetches
# the default recognition model into models/ and installs with Python 2.7.
#
# The package index is refreshed in this same layer so the install never runs
# against a stale cached index, and the index is removed again afterwards so
# it does not end up in the image.
#
# $(cat PACKAGES) is intentionally unquoted: the file's content must be split
# into individual package names.
#
# NOTE(review): the model is fetched over plain HTTP without any checksum
# verification - consider pinning a sha256 once a trusted value is known.
ENV OCROPY_VERSION=1.3.3
RUN wget -nv https://github.com/tmbdev/ocropy/archive/v"$OCROPY_VERSION".tar.gz && \
    tar -xzf v"$OCROPY_VERSION".tar.gz && \
    rm v"$OCROPY_VERSION".tar.gz && \
    cd ocropy-"$OCROPY_VERSION" && \
    apt-get update && \
    apt-get install -y --no-install-recommends $(cat PACKAGES) python-pil python-tk && \
    rm -rf /var/lib/apt/lists/* && \
    wget -nv http://www.tmbdev.net/en-default.pyrnn.gz -P models/ && \
    python2.7 setup.py install
# Install pyFlow
# Downloads the release tarball for PYFLOW_VERSION, unpacks it, deletes the
# archive and builds/installs the package with Python 2.7.
# The working directory change is local to this RUN's shell, so there is no
# need to change back at the end.
ENV PYFLOW_VERSION=1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    rm pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install
# Install Tesseract OCR and Data Files
# Registers the notesalexp.org apt repository for stretch, trusts its signing
# key, then installs Tesseract together with the listed language data packages.
#
# The key is downloaded to a file before being handed to apt-key: piping wget
# straight into apt-key under /bin/sh (no pipefail) would hide a failed
# download behind apt-key's exit status.
#
# The package index is deleted in the same layer so it is not shipped in the
# image.
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
    wget -nv -O /tmp/alexp_key.asc https://notesalexp.org/debian/alexp_key.asc && \
    apt-key add /tmp/alexp_key.asc && \
    rm /tmp/alexp_key.asc && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        tesseract-ocr \
        tesseract-ocr-deu \
        tesseract-ocr-frk \
        tesseract-ocr-eng \
        tesseract-ocr-enm \
        tesseract-ocr-fra \
        tesseract-ocr-frm \
        tesseract-ocr-por \
        tesseract-ocr-spa && \
    rm -rf /var/lib/apt/lists/*
# Place the ocr and hocrtotei entries from the build context into
# /usr/local/bin; the trailing slash marks the destination as a directory.
COPY ocr hocrtotei /usr/local/bin/
# With no command given, the container starts bash.
CMD ["/bin/bash"]