ocr/Dockerfile

63 lines
2.8 KiB
Docker
Raw Normal View History

2019-03-11 22:54:33 +00:00
# Base image: the slim variant of Debian "stretch".
FROM debian:stretch-slim

LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"

# DEBIAN_FRONTEND=noninteractive keeps apt/debconf from prompting during the
# package installations below; LANG selects a UTF-8 locale for all processes.
# NOTE(review): DEBIAN_FRONTEND is only needed at build time; it is kept as ENV
# here so the runtime environment of the image stays unchanged. Consider ARG.
ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8
# Install the base tooling used by the remaining build steps:
#   - apt-transport-https, ca-certificates, gnupg2: required to add an HTTPS
#     apt repository and its signing key
#   - wget: used to download source archives and model files
#   - python2.7: interpreter used to run the setup.py installs
#   - imagemagick, poppler-utils, python3.5, python-numpy: presumably runtime
#     dependencies of the copied scripts (verify against those scripts)
# The package lists are intentionally left in place: removing them here would
# break any later layer that installs packages without running
# `apt-get update` itself.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        gnupg2 \
        imagemagick \
        poppler-utils \
        python2.7 \
        python3.5 \
        python-numpy \
        wget
# Install ocropy
# Downloads the tagged ocropy source archive, installs the distribution
# packages listed in its PACKAGES file (plus python-pil and python-tk), fetches
# the default recognition model into models/ and installs everything with
# Python 2.7. The unpacked sources and the archive are removed in the same
# layer so they do not remain in the image.
# `apt-get update` runs in this layer so the install never depends on a
# possibly stale cached package index from an earlier layer; the index is
# removed again afterwards.
# `$(cat PACKAGES)` is deliberately unquoted: it must be split into one
# argument per package name.
# NOTE(review): the model is fetched over plain HTTP without a checksum.
ENV OCROPY_VERSION=1.3.3
RUN wget -nv https://github.com/tmbdev/ocropy/archive/v"$OCROPY_VERSION".tar.gz && \
    tar -xzf v"$OCROPY_VERSION".tar.gz && \
    cd ocropy-"$OCROPY_VERSION" && \
    apt-get update && \
    apt-get install -y --no-install-recommends $(cat PACKAGES) python-pil python-tk && \
    rm -rf /var/lib/apt/lists/* && \
    wget -nv http://www.tmbdev.net/en-default.pyrnn.gz -P models/ && \
    python2.7 setup.py install && \
    cd .. && \
    rm -r ocropy-"$OCROPY_VERSION" v"$OCROPY_VERSION".tar.gz
# Install pyFlow
# Downloads the pyflow release archive for PYFLOW_VERSION, builds and installs
# it with Python 2.7, then deletes the unpacked sources and the archive in the
# same layer so they do not remain in the image.
ENV PYFLOW_VERSION=1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install && \
    cd .. && \
    rm -r pyflow-"$PYFLOW_VERSION" pyflow-"$PYFLOW_VERSION".tar.gz
# Install Tesseract OCR and Data Files
# 1. Register the notesalexp.org apt repository and its signing key. The key
#    is downloaded to a file first instead of being piped into apt-key, so a
#    failed download aborts the build (the default /bin/sh has no pipefail).
# 2. Install tesseract-ocr and drop the apt package index in the same layer.
# 3. Download the trained language models into the Tesseract 4.00 tessdata
#    directory: deu, eng, enm, fra, frm, ita, por and spa come from
#    tessdata_best, deu_frak comes from the tessdata repository.
# NOTE(review): the models are fetched from the moving `master` branch without
# checksums; pin a tag or commit for reproducible builds.
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
    wget -nv -O /tmp/alexp_key.asc https://notesalexp.org/debian/alexp_key.asc && \
    apt-key add /tmp/alexp_key.asc && \
    rm /tmp/alexp_key.asc && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        tesseract-ocr && \
    rm -rf /var/lib/apt/lists/* && \
    tessdata_dir=/usr/share/tesseract-ocr/4.00/tessdata && \
    for lang in deu eng enm fra frm ita por spa; do \
        wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/"$lang".traineddata -P "$tessdata_dir" || exit 1; \
    done && \
    wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P "$tessdata_dir"
# Add the hocrtotei and ocr executables from the build context to a directory
# on PATH. The trailing slash marks the destination explicitly as a directory.
COPY hocrtotei ocr /usr/local/bin/
# Create /input and /output and make them readable and writable for every
# user. Shell commands must be wrapped in a RUN instruction to be executed
# during the build.
# NOTE(review): presumably these are the mount points for the files to process
# and the results; confirm against the ocr script.
RUN mkdir /input /output && \
    chmod a+rw /input /output

# Exec-form entrypoint: arguments given to `docker run` are passed to `ocr`.
ENTRYPOINT ["ocr"]