ocr/Dockerfile

98 lines
2.8 KiB
Docker
Raw Normal View History

2020-10-08 23:17:48 +02:00
# Base image: the slim variant of Debian "buster". The apt packages installed
# below (python2.7, python-pil, python-tk, python3.7, ...) come from this release.
FROM debian:buster-slim
2018-10-09 14:43:23 +02:00
2019-09-11 15:15:00 +02:00
2021-02-24 15:17:42 +01:00
# Record the image authors as image metadata.
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
2018-10-09 14:43:23 +02:00
2019-09-11 15:15:00 +02:00
2019-09-12 11:30:52 +02:00
# Use a UTF-8 locale for all following build steps and for the running container
# (ENV values persist into the final image).
ENV LANG=C.UTF-8
# Refresh the apt package index and install the download tooling used by the
# source installs further down.
# ca-certificates is listed explicitly: with --no-install-recommends no CA
# bundle is pulled in, and without one wget cannot verify HTTPS downloads.
RUN apt-get update \
    && apt-get install --no-install-recommends --yes \
         ca-certificates \
         wget
2020-09-21 14:46:03 +02:00
# Install the OCR pipeline and its dependencies #
2020-09-21 14:46:03 +02:00
## Install pyFlow ##
# Downloads the pyFlow release tarball for PYFLOW_VERSION, builds and installs
# it with Python 2.7, then deletes the unpacked sources and the archive in the
# same layer so they do not end up in the image.
ENV PYFLOW_VERSION=1.1.20
# - apt-get update runs in this layer so the install never depends on a package
#   index that was cached by an older layer.
# - ca-certificates lets wget verify the server's TLS certificate, so the
#   download no longer needs --no-check-certificate.
# - The apt lists are deliberately not deleted here: later instructions in this
#   file still call apt-get install and remove the lists at the end.
# - The build runs in a subshell so the working directory is restored
#   automatically before the cleanup.
RUN apt-get update \
    && apt-get install --no-install-recommends --yes \
         ca-certificates \
         python2.7 \
    && wget --quiet \
         "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
    && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
    && ( cd "pyflow-${PYFLOW_VERSION}" && python2.7 setup.py build install ) \
    && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
2020-04-06 09:21:52 +02:00
2019-09-11 15:15:00 +02:00
2020-09-21 14:46:03 +02:00
## Install ocropy ##
# Downloads the ocropy source archive for OCROPY_VERSION, installs the apt
# packages it needs (including those listed in the PACKAGES file shipped inside
# the archive), installs ocropy with Python 2.7 and removes the sources again.
ENV OCROPY_VERSION=1.3.3
# - apt-get update runs in this layer so the installs never depend on a package
#   index that was cached by an older layer.
# - ca-certificates is installed before the download so wget can verify the
#   server's TLS certificate instead of using --no-check-certificate.
# - $(cat .../PACKAGES) is intentionally unquoted: each whitespace-separated
#   entry of the file must become its own apt-get argument. It can only be
#   evaluated after the archive has been unpacked, hence the second install.
# - The apt lists are deliberately not deleted here: later instructions in this
#   file still call apt-get install and remove the lists at the end.
RUN apt-get update \
    && apt-get install --no-install-recommends --yes \
         ca-certificates \
    && wget --quiet \
         "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
    && tar -xzf "v${OCROPY_VERSION}.tar.gz" \
    && apt-get install --no-install-recommends --yes \
         python2.7 \
         python-pil \
         python-tk \
         $(cat "ocropy-${OCROPY_VERSION}/PACKAGES") \
    && ( cd "ocropy-${OCROPY_VERSION}" && python2.7 setup.py install ) \
    && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
2018-10-29 10:38:50 +01:00
2020-04-06 09:21:52 +02:00
2020-09-21 14:46:03 +02:00
## Install Tesseract OCR ##
# Downloads the Tesseract source archive for TESSERACT_VERSION, installs the
# build toolchain and image-library headers, builds and installs Tesseract via
# autotools, refreshes the shared-library cache and removes the sources again.
ENV TESSERACT_VERSION=4.1.1
# - apt-get update runs in this layer so the install never depends on a package
#   index that was cached by an older layer.
# - ca-certificates lets wget verify the server's TLS certificate, so the
#   download no longer needs --no-check-certificate.
# - The build runs in a subshell so the working directory is restored
#   automatically before ldconfig and the cleanup.
# - The apt lists are deliberately not deleted here: later instructions in this
#   file still call apt-get install and remove the lists at the end.
RUN apt-get update \
    && apt-get install --no-install-recommends --yes \
         autoconf \
         automake \
         ca-certificates \
         g++ \
         libjpeg62-turbo-dev \
         libleptonica-dev \
         libpng-dev \
         libtiff5-dev \
         libtool \
         make \
         pkg-config \
         zlib1g-dev \
    && wget --quiet \
         "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
    && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
    && ( cd "tesseract-${TESSERACT_VERSION}" \
         && ./autogen.sh \
         && ./configure \
         && make \
         && make install ) \
    && ldconfig \
    && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
2021-03-17 14:26:24 +01:00
# Comma-separated list of Tesseract language models to keep in the image.
ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
# Version (archive name) of the tessdata_best repository to take the models from.
ENV TESSDATA_BEST_VERSION=4.1.0
# Downloads the tessdata_best archive, moves the <model>.traineddata file of
# every entry in TESSERACT_MODELS to /usr/local/share/tessdata/ and deletes the
# remaining archive contents in the same layer.
# - ca-certificates lets wget verify the server's TLS certificate, so the
#   download no longer needs --no-check-certificate. apt-get update runs in the
#   same layer so that install does not depend on an older cached index.
# - "|| exit 1" aborts the build as soon as one model cannot be moved; a plain
#   for-loop only reports the status of its last iteration, so a missing model
#   earlier in the list would otherwise go unnoticed.
# - ${TESSERACT_MODELS} is intentionally unquoted in the command substitution
#   so the newline-separated names are split into individual loop items.
RUN apt-get update \
    && apt-get install --no-install-recommends --yes \
         ca-certificates \
    && wget --quiet \
         "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
    && tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
    && for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do \
         mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/" \
           || exit 1; \
       done \
    && rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
2020-09-21 14:46:03 +02:00
2020-10-08 23:09:10 +02:00
## Further dependencies ##
# Additional apt packages for the image (presumably used by the pipeline
# scripts copied below - confirm against ocr / hocrtotei).
# - apt-get update runs in this layer so the install never depends on a package
#   index that was cached by an older layer.
# - This is the last apt-get call in the file, so the package lists are removed
#   here, in the same layer, instead of in a separate RUN at the end.
#   -f keeps the cleanup from failing if the directory is already empty.
RUN apt-get update \
    && apt-get install --no-install-recommends --yes \
         ghostscript \
         procps \
         python3.7 \
         rename \
         zip \
    && rm -rf /var/lib/apt/lists/*

## Install Pipeline ##
# Copy the two pipeline executables from the build context onto the PATH.
COPY hocrtotei ocr /usr/local/bin/
2019-05-13 15:03:43 +02:00
# Run the `ocr` executable (copied to /usr/local/bin/ by the COPY instruction)
# as the container's main process; exec form, so no shell wraps it.
ENTRYPOINT ["ocr"]
2019-09-11 15:15:00 +02:00
# Default argument handed to the entrypoint when `docker run` supplies none.
CMD ["--help"]