ocr/Dockerfile

98 lines
2.8 KiB
Docker
Raw Normal View History

2020-10-08 21:17:48 +00:00
# Base image: Debian buster, slim variant.
FROM debian:buster-slim

LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"

# LANG is set with ENV, so it applies to the remaining build steps and to the
# running container.
ENV LANG=C.UTF-8

# Refresh the package index and install the download tooling: wget fetches
# files over HTTPS, and ca-certificates provides the CA bundle wget needs in
# order to verify the servers it talks to.
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ca-certificates \
      wget
2020-09-21 12:46:03 +00:00
# Install the OCR pipeline and its dependencies #
2020-09-21 12:46:03 +00:00
## Install pyFlow ##
# Downloads the pinned pyFlow release tarball, installs it with Python 2.7 and
# removes the unpacked sources and the tarball again in the same layer.
ENV PYFLOW_VERSION=1.1.20
# apt-get update runs in this same RUN so the install never resolves against a
# stale, cached package index. ca-certificates is installed before the
# download so wget can verify the server's TLS certificate; certificate
# checking is therefore left enabled (no --no-check-certificate).
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ca-certificates \
      python2.7 \
 && wget --quiet \
      "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
 && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
 && cd "pyflow-${PYFLOW_VERSION}" \
 && python2.7 setup.py build install \
 && cd - > /dev/null \
 && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
2020-04-06 07:21:52 +00:00
2019-09-11 13:15:00 +00:00
2020-09-21 12:46:03 +00:00
## Install ocropy ##
# Downloads the pinned ocropy source archive, installs its apt dependencies,
# installs ocropy with Python 2.7 and removes the sources in the same layer.
ENV OCROPY_VERSION=1.3.3
# apt-get update runs in this same RUN so the installs never resolve against a
# stale, cached package index. ca-certificates is installed before the
# download so wget can verify the server's TLS certificate; certificate
# checking is therefore left enabled (no --no-check-certificate).
# The second apt-get install has to come after the archive is unpacked: it
# reads additional package names from the PACKAGES file inside the source
# tree. $(cat PACKAGES) is intentionally unquoted so each name becomes its own
# argument.
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ca-certificates \
 && wget --quiet \
      "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
 && tar -xzf "v${OCROPY_VERSION}.tar.gz" \
 && cd "ocropy-${OCROPY_VERSION}" \
 && apt-get install --no-install-recommends --yes \
      python2.7 \
      python-pil \
      python-tk \
      $(cat PACKAGES) \
 && python2.7 setup.py install \
 && cd - > /dev/null \
 && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
2018-10-29 09:38:50 +00:00
2020-04-06 07:21:52 +00:00
2020-09-21 12:46:03 +00:00
## Install Tesseract OCR ##
# Builds the pinned Tesseract release from source (autogen, configure, make,
# make install), refreshes the linker cache and removes the sources in the
# same layer.
ENV TESSERACT_VERSION=4.1.1
# apt-get update runs in this same RUN so the install never resolves against a
# stale, cached package index. ca-certificates is part of the package list so
# wget can verify the server's TLS certificate; certificate checking is
# therefore left enabled (no --no-check-certificate).
# NOTE(review): the build toolchain and -dev packages installed here stay in
# the final image; a multi-stage build would be needed to drop them.
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      autoconf \
      automake \
      ca-certificates \
      g++ \
      libjpeg62-turbo-dev \
      libleptonica-dev \
      libpng-dev \
      libtiff5-dev \
      libtool \
      make \
      pkg-config \
      zlib1g-dev \
 && wget --quiet \
      "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
 && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
 && cd "tesseract-${TESSERACT_VERSION}" \
 && ./autogen.sh \
 && ./configure \
 && make \
 && make install \
 && ldconfig \
 && cd - > /dev/null \
 && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
# Comma-separated list of Tesseract language models to install, and the
# tessdata_best release they are taken from.
ENV TESSERACT_MODELS="ara,chi_tra,dan,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
ENV TESSDATA_BEST_VERSION=4.1.0
# Downloads the tessdata_best archive, moves only the selected
# <model>.traineddata files into /usr/local/share/tessdata/ and deletes the
# rest of the archive in the same layer.
# ca-certificates is installed (after a fresh apt-get update in this same RUN)
# so wget can verify the server's TLS certificate; certificate checking is
# therefore left enabled (no --no-check-certificate).
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ca-certificates \
 && wget --quiet \
      "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
 && tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
 && for tesseract_model in $(echo "${TESSERACT_MODELS}" | tr "," "\n"); do \
      mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; \
    done \
 && rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
2020-09-21 12:46:03 +00:00
2020-10-08 21:09:10 +00:00
## Further dependencies ##
# apt-get update runs in this same RUN so the install never resolves against a
# stale, cached package index left behind by an earlier layer.
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ghostscript \
      procps \
      python3.7 \
      rename \
      zip
2019-09-11 13:15:00 +00:00
2020-10-08 21:09:10 +00:00
## Install Pipeline ##
# Copy hocrtotei and ocr from the build context into /usr/local/bin/.
COPY ["hocrtotei", "ocr", "/usr/local/bin/"]
2018-10-09 12:43:23 +00:00
2019-09-11 13:15:00 +00:00
2021-02-24 14:28:04 +00:00
# Remove the apt package index. -f keeps this step from failing the build when
# the glob matches nothing (i.e. the index is already gone).
# NOTE(review): deleting the index in its own RUN only hides the files in the
# final filesystem; layers written by earlier RUN steps still contain them.
RUN rm -rf /var/lib/apt/lists/*

# The container runs `ocr`; when no arguments are given, `--help` is passed.
ENTRYPOINT ["ocr"]
CMD ["--help"]