# ocr/Dockerfile
# Base image: the slim variant of Debian "buster".
FROM debian:buster-slim

# Image metadata: the people maintaining this image.
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"

# Set the locale to C.UTF-8 for every later build step and for the running
# container (presumably so the Python-based tools handle non-ASCII text
# correctly -- confirm).
ENV LANG=C.UTF-8

# Install pipeline dependencies #

## Install pyFlow ##
# Release of pyFlow to download and install.
ENV PYFLOW_RELEASE=1.1.20
# NOTE(review): this remote ADD is not checksum-verified, and the downloaded
# archive remains in the ADD layer even though the RUN below deletes it.
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
# The package index is refreshed in the same RUN that installs from it, so a
# cached layer holding an outdated index can never be combined with a fresh
# install step. The index is left in place for the install steps that follow;
# it is removed by the cleanup step near the end of this file.
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      python2.7 \
 && tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
 && cd "pyflow-${PYFLOW_RELEASE}" \
 && python2.7 setup.py build install \
 && cd .. \
 && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"

## Install ocropy ##
# Release of ocropy to download and install.
ENV OCROPY_RELEASE=1.3.3
# NOTE(review): this remote ADD is not checksum-verified, and the downloaded
# archive remains in the ADD layer even though the RUN below deletes it.
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
# `apt-get update` runs in the same layer as the install so the package index
# is never older than the install step itself.
# `$(cat PACKAGES)` is intentionally unquoted: the PACKAGES file in the
# extracted source directory is read and word-split into additional package
# names for apt-get.
RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
 && cd "ocropy-${OCROPY_RELEASE}" \
 && apt-get update \
 && apt-get install --no-install-recommends --yes \
      python-pil \
      python-tk \
      $(cat PACKAGES) \
 && python2.7 setup.py install \
 && cd .. \
 && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"

## Install Tesseract OCR ##
# Release of Tesseract to download and build from source.
ENV TESSERACT_RELEASE=4.1.1
# NOTE(review): this remote ADD is not checksum-verified, and the downloaded
# archive remains in the ADD layer even though the RUN below deletes it.
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
# Builds and installs Tesseract from the extracted source tree:
#   1. refresh the package index and install the build toolchain and the
#      image-format development libraries (same layer, so the index is current),
#   2. autogen / configure / make / make install,
#   3. ldconfig to refresh the shared-library cache after the install,
#   4. return to the previous directory and delete the source tree and archive.
# NOTE(review): the build-time packages stay installed in the final image.
RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
 && cd "tesseract-${TESSERACT_RELEASE}" \
 && apt-get update \
 && apt-get install --no-install-recommends --yes \
      autoconf \
      automake \
      g++ \
      libjpeg62-turbo-dev \
      libleptonica-dev \
      libpng-dev \
      libtiff5-dev \
      libtool \
      make \
      pkg-config \
      zlib1g-dev \
 && ./autogen.sh \
 && ./configure \
 && make \
 && make install \
 && ldconfig \
 && cd - > /dev/null \
 && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"

# Release of the tessdata_best model archive to download.
ENV TESSDATA_BEST_RELEASE=4.1.0
# NOTE(review): this remote ADD is not checksum-verified, and the complete
# archive remains in the ADD layer even though the RUN below deletes it.
ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" .
# Extracts the archive and moves only the listed language models into
# /usr/local/share/tessdata/; everything else in the extracted tree is deleted
# together with the archive. The loop aborts the build on the first model that
# cannot be moved.
# Assumes /usr/local/share/tessdata/ already exists (presumably created by the
# Tesseract install) -- confirm.
RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \
 && for model in ara chi_tra dan deu ell eng enm fra frk frm ita por rus spa; do \
      mv "tessdata_best-${TESSDATA_BEST_RELEASE}/${model}.traineddata" "/usr/local/share/tessdata/" || exit 1; \
    done \
 && rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz"

## Further dependencies ##
# Installs the remaining tools (Ghostscript, pip, Python 3.7, zip) and the
# `natsort` Python package.
# `apt-get update` runs in the same layer as the install so the package index
# is never older than the install step itself.
# `--no-cache-dir` keeps pip's download cache out of the image layer.
# NOTE(review): natsort is not version-pinned, so rebuilds may pick up a
# different release.
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ghostscript \
      python-pip \
      python3.7 \
      zip \
 && pip install --no-cache-dir natsort

## Install Pipeline ##
# Copy the `hocrtotei` and `ocr` entries from the build context into
# /usr/local/bin/.
COPY hocrtotei ocr /usr/local/bin/

# Remove the apt package index. `-f` keeps this step from failing when the
# directory is already empty (an unmatched glob would otherwise be passed to
# rm literally and abort the build).
# NOTE(review): because this runs in its own layer, index files written by
# earlier layers are still stored in those layers; this only hides them from
# the final filesystem.
RUN rm -rf /var/lib/apt/lists/*

# Run the `ocr` command by default; with no arguments given to `docker run`,
# it is invoked with `--help`.
ENTRYPOINT ["ocr"]
CMD ["--help"]