2020-04-06 07:21:52 +00:00
|
|
|
# Base image: Debian 10, slim variant. The steps below install Python 2.7
# packages (python2.7, python-pil, python-tk, python-pip) from its apt
# repositories.
# NOTE(review): presumably pinned to Debian 10 because of those Python 2
# packages - confirm before bumping the release.
FROM debian:10-slim
|
2018-10-09 12:43:23 +00:00
|
|
|
|
2019-09-11 13:15:00 +00:00
|
|
|
|
2020-09-21 12:46:03 +00:00
|
|
|
# Image metadata: names and e-mail addresses of the image authors.
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"
|
2018-10-09 12:43:23 +00:00
|
|
|
|
2019-09-11 13:15:00 +00:00
|
|
|
|
2019-09-12 09:30:52 +00:00
|
|
|
# Set the locale to C.UTF-8 for all following build steps and for containers
# started from this image (ENV values persist into the runtime environment).
ENV LANG=C.UTF-8
|
|
|
|
|
|
|
|
|
2020-09-21 12:46:03 +00:00
|
|
|
# No standalone "apt-get update" layer here: every install step below refreshes
# the package index and deletes /var/lib/apt/lists/* inside its own RUN. A
# separate update layer would only persist index files in the image and could
# be reused from the build cache long after the index went stale.
|
|
|
|
|
|
|
|
|
2020-10-08 21:09:10 +00:00
|
|
|
# Install pipeline dependencies #
|
2020-09-21 12:46:03 +00:00
|
|
|
## Install pyFlow ##
# Downloads the pyFlow release archive, installs Python 2.7 from apt, runs the
# package's setup.py (build + install) and finally deletes the unpacked sources
# and the archive.
# PYFLOW_RELEASE selects the release; it is used in the download URL and in the
# archive/directory names.
ENV PYFLOW_RELEASE=1.1.20
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
RUN apt-get update \
    && apt-get install --no-install-recommends --yes \
         python2.7 \
    && rm -r /var/lib/apt/lists/* \
    && tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
    # The subshell keeps the directory change local to the setup.py call.
    && ( \
         cd "pyflow-${PYFLOW_RELEASE}" \
         && python2.7 setup.py build install \
       ) \
    && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
|
2020-04-06 07:21:52 +00:00
|
|
|
|
2019-09-11 13:15:00 +00:00
|
|
|
|
2020-09-21 12:46:03 +00:00
|
|
|
## Install ocropy ##
# Downloads the ocropy source archive, installs its apt dependencies, runs
# setup.py with Python 2.7 and finally deletes the unpacked sources and the
# archive.
# OCROPY_RELEASE selects the release; it is used in the download URL and in the
# archive/directory names.
ENV OCROPY_RELEASE=1.3.3
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
# The directory change must precede "apt-get install": $(cat PACKAGES) reads
# the PACKAGES file relative to the unpacked source directory. The command
# substitution is intentionally unquoted so that each listed name becomes a
# separate argument.
RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
    && cd "ocropy-${OCROPY_RELEASE}" \
    && apt-get update \
    && apt-get install --no-install-recommends --yes \
         python-pil \
         python-tk \
         $(cat PACKAGES) \
    && rm -r /var/lib/apt/lists/* \
    && python2.7 setup.py install \
    && cd .. \
    && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
|
2018-10-29 09:38:50 +00:00
|
|
|
|
2020-04-06 07:21:52 +00:00
|
|
|
|
2020-09-21 12:46:03 +00:00
|
|
|
## Install Tesseract OCR ##
# Builds Tesseract from its source archive: installs the build tools and
# development libraries from apt, runs autogen.sh / configure / make /
# make install, refreshes the dynamic linker cache with ldconfig and finally
# deletes the unpacked sources and the archive.
# TESSERACT_RELEASE selects the release; it is used in the download URL and in
# the archive/directory names.
ENV TESSERACT_RELEASE=4.1.1
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
RUN apt-get update \
    && apt-get install --no-install-recommends --yes \
         autoconf \
         automake \
         g++ \
         libjpeg62-turbo-dev \
         libleptonica-dev \
         libpng-dev \
         libtiff5-dev \
         libtool \
         make \
         pkg-config \
         zlib1g-dev \
    && rm -r /var/lib/apt/lists/* \
    && tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
    # The subshell keeps the directory change local to the build commands.
    && ( \
         cd "tesseract-${TESSERACT_RELEASE}" \
         && ./autogen.sh \
         && ./configure \
         && make \
         && make install \
       ) \
    && ldconfig \
    && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
|
|
|
|
# Tesseract language models from the tessdata_best repository
# (deu, eng, enm, fra, frk, frm, ita, por, spa), placed in
# /usr/local/share/tessdata/.
# NOTE(review): the URLs track the master branch, so the downloaded files can
# differ between builds.
ADD \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/frk.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata" \
    "/usr/local/share/tessdata/"
# Files that ADD fetches from a URL are created with mode 600; open them up for
# reading by every user.
RUN chmod 644 /usr/local/share/tessdata/*.traineddata
|
|
|
|
|
|
|
|
|
2020-10-08 21:09:10 +00:00
|
|
|
## Further dependencies ##
# Installs the remaining apt packages (ghostscript, python-pip, python3.7, zip)
# and the natsort Python package via pip.
# --no-cache-dir stops pip from writing its download cache into this layer,
# where it would only add to the image size.
RUN apt-get update \
    && apt-get install --no-install-recommends --yes \
         ghostscript \
         python-pip \
         python3.7 \
         zip \
    && rm -r /var/lib/apt/lists/* \
    && pip install --no-cache-dir natsort
|
2019-09-11 13:15:00 +00:00
|
|
|
|
|
|
|
|
2020-10-08 21:09:10 +00:00
|
|
|
## Install Pipeline ##
|
|
|
|
# Copy the hocrtotei and ocr files from the build context into /usr/local/bin/.
COPY hocrtotei ocr /usr/local/bin/
|
2018-10-09 12:43:23 +00:00
|
|
|
|
2019-09-11 13:15:00 +00:00
|
|
|
|
2019-05-13 13:03:43 +00:00
|
|
|
# Containers run the "ocr" command (exec form, no shell wrapper); arguments
# given to "docker run" are appended to it.
# NOTE(review): presumably resolves to the ocr file copied into
# /usr/local/bin/ - confirm that directory is on PATH in the base image.
ENTRYPOINT ["ocr"]
|
2019-09-11 13:15:00 +00:00
|
|
|
# Default argument passed to the entrypoint when "docker run" supplies none.
CMD ["--help"]
|