ocr/Dockerfile

87 lines
2.3 KiB
Docker
Raw Normal View History

2020-10-08 23:17:48 +02:00
# Base image: the slim variant of Debian "buster". Later build steps install
# python2.7, python-pil and python-tk from this release with apt-get.
FROM debian:buster-slim
2018-10-09 14:43:23 +02:00
2019-09-11 15:15:00 +02:00
2021-02-24 15:17:42 +01:00
# Image metadata: names and e-mail addresses of the image's authors.
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
2018-10-09 14:43:23 +02:00
2019-09-11 15:15:00 +02:00
2019-09-12 11:30:52 +02:00
# Set LANG to C.UTF-8 for the build and for containers run from this image.
ENV LANG=C.UTF-8

# Install the base command line tools, Python 3 with pip and, through pip,
# the lxml package.
# The apt package lists are left in place on purpose: later build steps
# install further packages with apt-get.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN apt-get update \
    && apt-get install --no-install-recommends --yes \
        ghostscript \
        procps \
        python3.7 \
        python3-pip \
        rename \
        wget \
        zip \
    && python3 -m pip install --no-cache-dir lxml
2020-09-21 14:46:03 +02:00
# Install the OCR pipeline and its dependencies #
2020-09-21 14:46:03 +02:00
## Install pyFlow ##
# Downloads the pyFlow release tarball for PYFLOW_VERSION, installs it with
# Python 2.7 and removes the unpacked sources and the tarball again within
# the same layer.
# ca-certificates is installed before the download so that wget can verify
# the server's TLS certificate instead of skipping the check.
# apt-get install relies on the package lists fetched by the earlier
# apt-get update step.
ENV PYFLOW_VERSION=1.1.20
RUN apt-get install --no-install-recommends --yes \
        ca-certificates \
        python2.7 \
    && wget --quiet \
        "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
    && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
    && cd "pyflow-${PYFLOW_VERSION}" \
    && python2.7 setup.py build install \
    && cd - > /dev/null \
    && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
2020-04-06 09:21:52 +02:00
2019-09-11 15:15:00 +02:00
2020-09-21 14:46:03 +02:00
## Install ocropy ##
# Downloads the ocropy source archive for OCROPY_VERSION, installs its apt
# dependencies, installs ocropy with Python 2.7 and removes the unpacked
# sources and the archive again within the same layer.
# ca-certificates is installed before the download so that wget can verify
# the server's TLS certificate instead of skipping the check.
# apt-get install relies on the package lists fetched by the earlier
# apt-get update step.
ENV OCROPY_VERSION=1.3.3
RUN apt-get install --no-install-recommends --yes \
        ca-certificates \
    && wget --quiet \
        "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
    && tar -xzf "v${OCROPY_VERSION}.tar.gz" \
    && cd "ocropy-${OCROPY_VERSION}" \
    # $(cat PACKAGES) is deliberately unquoted: every word in the PACKAGES
    # file of the extracted source tree is passed to apt-get as one package.
    && apt-get install --no-install-recommends --yes \
        python2.7 \
        python-pil \
        python-tk \
        $(cat PACKAGES) \
    && python2.7 setup.py install \
    && cd - > /dev/null \
    && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
2018-10-29 10:38:50 +01:00
2020-04-06 09:21:52 +02:00
2020-09-21 14:46:03 +02:00
## Install Tesseract OCR ##
# Builds Tesseract TESSERACT_VERSION from its source archive: installs the
# build dependencies, runs autogen/configure/make, installs the result and
# removes the unpacked sources and the archive again within the same layer.
# ca-certificates is part of the dependency list and is installed before the
# download so that wget can verify the server's TLS certificate instead of
# skipping the check.
# apt-get install relies on the package lists fetched by the earlier
# apt-get update step.
ENV TESSERACT_VERSION=5.0.0
RUN apt-get install --no-install-recommends --yes \
        autoconf \
        automake \
        ca-certificates \
        g++ \
        libjpeg62-turbo-dev \
        libleptonica-dev \
        libpng-dev \
        libtiff5-dev \
        libtool \
        make \
        pkg-config \
        zlib1g-dev \
    && wget --quiet \
        "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
    && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
    && cd "tesseract-${TESSERACT_VERSION}" \
    && ./autogen.sh \
    # Configured with OpenMP and shared libraries disabled.
    && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
    && make \
    && make install \
    && ldconfig \
    && cd - > /dev/null \
    && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
2020-09-21 14:46:03 +02:00
# Remove the apt package lists now that all packages are installed.
# -f keeps this step from failing the build when there is nothing to remove.
# NOTE(review): as this runs in its own layer, the lists are hidden from the
# final filesystem but still stored in the earlier layers that created them.
RUN rm -rf /var/lib/apt/lists/*
2019-09-11 15:15:00 +02:00
2020-10-08 23:09:10 +02:00
## Install Pipeline ##
# Copy the pipeline's three executables from the build context into
# /usr/local/bin/.
COPY \
    hocr2tei \
    hocr-combine \
    ocr \
    /usr/local/bin/

# Containers run the ocr executable; --help is the default argument and is
# replaced by whatever arguments are given to `docker run`.
ENTRYPOINT ["ocr"]
CMD ["--help"]