mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-28 04:10:34 +00:00
87 lines
2.3 KiB
Docker
87 lines
2.3 KiB
Docker
# Base image: the slim variant of Debian buster.
FROM debian:buster-slim


# Image metadata: the people who maintain this image.
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"


# Default locale for every later build step and for the running container.
ENV LANG=C.UTF-8
|
|
|
|
|
|
# Refresh the apt package lists, install the base packages, then install lxml
# for Python 3 through pip.
# The apt lists are intentionally NOT removed in this layer: the later RUN
# steps call `apt-get install` without running `apt-get update` again.
# --no-cache-dir keeps pip's download cache out of this image layer.
RUN apt-get update \
    && apt-get install --no-install-recommends --yes \
      ghostscript \
      procps \
      python3.7 \
      python3-pip \
      rename \
      wget \
      zip \
    && python3 -m pip install --no-cache-dir lxml
|
|
|
|
# Install the OCR pipeline and its dependencies #
|
|
## Install pyFlow ##
# PYFLOW_VERSION selects the GitHub release tarball downloaded below.
ENV PYFLOW_VERSION=1.1.20
# ca-certificates is installed before the download so wget can verify the
# server's TLS certificate; the tarball's setup.py is executed during the
# build, so the download must not run with --no-check-certificate.
# No `apt-get update` here: the package lists written by the first RUN layer
# are still present at this point.
# The unpacked source tree and the tarball are deleted in the same layer.
RUN apt-get install --no-install-recommends --yes \
      ca-certificates \
      python2.7 \
    && wget --quiet \
      "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
    && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
    && cd "pyflow-${PYFLOW_VERSION}" \
    && python2.7 setup.py build install \
    && cd - > /dev/null \
    && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
|
|
|
|
|
|
## Install ocropy ##
# OCROPY_VERSION selects the GitHub source archive downloaded below.
ENV OCROPY_VERSION=1.3.3
# ca-certificates is installed before the download so wget can verify the
# server's TLS certificate; the archive's setup.py is executed during the
# build, so the download must not run with --no-check-certificate.
# No `apt-get update` here: the package lists written by the first RUN layer
# are still present at this point.
# `$(cat PACKAGES)` is left unquoted on purpose: the PACKAGES file shipped in
# the archive is expanded into separate package-name arguments for apt-get.
# The unpacked source tree and the archive are deleted in the same layer.
RUN apt-get install --no-install-recommends --yes \
      ca-certificates \
    && wget --quiet \
      "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
    && tar -xzf "v${OCROPY_VERSION}.tar.gz" \
    && cd "ocropy-${OCROPY_VERSION}" \
    && apt-get install --no-install-recommends --yes \
      python2.7 \
      python-pil \
      python-tk \
      $(cat PACKAGES) \
    && python2.7 setup.py install \
    && cd - > /dev/null \
    && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
|
|
|
|
|
|
## Install Tesseract OCR ##
# TESSERACT_VERSION selects the GitHub source archive downloaded below.
ENV TESSERACT_VERSION=5.0.0
# Build Tesseract from source and install it.
# ca-certificates is installed together with the build dependencies, before
# the download, so wget can verify the server's TLS certificate; the archive's
# build scripts are executed here, so the download must not run with
# --no-check-certificate.
# No `apt-get update` here: the package lists written by the first RUN layer
# are still present at this point.
# The unpacked source tree and the archive are deleted in the same layer.
RUN apt-get install --no-install-recommends --yes \
      autoconf \
      automake \
      ca-certificates \
      g++ \
      libjpeg62-turbo-dev \
      libleptonica-dev \
      libtiff5-dev \
      libtool \
      libpng-dev \
      make \
      pkg-config \
      zlib1g-dev \
    && wget --quiet \
      "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
    && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
    && cd "tesseract-${TESSERACT_VERSION}" \
    && ./autogen.sh \
    && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
    && make \
    && make install \
    && ldconfig \
    && cd - > /dev/null \
    && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
|
|
|
|
|
|
# Remove the apt package lists now that every apt-get install step has run.
# -f keeps this step from failing the build when the glob matches nothing
# (for example when the lists were already removed earlier).
# NOTE(review): this runs in its own layer, so the lists written by earlier
# layers still occupy space in the image; it only hides them from the final
# filesystem.
RUN rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
## Install Pipeline ##
# Copy the three pipeline files from the build context into /usr/local/bin/.
COPY hocr2tei hocr-combine ocr /usr/local/bin/


# The container runs `ocr`; arguments passed to `docker run` replace the
# default `--help`.
ENTRYPOINT ["ocr"]
CMD ["--help"]
|