# Image for the OCR pipeline.
#
# Installs pyFlow, ocropy and Tesseract OCR from their upstream release
# tarballs on top of Debian buster, copies the pipeline scripts into
# /usr/local/bin and runs `ocr` as the entrypoint.
#
# Layer conventions used below:
#   * every RUN that calls `apt-get install` refreshes the package lists itself
#     and deletes them again before the layer is committed, so no layer depends
#     on (possibly stale, cached) lists from an earlier one and the lists never
#     end up in the image;
#   * source tarballs and build trees are removed in the layer that created
#     them.
#
# NOTE(review): buster is an old Debian release; python2.7, python-pil and
# python-tk are taken from its package archive. Confirm those packages exist
# in the target release before bumping the base image.
FROM debian:buster-slim


LABEL authors="Patrick Jentsch , Stephan Porada "


ENV LANG=C.UTF-8


# Base tooling for the pipeline.
# ca-certificates provides the CA bundle wget needs to verify the HTTPS
# downloads in the following steps (it is only a recommended package of wget,
# so --no-install-recommends would otherwise leave it out).
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ca-certificates \
      ghostscript \
      procps \
      python3.7 \
      python3-pip \
      rename \
      wget \
      zip \
 && python3 -m pip install --no-cache-dir lxml \
 && rm -rf /var/lib/apt/lists/*


# Install the OCR pipeline and its dependencies #
## Install pyFlow ##
ENV PYFLOW_VERSION=1.1.20
RUN apt-get update \
 && wget --quiet \
      "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
 && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
 && cd "pyflow-${PYFLOW_VERSION}" \
 && apt-get install --no-install-recommends --yes \
      python2.7 \
 && python2.7 setup.py build install \
 && cd - > /dev/null \
 && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz" \
 && rm -rf /var/lib/apt/lists/*


## Install ocropy ##
# The apt-get install must run from inside the extracted source tree:
# $(cat PACKAGES) reads the package list from the PACKAGES file in that tree.
ENV OCROPY_VERSION=1.3.3
RUN apt-get update \
 && wget --quiet \
      "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
 && tar -xzf "v${OCROPY_VERSION}.tar.gz" \
 && cd "ocropy-${OCROPY_VERSION}" \
 && apt-get install --no-install-recommends --yes \
      python2.7 \
      python-pil \
      python-tk \
      $(cat PACKAGES) \
 && python2.7 setup.py install \
 && cd - > /dev/null \
 && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz" \
 && rm -rf /var/lib/apt/lists/*


## Install Tesseract OCR ##
# Built from source with OpenMP and shared libraries disabled.
# NOTE(review): the build toolchain and -dev packages stay in the image, as in
# the original recipe; moving this build to a separate stage would shrink the
# image but needs the runtime library set to be confirmed first.
ENV TESSERACT_VERSION=5.0.0
RUN apt-get update \
 && wget --quiet \
      "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
 && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
 && cd "tesseract-${TESSERACT_VERSION}" \
 && apt-get install --no-install-recommends --yes \
      autoconf \
      automake \
      g++ \
      libjpeg62-turbo-dev \
      libleptonica-dev \
      libpng-dev \
      libtiff5-dev \
      libtool \
      make \
      pkg-config \
      zlib1g-dev \
 && ./autogen.sh \
 && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
 && make \
 && make install \
 && ldconfig \
 && cd - > /dev/null \
 && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz" \
 && rm -rf /var/lib/apt/lists/*


## Install Pipeline ##
COPY hocr2tei hocr-combine ocr /usr/local/bin/


# `ocr` is the container's executable; with no arguments it prints its help.
ENTRYPOINT ["ocr"]
CMD ["--help"]