# OCR pipeline image: pyFlow, ocropy and Tesseract OCR are built from their
# source release archives, then the pipeline scripts "hocrtotei" and "ocr"
# are copied in and "ocr" becomes the entrypoint.
#
# Every RUN that installs packages refreshes the apt index itself and removes
# /var/lib/apt/lists/* before it finishes. That way no layer relies on a stale
# cached index from an earlier layer, and the index never persists in the image.
#
# python2.7 and the python-* packages installed below are taken from this
# release's apt repositories.
FROM debian:10-slim


LABEL authors="Patrick Jentsch , Stephan Porada "


ENV LANG=C.UTF-8


## Install pyFlow ##
ENV PYFLOW_RELEASE=1.1.20
# NOTE(review): the archive is fetched without integrity verification; consider
# ADD --checksum=sha256:... once the build is BuildKit-only.
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
RUN apt-get update \
    && tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
    && cd "pyflow-${PYFLOW_RELEASE}" \
    && apt-get install -y --no-install-recommends \
         python2.7 \
    && python2.7 setup.py build install \
    && cd .. \
    && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz" \
    && rm -rf /var/lib/apt/lists/*


## Install ocropy ##
ENV OCROPY_RELEASE=1.3.3
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
# PACKAGES is read from the extracted ocropy directory, so the apt-get install
# has to run after the cd. $(cat PACKAGES) is intentionally unquoted so that
# each entry of the file becomes a separate package argument.
RUN apt-get update \
    && tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
    && cd "ocropy-${OCROPY_RELEASE}" \
    && apt-get install -y --no-install-recommends \
         python-pil \
         python-tk \
         $(cat PACKAGES) \
    && python2.7 setup.py install \
    && cd .. \
    && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz" \
    && rm -rf /var/lib/apt/lists/*


## Install Tesseract OCR ##
ENV TESSERACT_RELEASE=4.1.1
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
# Build from source with the autotools toolchain; ldconfig refreshes the shared
# library cache after "make install".
RUN apt-get update \
    && tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
    && cd "tesseract-${TESSERACT_RELEASE}" \
    && apt-get install -y --no-install-recommends \
         autoconf \
         automake \
         g++ \
         libjpeg62-turbo-dev \
         libleptonica-dev \
         libtiff5-dev \
         libtool \
         libpng-dev \
         make \
         pkg-config \
         zlib1g-dev \
    && ./autogen.sh \
    && ./configure \
    && make \
    && make install \
    && ldconfig \
    && cd - > /dev/null \
    && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz" \
    && rm -rf /var/lib/apt/lists/*

# Language models, fetched from the master branch of tessdata_best.
ADD "https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/frk.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata" \
    "https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata" \
    "/usr/local/share/tessdata/"
# Files that ADD downloads from a URL are created with mode 600; make the
# models readable by every user.
RUN chmod 644 /usr/local/share/tessdata/*.traineddata


## Install Pipeline ##
# --no-cache-dir keeps pip's download cache out of the layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
         ghostscript \
         python-pip \
         python3.7 \
         zip \
    && pip install --no-cache-dir natsort \
    && rm -rf /var/lib/apt/lists/*

COPY "hocrtotei" "ocr" "/usr/local/bin/"


# "ocr" is the container's executable; "--help" is only the default argument
# and is replaced by whatever arguments are passed to "docker run".
ENTRYPOINT ["ocr"]
CMD ["--help"]