mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-07-01 10:10:34 +00:00
Clean up and make use of globbing for input files for binarization and OCR
This commit is contained in:
75
Dockerfile
75
Dockerfile
@ -7,41 +7,47 @@ LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <por
|
||||
ENV LANG=C.UTF-8
|
||||
|
||||
|
||||
RUN apt-get update
|
||||
RUN apt-get update \
|
||||
&& apt-get install --no-install-recommends --yes \
|
||||
wget
|
||||
|
||||
|
||||
# Install pipeline dependencies #
|
||||
# Install the OCR pipeline and its dependencies #
|
||||
## Install pyFlow ##
|
||||
ENV PYFLOW_RELEASE=1.1.20
|
||||
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
|
||||
RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
|
||||
&& cd "pyflow-${PYFLOW_RELEASE}" \
|
||||
ENV PYFLOW_VERSION=1.1.20
|
||||
RUN wget --no-check-certificate --quiet \
|
||||
"https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
|
||||
&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
|
||||
&& cd "pyflow-${PYFLOW_VERSION}" \
|
||||
&& apt-get install --no-install-recommends --yes \
|
||||
python2.7 \
|
||||
&& python2.7 setup.py build install \
|
||||
&& cd .. \
|
||||
&& rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
|
||||
&& cd - > /dev/null \
|
||||
&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
|
||||
|
||||
|
||||
## Install ocropy ##
|
||||
ENV OCROPY_RELEASE=1.3.3
|
||||
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
|
||||
RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
|
||||
&& cd "ocropy-${OCROPY_RELEASE}" \
|
||||
ENV OCROPY_VERSION=1.3.3
|
||||
RUN wget --no-check-certificate --quiet \
|
||||
"https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
|
||||
&& tar -xzf "v${OCROPY_VERSION}.tar.gz" \
|
||||
&& cd "ocropy-${OCROPY_VERSION}" \
|
||||
&& apt-get install --no-install-recommends --yes \
|
||||
python2.7 \
|
||||
python-pil \
|
||||
python-tk \
|
||||
$(cat PACKAGES) \
|
||||
&& python2.7 setup.py install \
|
||||
&& cd .. \
|
||||
&& rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
|
||||
&& cd - > /dev/null \
|
||||
&& rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
|
||||
|
||||
|
||||
## Install Tesseract OCR ##
|
||||
ENV TESSERACT_RELEASE=4.1.1
|
||||
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
|
||||
RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
|
||||
&& cd "tesseract-${TESSERACT_RELEASE}" \
|
||||
ENV TESSERACT_VERSION=4.1.1
|
||||
RUN wget --no-check-certificate --quiet \
|
||||
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
|
||||
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \
|
||||
&& cd "tesseract-${TESSERACT_VERSION}" \
|
||||
&& apt-get install --no-install-recommends --yes \
|
||||
autoconf \
|
||||
automake \
|
||||
@ -60,35 +66,24 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
|
||||
&& make install \
|
||||
&& ldconfig \
|
||||
&& cd - > /dev/null \
|
||||
&& rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
|
||||
&& rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
|
||||
|
||||
ENV TESSDATA_BEST_RELEASE=4.1.0
|
||||
ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" .
|
||||
RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ara.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/chi_tra.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/dan.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/deu.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ell.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/eng.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/enm.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/fra.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frk.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frm.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ita.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/por.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/rus.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/spa.traineddata" "/usr/local/share/tessdata/" \
|
||||
&& rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz"
|
||||
ENV TESSERACT_MODELS="ara,chi_tra,dan,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
|
||||
ENV TESSDATA_BEST_VERSION=4.1.0
|
||||
RUN wget --no-check-certificate --quiet \
|
||||
"https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
|
||||
&& tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
|
||||
&& for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \
|
||||
&& rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
|
||||
|
||||
|
||||
## Further dependencies ##
|
||||
RUN apt-get install --no-install-recommends --yes \
|
||||
procps \
|
||||
ghostscript \
|
||||
python-pip \
|
||||
python3.7 \
|
||||
zip \
|
||||
&& pip install natsort
|
||||
rename \
|
||||
zip
|
||||
|
||||
|
||||
## Install Pipeline ##
|
||||
|
Reference in New Issue
Block a user