mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 21:13:16 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			98 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
			
		
		
	
	
			98 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
| # Base image: Debian 10 (buster), slim variant.
 | |
| FROM debian:buster-slim
 | |
| 
 | |
| 
 | |
| LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
 | |
| 
 | |
| 
 | |
| # Use a UTF-8 locale for every subsequent build step and at runtime.
 | |
| ENV LANG="C.UTF-8"
 | |
| 
 | |
| 
 | |
| # Refresh the package index once (later RUN steps install from it) and
 | |
| # install wget, which the download steps below use to fetch source archives.
 | |
| RUN apt-get update \
 | |
|  && apt-get install -y --no-install-recommends wget
 | |
| 
 | |
| 
 | |
| # Install the OCR pipeline and its dependencies #
 | |
| ## Install pyFlow ##
 | |
| # Downloads the pyFlow release tarball and installs it with Python 2.7.
 | |
| # ca-certificates is installed before the download so that wget can verify
 | |
| # the server's TLS certificate: the archive's setup.py is executed as root
 | |
| # below, so the download must not skip certificate checks.
 | |
| ENV PYFLOW_VERSION=1.1.20
 | |
| RUN apt-get install --no-install-recommends --yes \
 | |
|       ca-certificates \
 | |
|       python2.7 \
 | |
|  && wget --quiet \
 | |
|       "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
 | |
|  && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
 | |
|  && cd "pyflow-${PYFLOW_VERSION}" \
 | |
|  && python2.7 setup.py build install \
 | |
|  && cd - > /dev/null \
 | |
|  && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
 | |
| 
 | |
| 
 | |
| ## Install ocropy ##
 | |
| # Downloads the ocropy source archive and installs it with Python 2.7.
 | |
| # ca-certificates is installed first so that wget can verify the server's
 | |
| # TLS certificate instead of skipping the check. The remaining packages are
 | |
| # installed after extraction because the list is read from the archive's
 | |
| # own PACKAGES file.
 | |
| ENV OCROPY_VERSION=1.3.3
 | |
| RUN apt-get install --no-install-recommends --yes \
 | |
|       ca-certificates \
 | |
|  && wget --quiet \
 | |
|       "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
 | |
|  && tar -xzf "v${OCROPY_VERSION}.tar.gz" \
 | |
|  && cd "ocropy-${OCROPY_VERSION}" \
 | |
|  && apt-get install --no-install-recommends --yes \
 | |
|       python2.7 \
 | |
|       python-pil \
 | |
|       python-tk \
 | |
|       $(cat PACKAGES) \
 | |
|  && python2.7 setup.py install \
 | |
|  && cd - > /dev/null \
 | |
|  && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
 | |
| 
 | |
| 
 | |
| ## Install Tesseract OCR ##
 | |
| # Builds Tesseract from the tagged source archive and installs it with
 | |
| # "make install". The build toolchain and ca-certificates are installed
 | |
| # before the download so that wget can verify the server's TLS certificate;
 | |
| # the downloaded sources are compiled and installed as root, so the
 | |
| # certificate check must not be skipped.
 | |
| ENV TESSERACT_VERSION=4.1.1
 | |
| RUN apt-get install --no-install-recommends --yes \
 | |
|       autoconf \
 | |
|       automake \
 | |
|       ca-certificates \
 | |
|       g++ \
 | |
|       libjpeg62-turbo-dev \
 | |
|       libleptonica-dev \
 | |
|       libtiff5-dev \
 | |
|       libtool \
 | |
|       libpng-dev \
 | |
|       make \
 | |
|       pkg-config \
 | |
|       zlib1g-dev \
 | |
|  && wget --quiet \
 | |
|       "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
 | |
|  && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
 | |
|  && cd "tesseract-${TESSERACT_VERSION}" \
 | |
|  && ./autogen.sh \
 | |
|  && ./configure \
 | |
|  && make \
 | |
|  && make install \
 | |
|  && ldconfig \
 | |
|  && cd - > /dev/null \
 | |
|  && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
 | |
| 
 | |
| # Comma-separated list of language models to install from tessdata_best.
 | |
| ENV TESSERACT_MODELS="ara,chi_tra,dan,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
 | |
| ENV TESSDATA_BEST_VERSION=4.1.0
 | |
| # Downloads the tessdata_best archive and moves the selected *.traineddata
 | |
| # files into /usr/local/share/tessdata/. ca-certificates is installed first
 | |
| # so that wget can verify the server's TLS certificate. Each mv is followed
 | |
| # by "|| exit 1" so that a missing model aborts the build; otherwise only
 | |
| # the last iteration's exit status would be seen by the "&&" chain.
 | |
| RUN apt-get install --no-install-recommends --yes \
 | |
|       ca-certificates \
 | |
|  && wget --quiet \
 | |
|       "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
 | |
|  && tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
 | |
|  && for tesseract_model in $(echo "${TESSERACT_MODELS}" | tr "," "\n"); do \
 | |
|       mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/" || exit 1; \
 | |
|     done \
 | |
|  && rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
 | |
| 
 | |
| 
 | |
| ## Further dependencies ##
 | |
| # Additional packages installed from the Debian repositories.
 | |
| RUN apt-get install -y --no-install-recommends \
 | |
|       ghostscript \
 | |
|       procps \
 | |
|       python3.7 \
 | |
|       rename \
 | |
|       zip
 | |
| 
 | |
| 
 | |
| ## Install Pipeline ##
 | |
| # Copy the two pipeline executables from the build context into the image.
 | |
| COPY ["hocrtotei", "ocr", "/usr/local/bin/"]
 | |
| 
 | |
| 
 | |
| # Drop the apt package lists that the earlier install steps relied on.
 | |
| RUN rm -r /var/lib/apt/lists/*
 | |
| 
 | |
| 
 | |
| # Run "ocr" as the container command; with no arguments it is called with --help.
 | |
| ENTRYPOINT ["ocr"]
 | |
| CMD ["--help"]
 |