mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 21:33:15 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			103 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
			
		
		
	
	
			103 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
# Base image: the slim variant of Debian buster.
FROM debian:buster-slim

# Image metadata naming the authors.
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"

# Run everything in the image under the C.UTF-8 locale.
ENV LANG=C.UTF-8

# Fetch the apt package index; the install steps further down rely on it.
RUN apt-get update
 | |
| 
 | |
| 
 | |
# Install pipeline dependencies #
## Install pyFlow ##
# The pyFlow release tarball is downloaded at build time and installed with
# its own setup.py under Python 2.7.
ENV PYFLOW_RELEASE=1.1.20
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
# The package index is refreshed in the same layer as the install so that a
# cached, outdated index from an earlier layer can never be used here.
# The unpacked sources and the tarball are removed once pyFlow is installed.
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      python2.7 \
 && tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
 && cd "pyflow-${PYFLOW_RELEASE}" \
 && python2.7 setup.py build install \
 && cd .. \
 && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
 | |
| 
 | |
| 
 | |
## Install ocropy ##
# The ocropy source archive is downloaded at build time and installed with
# its own setup.py under Python 2.7.
ENV OCROPY_RELEASE=1.3.3
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
# The package index is refreshed in the same layer as the install so that a
# cached, outdated index from an earlier layer can never be used here.
# The archive has to be unpacked before the install because part of the
# package list is read from the PACKAGES file shipped inside it.
# NOTE(review): python2.7 is not installed in this step; it is expected to be
# present from an earlier step or pulled in as a dependency - confirm.
RUN apt-get update \
 && tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
 && cd "ocropy-${OCROPY_RELEASE}" \
 && apt-get install --no-install-recommends --yes \
      python-pil \
      python-tk \
      $(cat PACKAGES) \
 && python2.7 setup.py install \
 && cd .. \
 && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
 | |
| 
 | |
| 
 | |
## Install Tesseract OCR ##
# Tesseract is compiled from the source archive of the pinned release.
ENV TESSERACT_RELEASE=4.1.1
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
# The package index is refreshed in the same layer as the install so that a
# cached, outdated index from an earlier layer can never be used here.
# Build sequence: install the toolchain and library headers, generate the
# configure script, configure, compile, install, then run ldconfig so the
# newly installed shared libraries are found by the dynamic linker.
# The source tree and the archive are removed afterwards.
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      autoconf \
      automake \
      g++ \
      libjpeg62-turbo-dev \
      libleptonica-dev \
      libtiff5-dev \
      libtool \
      libpng-dev \
      make \
      pkg-config \
      zlib1g-dev \
 && tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
 && cd "tesseract-${TESSERACT_RELEASE}" \
 && ./autogen.sh \
 && ./configure \
 && make \
 && make install \
 && ldconfig \
 && cd .. \
 && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
 | |
| 
 | |
# Trained language models from the tessdata_best archive of the pinned release.
ENV TESSDATA_BEST_RELEASE=4.1.0
ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" .
# Only the models listed in the loop are kept; to support another language,
# add its code to the list. Each model is moved into
# /usr/local/share/tessdata/, and the build aborts on the first model that
# cannot be moved. The rest of the unpacked archive and the tarball are
# removed afterwards.
# NOTE(review): the tarball fetched by ADD stays in that ADD layer even though
# it is deleted here, so it still counts towards the image size.
RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \
 && for lang in \
      ara \
      chi_tra \
      dan \
      deu \
      ell \
      eng \
      enm \
      fra \
      frk \
      frm \
      ita \
      por \
      rus \
      spa \
    ; do \
      mv "tessdata_best-${TESSDATA_BEST_RELEASE}/${lang}.traineddata" "/usr/local/share/tessdata/" || exit 1; \
    done \
 && rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz"
 | |
| 
 | |
| 
 | |
## Further dependencies ##
# Remaining distribution packages plus the natsort Python package from PyPI.
# The package index is refreshed in the same layer as the install so that a
# cached, outdated index from an earlier layer can never be used here.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ghostscript \
      python-pip \
      python3.7 \
      zip \
 && pip install --no-cache-dir natsort
 | |
| 
 | |
| 
 | |
## Install Pipeline ##
# Copy the two pipeline files, hocrtotei and ocr, from the build context
# into /usr/local/bin/.
COPY hocrtotei ocr /usr/local/bin/

# Delete the downloaded apt package lists.
# NOTE(review): this runs in its own layer, so the lists written by earlier
# layers still count towards the image size - confirm whether that matters.
RUN rm -r /var/lib/apt/lists/*

# The container runs "ocr"; with no arguments given it runs "ocr --help".
ENTRYPOINT ["ocr"]
CMD ["--help"]
 |