mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 21:43:14 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			60 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
			
		
		
	
	
			60 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
# Base image for the OCR pipeline.
# NOTE(review): Debian stretch is past its end-of-life; confirm that the apt
# repositories used by the later build steps are still reachable before
# rebuilding this image.
FROM debian:stretch

# The MAINTAINER instruction is deprecated; the maintainer is recorded as an
# image label instead (visible via `docker inspect`).
LABEL maintainer="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"

# Prevent debconf from asking interactive questions during package installs.
ENV DEBIAN_FRONTEND=noninteractive
# Use a UTF-8 locale for every process started in the image.
ENV LANG=C.UTF-8
 | |
| 
 | |
# Base tooling: HTTPS transport and key handling for apt, image/PDF utilities,
# the Python interpreters, and wget for the downloads done in later steps.
# The apt package lists are intentionally kept in this layer; later build
# steps install additional packages.
RUN set -e; \
    apt-get update; \
    apt-get install --yes --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        gnupg2 \
        imagemagick \
        libtiff-tools \
        pdftk \
        poppler-utils \
        python2.7 \
        python3.5 \
        python-numpy \
        wget
 | |
| 
 | |
# All source archives below are downloaded to and unpacked in /root.
WORKDIR /root

# Install ocropy
# The release tarball is fetched from GitHub, unpacked, and installed with
# Python 2.7. The PACKAGES file inside the extracted tree is read and its
# contents are passed to apt-get as package names, together with python-pil
# and python-tk.
ENV OCROPY_VERSION=1.3.3
# apt-get update runs in the same RUN as apt-get install so the install never
# depends on package lists cached in an earlier layer.
# NOTE(review): the default model is downloaded over plain HTTP without any
# integrity check; confirm whether an HTTPS source or a checksum is available.
RUN wget -nv https://github.com/tmbdev/ocropy/archive/v"$OCROPY_VERSION".tar.gz && \
    tar -xzf v"$OCROPY_VERSION".tar.gz && \
    rm v"$OCROPY_VERSION".tar.gz && \
    cd ocropy-"$OCROPY_VERSION" && \
    apt-get update && \
    apt-get install -y --no-install-recommends $(cat PACKAGES) python-pil python-tk && \
    wget -nv http://www.tmbdev.net/en-default.pyrnn.gz -P models/ && \
    python2.7 setup.py install
 | |
| 
 | |
# Install pyFlow
# The release tarball is fetched from GitHub, unpacked in the current working
# directory, and built and installed with Python 2.7. The archive is removed
# after extraction; the extracted source tree is left in place.
ENV PYFLOW_VERSION=1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    rm pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install
 | |
| 
 | |
# Install Tesseract OCR and Data Files
# 1. Register the notesalexp.org apt repository and its signing key. The key
#    is downloaded to a file first so that a failed download aborts the build;
#    piping wget into apt-key would report only apt-key's exit status.
# 2. Install tesseract-ocr from the refreshed package lists.
# 3. Download the trained language models into the tessdata directory. All
#    models come from tessdata_best except deu_frak, which is taken from the
#    tessdata repository. `|| exit 1` makes a failed download inside the loop
#    fail the whole RUN instead of being masked by later iterations.
# NOTE(review): the models are fetched from the `master` branch, so rebuilds
# are not pinned to a fixed model revision.
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
    wget -nv -O /tmp/alexp_key.asc https://notesalexp.org/debian/alexp_key.asc && \
    apt-key add /tmp/alexp_key.asc && \
    rm /tmp/alexp_key.asc && \
    apt-get update && \
    apt-get install -y --no-install-recommends tesseract-ocr && \
    tessdata_dir=/usr/share/tesseract-ocr/4.00/tessdata && \
    for lang in deu eng enm fra spa frm; do \
        wget -nv https://github.com/tesseract-ocr/tessdata_best/raw/master/"$lang".traineddata -P "$tessdata_dir" || exit 1; \
    done && \
    wget -nv https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata -P "$tessdata_dir"
 | |
| 
 | |
# Place the two entry-point scripts from the build context on the PATH.
COPY ocr_pyflow parse_hocr /usr/local/bin/

# Start a shell when the container is run without an explicit command.
CMD ["/bin/bash"]