mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 21:33:15 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			70 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
			
		
		
	
	
			70 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
# Image providing an OCR toolchain: Tesseract (with extra language models),
# OCRopus (ocropy), pyFlow and the PDF/image helper tools installed below,
# plus the project's ocr_pyflow and parse_hocr scripts.
FROM ubuntu:18.04

# MAINTAINER is deprecated; the equivalent metadata is carried by a label.
LABEL maintainer="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"

# Suppress interactive prompts from apt/dpkg during package installation.
ENV DEBIAN_FRONTEND=noninteractive
# Locale used by the container; it is generated further down via locale-gen.
ENV LANG=en_US.UTF-8
# pyFlow release that is downloaded and installed below.
ENV PYFLOW_VERSION=1.1.20

# gnupg2 is required by "apt-key adv" in the next step.
# The package index is removed at the end of every apt layer to keep the
# image small; each later apt layer therefore runs its own "apt-get update".
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gnupg2 && \
    rm -rf /var/lib/apt/lists/*

# Add PPA for pdftk
RUN echo "deb http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list && \
    echo "deb-src http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" >> /etc/apt/sources.list && \
    apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BCA68C33DA36783B03E981C820D0BB61B700CE29

# Install the toolchain packages.
# ca-certificates is listed explicitly because the downloads below use HTTPS.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ca-certificates \
    ghostscript \
    git \
    imagemagick \
    libtiff-tools \
    locales \
    pdftk \
    poppler-utils \
    python2.7 \
    python3.6 \
    python-pip \
    python-tk \
    tesseract-ocr \
    wget && \
    rm -rf /var/lib/apt/lists/*

# Configure locales
RUN locale-gen "$LANG"

WORKDIR /root

# Install pyFlow
# The extracted source directory is kept (as before); only the tarball is removed.
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install && \
    cd /root && \
    rm pyflow-"$PYFLOW_VERSION".tar.gz

# Install Tesseract OCR Data Files
# deu_frak is fetched from the tessdata repository at tag 3.04.00; all other
# models come from tessdata_best (master).
RUN wget -nv -P /usr/share/tesseract-ocr/4.00/tessdata \
    https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata \
    https://github.com/tesseract-ocr/tessdata/raw/3.04.00/deu_frak.traineddata \
    https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata \
    https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata \
    https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata \
    https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata

# Install OCRopus
# "apt-get update" runs in the same layer as the install so the package index
# is never stale or missing. $(cat PACKAGES) is intentionally unquoted so that
# every package name listed in the file becomes its own argument.
# NOTE(review): the default model is still fetched over plain HTTP because it
# is not confirmed that www.tmbdev.net serves HTTPS - switch if it does.
RUN git clone https://github.com/tmbdev/ocropy && \
    cd ocropy && \
    apt-get update && \
    apt-get install -y --no-install-recommends $(cat PACKAGES) && \
    rm -rf /var/lib/apt/lists/* && \
    pip install --no-cache-dir -r requirements.txt && \
    wget -nv http://www.tmbdev.net/en-default.pyrnn.gz && \
    mv en-default.pyrnn.gz models/ && \
    python2.7 setup.py install

# Install the project's executables; the trailing slash marks the
# destination as a directory.
COPY ocr_pyflow parse_hocr /usr/local/bin/

# Declared as a volume mount point for the files to be processed.
VOLUME /root/files_for_ocr

CMD ["/bin/bash"]