mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 21:33:15 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			87 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
			
		
		
	
	
			87 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
# Base image: the slim variant of Debian "buster".
FROM debian:buster-slim


# Image metadata: the authors of this pipeline image.
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"


# Locale used by every later build step and by the running container.
ENV LANG="C.UTF-8"


# Install the command-line tools used by the pipeline, plus Python 3 and lxml.
# `--no-install-recommends` keeps optional packages out of the image.
# `--no-cache-dir` stops pip from leaving its download cache in this layer.
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ghostscript \
      procps \
      python3.7 \
      python3-pip \
      rename \
      wget \
      zip \
 && python3 -m pip install --no-cache-dir lxml

# Install the OCR pipeline and its dependencies #
## Install pyFlow ##
# Downloads the pyFlow release tarball from GitHub and installs it with
# Python 2.7, then removes the unpacked sources and the tarball.
#
# The package index is refreshed inside this RUN so the install never depends
# on package lists cached in an older layer. ca-certificates is installed
# before the download so wget can verify GitHub's TLS certificate; the archive
# is executed as root via setup.py, so it must not be fetched unverified.
ENV PYFLOW_VERSION=1.1.20
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ca-certificates \
      python2.7 \
 && wget --quiet \
      "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
 && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
 && cd "pyflow-${PYFLOW_VERSION}" \
 && python2.7 setup.py build install \
 && cd - > /dev/null \
 && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"


## Install ocropy ##
# Downloads the ocropy source archive from GitHub, installs the Debian
# packages it needs, installs ocropy with Python 2.7, then removes the
# unpacked sources and the tarball.
#
# The package index is refreshed inside this RUN so the installs never depend
# on package lists cached in an older layer. ca-certificates is installed
# before the download so wget can verify GitHub's TLS certificate; the archive
# is executed as root via setup.py, so it must not be fetched unverified.
ENV OCROPY_VERSION=1.3.3
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      ca-certificates \
      python2.7 \
      python-pil \
      python-tk \
 && wget --quiet \
      "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
 && tar -xzf "v${OCROPY_VERSION}.tar.gz" \
 && cd "ocropy-${OCROPY_VERSION}" \
 # The PACKAGES file ships inside the archive, so this second install can
 # only run after the tarball has been unpacked.
 && apt-get install --no-install-recommends --yes \
      $(cat PACKAGES) \
 && python2.7 setup.py install \
 && cd - > /dev/null \
 && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"


## Install Tesseract OCR ##
# Downloads the Tesseract source archive from GitHub, installs the build
# toolchain and image libraries, builds and installs Tesseract as a static
# build without OpenMP, then removes the unpacked sources and the tarball.
#
# The package index is refreshed inside this RUN so the install never depends
# on package lists cached in an older layer. ca-certificates is installed
# before the download so wget can verify GitHub's TLS certificate; the archive
# is compiled and installed as root, so it must not be fetched unverified.
ENV TESSERACT_VERSION=5.0.0
RUN apt-get update \
 && apt-get install --no-install-recommends --yes \
      autoconf \
      automake \
      ca-certificates \
      g++ \
      libjpeg62-turbo-dev \
      libleptonica-dev \
      libtiff5-dev \
      libtool \
      libpng-dev \
      make \
      pkg-config \
      zlib1g-dev \
 && wget --quiet \
      "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
 && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
 && cd "tesseract-${TESSERACT_VERSION}" \
 && ./autogen.sh \
 && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
 && make \
 && make install \
 && ldconfig \
 && cd - > /dev/null \
 && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"


# Remove the downloaded apt package indexes from the final filesystem.
# `-f` keeps the build from failing when the glob matches nothing, e.g. if the
# lists were already cleaned up earlier.
# NOTE(review): this runs as its own layer, so it hides the files but does not
# shrink the earlier layers that wrote them; reclaiming that space would mean
# cleaning up inside each RUN that calls apt-get.
RUN rm -rf /var/lib/apt/lists/*


## Install Pipeline ##
# Copy the three pipeline files from the build context into /usr/local/bin.
COPY ["hocr2tei", "hocr-combine", "ocr", "/usr/local/bin/"]


# The container runs `ocr`; `--help` is the default argument when the caller
# supplies none.
ENTRYPOINT ["ocr"]
CMD ["--help"]