# Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
# Synced 2024-12-26 15:14:18 +00:00
# Dockerfile, 67 lines, 2.4 KiB
|
# Base image: Ubuntu 18.04. The PPA entries added further down in this file
# target the matching "bionic" release.
FROM ubuntu:18.04
|
||
|
|
||
|
# Image author. MAINTAINER is deprecated, so the same information is recorded
# with the standard OCI "authors" label instead.
LABEL org.opencontainers.image.authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"
|
||
|
|
||
|
# Build-time only: keeps apt/debconf from prompting during the RUN steps of
# this build without leaking the setting into the environment of containers
# started from the image.
ARG DEBIAN_FRONTEND=noninteractive

# LANG is the locale handed to locale-gen later in this file; PYFLOW_VERSION
# selects the pyFlow release that the install step downloads.
ENV LANG=en_US.UTF-8 \
    PYFLOW_VERSION=1.1.20
|
||
|
|
||
|
# Install GnuPG first: the apt-key call in the following step uses it to import
# the PPA signing key. The package lists are removed again in the same layer to
# keep it small; the main package installation refreshes them itself.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        gnupg2 && \
    rm -rf /var/lib/apt/lists/*
|
||
|
|
||
|
# Add PPA for pdftk
# Appends the binary and source entries of the malteworld PPA (bionic) to the
# system-wide sources.list and imports the PPA key from the Ubuntu keyserver.
RUN printf '%s\n' \
        "deb http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" \
        "deb-src http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" \
        >> /etc/apt/sources.list && \
    apt-key adv \
        --keyserver keyserver.ubuntu.com \
        --recv-keys BCA68C33DA36783B03E981C820D0BB61B700CE29
|
||
|
|
||
|
# Install the OCR toolchain and its helper tools. "apt-get update" and
# "apt-get install" share one layer so a cached, stale package index is never
# combined with a changed package list. Packages are sorted alphabetically,
# one per line.
#
# ca-certificates is requested explicitly: the wget and git steps later in this
# file end up talking HTTPS to GitHub, and with --no-install-recommends the CA
# bundle would otherwise only be present if another package happens to depend
# on it.
#
# The package lists are not removed here; a later step still installs
# additional packages.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        ghostscript \
        git \
        imagemagick \
        libtiff-tools \
        locales \
        pdftk \
        poppler-utils \
        python2.7 \
        python3.6 \
        tesseract-ocr \
        wget
|
||
|
|
||
|
# Configure locales
# Generate the locale named by the LANG variable set near the top of this file.
RUN locale-gen "${LANG}"
|
||
|
|
||
|
# All following RUN steps start in /root: the pyFlow tarball is downloaded and
# unpacked here, and the ocropy repository is cloned here.
WORKDIR /root
|
||
|
|
||
|
# Install pyFlow
# Downloads the release tarball selected by PYFLOW_VERSION, unpacks it, deletes
# the tarball and then builds and installs pyFlow with Python 2.7. The unpacked
# source tree is left in the working directory.
# The download goes to GitHub over HTTPS directly instead of starting with a
# plain-HTTP request and relying on the redirect.
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    rm pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install
|
||
|
|
||
|
# Install Tesseract OCR Data Files
# Fetches the trained models for deu, eng, enm, fra and frm from the
# tessdata_best repository (master branch) plus deu_frak from the 3.04.00 tag
# of the tessdata repository, and stores them in the Tesseract 4.00 tessdata
# directory.
# All files are requested over HTTPS in a single wget call; wget exits non-zero
# if any of the downloads fails, which aborts the build.
RUN wget -nv -P /usr/share/tesseract-ocr/4.00/tessdata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata \
        https://github.com/tesseract-ocr/tessdata/raw/3.04.00/deu_frak.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata
|
||
|
|
||
|
# Install OCRopus
# Clones ocropy, installs the system packages listed in its PACKAGES file,
# downloads the default English model into models/ and installs ocropy with
# Python 2.7.
#
# - "apt-get update" runs in this layer so the install never depends on a
#   package index left behind (and possibly cached long ago) by an earlier
#   layer; the index is removed again afterwards to keep the layer small.
# - $(cat PACKAGES) is intentionally unquoted: the shell must split the file
#   content into individual package names.
# - The model URL is kept on plain HTTP as in the original.
#   NOTE(review): switch to HTTPS if www.tmbdev.net serves it - confirm.
RUN git clone https://github.com/tmbdev/ocropy && \
    cd ocropy && \
    apt-get update && \
    apt-get install -y --no-install-recommends $(cat PACKAGES) && \
    rm -rf /var/lib/apt/lists/* && \
    wget -nv -P models/ http://www.tmbdev.net/en-default.pyrnn.gz && \
    python2.7 setup.py install
|
||
|
|
||
|
# Copy ocr_pyflow and parse_hocr from the build context into /usr/local/bin.
COPY ocr_pyflow parse_hocr /usr/local/bin/
|
||
|
|
||
|
# Declare /root/files_for_ocr as a volume mount point.
# NOTE(review): presumably where input/output files are exchanged with the
# host - confirm against the ocr_pyflow script.
VOLUME /root/files_for_ocr
|
||
|
|
||
|
# Default command (exec form): start a bash shell when no other command is
# given to the container.
CMD ["/bin/bash"]
|