mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-07-01 10:00:33 +00:00
Initial commit
This commit is contained in:
67
Dockerfile
Normal file
67
Dockerfile
Normal file
@ -0,0 +1,67 @@
|
||||
# Base image: Ubuntu 18.04 ("bionic"). The PPA entries added further down
# name the "bionic" suite, so the release must be changed in both places.
FROM ubuntu:18.04

# MAINTAINER is deprecated; the author is recorded as an OCI image label instead.
LABEL org.opencontainers.image.authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"

# Suppress interactive debconf prompts during the build only. Declared as ARG
# (not ENV) so the setting is visible to the RUN steps below but is not baked
# into the environment of containers started from this image.
ARG DEBIAN_FRONTEND=noninteractive

# LANG           - locale generated later with locale-gen and used at runtime.
# PYFLOW_VERSION - pyFlow release downloaded and installed further down.
ENV LANG=en_US.UTF-8 \
    PYFLOW_VERSION=1.1.20
|
||||
|
||||
# gnupg2 is installed ahead of everything else because the following step
# calls apt-key to import a repository signing key.
# The package lists are removed in the same layer to keep it small; the next
# package installation runs its own `apt-get update`.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        gnupg2 && \
    rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Register the malteworld PPA (binary and source entries for the "bionic"
# suite), which provides pdftk, and import the key its packages are signed with.
RUN printf '%s\n' \
        "deb http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" \
        "deb-src http://ppa.launchpad.net/malteworld/ppa/ubuntu bionic main" \
        >> /etc/apt/sources.list \
 && apt-key adv \
        --keyserver keyserver.ubuntu.com \
        --recv-keys BCA68C33DA36783B03E981C820D0BB61B700CE29
|
||||
|
||||
# System packages for the OCR pipeline, one per line in alphabetical order.
# python2.7 is the interpreter used by the setup.py installs further down;
# wget and git fetch the third-party sources and model files.
# NOTE: the apt package lists are deliberately left in place here; deleting
# them would break any later `apt-get install` that does not run its own
# `apt-get update` first.
RUN apt-get update \
 && apt-get install --yes --no-install-recommends \
        ghostscript \
        git \
        imagemagick \
        libtiff-tools \
        locales \
        pdftk \
        poppler-utils \
        python2.7 \
        python3.6 \
        tesseract-ocr \
        wget
|
||||
|
||||
# Generate the locale named by $LANG so the value exported via ENV is
# actually available inside the image.
RUN locale-gen "${LANG}"

# Relative paths in the remaining build steps resolve against /root.
WORKDIR /root
|
||||
|
||||
# Install pyFlow.
# Downloads the release tarball selected by $PYFLOW_VERSION, unpacks it in the
# current WORKDIR, installs it with the Python 2.7 interpreter and removes the
# tarball again in the same layer (the unpacked source tree is kept).
# The download uses https:// directly instead of starting with a plaintext
# http:// request that is only redirected to HTTPS afterwards.
# The build runs in a subshell so the working directory of the remaining
# commands is unaffected and no `cd` back is needed.
RUN wget -nv "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" && \
    tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" && \
    (cd "pyflow-${PYFLOW_VERSION}" && python2.7 setup.py build install) && \
    rm "pyflow-${PYFLOW_VERSION}.tar.gz"
|
||||
|
||||
# Install Tesseract OCR data files.
# deu, eng, enm, fra and frm come from the tessdata_best repository (master
# branch); deu_frak is fetched from the tessdata repository at tag 3.04.00.
# All files are stored in the Tesseract 4.00 tessdata directory.
# `set -e` aborts the build on the first failed download, matching the
# fail-fast behaviour of an `&&` chain. URLs use https:// so that no request
# is sent in plaintext before GitHub's redirect.
RUN set -e; \
    tessdata_dir=/usr/share/tesseract-ocr/4.00/tessdata; \
    for lang in deu eng enm fra frm; do \
        wget -nv -P "${tessdata_dir}" \
            "https://github.com/tesseract-ocr/tessdata_best/raw/master/${lang}.traineddata"; \
    done; \
    wget -nv -P "${tessdata_dir}" \
        "https://github.com/tesseract-ocr/tessdata/raw/3.04.00/deu_frak.traineddata"
|
||||
|
||||
# Install OCRopus (ocropy).
# - The repository is cloned over https:// rather than via a plaintext
#   request that is redirected afterwards.
# - `apt-get update` runs in this same layer so the install never depends on
#   package lists left behind (or cached) by an earlier layer, and the lists
#   are removed again at the end to keep the layer small.
# - $(cat PACKAGES) is intentionally unquoted: the PACKAGES file shipped in
#   the repository is expanded into one argument per package name.
# - The default English model is downloaded and moved into models/ before
#   setup.py is run.
# No trailing `cd` is needed: every RUN starts in the WORKDIR again.
RUN git clone https://github.com/tmbdev/ocropy && \
    cd ocropy && \
    apt-get update && \
    apt-get install -y --no-install-recommends $(cat PACKAGES) && \
    rm -rf /var/lib/apt/lists/* && \
    wget -nv http://www.tmbdev.net/en-default.pyrnn.gz && \
    mv en-default.pyrnn.gz models/ && \
    python2.7 setup.py install
|
||||
|
||||
# Place the two project scripts from the build context into /usr/local/bin.
# The trailing slash makes it explicit that the destination is a directory.
COPY ocr_pyflow parse_hocr /usr/local/bin/

# Mount point declared as a volume under /root.
VOLUME ["/root/files_for_ocr"]

# Default to an interactive shell when no command is given.
CMD ["/bin/bash"]
|
Reference in New Issue
Block a user