# Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
# Synced 2024-12-26 03:14:19 +00:00; Dockerfile, 49 lines, 2.1 KiB.
# Base image: the slim variant of Debian "stretch".
# NOTE(review): stretch is end-of-life upstream; confirm that its apt mirrors
# are still reachable before rebuilding this image.
FROM debian:stretch-slim

# MAINTAINER is deprecated; the same contact is recorded as a label instead.
LABEL maintainer="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"

# Set LANG to the C.UTF-8 locale for every following build step and at runtime.
ENV LANG=C.UTF-8

# Install the OS-level tools in a single layer.
# - python2.7 runs the pyFlow setup script and wget performs the downloads in
#   the later build steps of this file.
# - apt-transport-https, ca-certificates and gnupg2 are presumably needed for
#   the HTTPS apt repository and key import used in the Tesseract step.
# The package index is deleted in the same layer so it does not add to the
# image size; any later step that needs it runs its own `apt-get update`.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        gnupg2 \
        pdftk \
        poppler-utils \
        python2.7 \
        python3 \
        wget \
        zip && \
    rm -rf /var/lib/apt/lists/*

# All following relative paths (downloads, created directories) resolve
# against /root.
WORKDIR /root

# Install pyFlow
# Downloads the release tarball for PYFLOW_VERSION, builds and installs it
# with Python 2.7, then removes both the tarball and the unpacked sources in
# the same layer so neither is kept in the image.
# The build runs in a subshell, so the working directory is unchanged
# afterwards and no trailing `cd ..` is needed.
ENV PYFLOW_VERSION=1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    (cd pyflow-"$PYFLOW_VERSION" && python2.7 setup.py build install) && \
    rm -rf pyflow-"$PYFLOW_VERSION" pyflow-"$PYFLOW_VERSION".tar.gz

# Install Tesseract OCR and Data Files
# 1. Register the notesalexp.org apt repository for stretch and import its
#    signing key. The key is saved to a file first instead of being piped
#    into apt-key, so a failed download stops the build rather than being
#    masked by the pipe.
# 2. Install tesseract-ocr and drop the apt package index in the same layer.
# 3. Fetch the trained language models into the Tesseract 4.00 data directory
#    with a single wget call. deu_frak comes from the `tessdata` repository,
#    all others from `tessdata_best`.
# NOTE(review): the model URLs track the `master` branch and are therefore not
# pinned to a fixed revision.
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list && \
    wget -nv -O /tmp/alexp_key.asc https://notesalexp.org/debian/alexp_key.asc && \
    apt-key add /tmp/alexp_key.asc && \
    rm /tmp/alexp_key.asc && \
    apt-get update && \
    apt-get install -y --no-install-recommends tesseract-ocr && \
    rm -rf /var/lib/apt/lists/* && \
    wget -nv -P /usr/share/tesseract-ocr/4.00/tessdata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata \
        https://github.com/tesseract-ocr/tessdata/raw/master/deu_frak.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata \
        https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata

# Create the two directories named files_for_ocr and files_from_ocr.
# The paths are relative, so they are created inside the current working
# directory.
RUN mkdir files_for_ocr files_from_ocr

# Copy hocrtotei and ocr from the build context into /usr/local/bin.
# The trailing slash marks the destination explicitly as a directory, which
# is required when one COPY has several sources.
COPY hocrtotei ocr /usr/local/bin/

# Default command: start a bash shell (exec form, no wrapping /bin/sh -c).
CMD ["/bin/bash"]