compile tesseract in container

This commit is contained in:
Patrick Jentsch 2020-09-21 14:46:03 +02:00
parent 82285a8e6c
commit 7cc8ebd666
3 changed files with 102 additions and 60 deletions

View File

@ -9,36 +9,62 @@ variables:
stages: stages:
- build - build
- push - push
- clean
before_script: before_script:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
Build: build_image:
script: script:
- docker build --pull -t $CI_REGISTRY_IMAGE:tmp . - docker build -t $INTERMEDIATE_IMAGE_TAG .
- docker push $CI_REGISTRY_IMAGE:tmp - docker push $INTERMEDIATE_IMAGE_TAG
stage: build stage: build
tags: tags:
- docker - docker
variables:
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
Push latest: push_master:
only: only:
- master - master
script: script:
- docker pull $CI_REGISTRY_IMAGE:tmp - docker pull $INTERMEDIATE_IMAGE_TAG
- docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
- docker push $CI_REGISTRY_IMAGE:latest - docker push $IMAGE_TAG
stage: push stage: push
tags: tags:
- docker - docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
Push tag: push_other:
except:
- master
only: only:
- branches
- tags - tags
script: script:
- docker pull $CI_REGISTRY_IMAGE:tmp - docker pull $INTERMEDIATE_IMAGE_TAG
- docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME - docker push $IMAGE_TAG
stage: push stage: push
tags: tags:
- docker - docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
delete_image:
before_script:
- apk add --no-cache curl
- curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg
- echo "$REG_SHA256  /usr/local/bin/reg" | sha256sum -c -
- chmod a+x /usr/local/bin/reg
script:
- /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
stage: clean
variables:
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228
REG_VERSION: 0.16.1

View File

@ -1,73 +1,88 @@
FROM debian:10-slim FROM debian:10-slim
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de" LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"
ENV LANG=C.UTF-8 ENV LANG=C.UTF-8
RUN apt-get update \ RUN apt-get update
## Install pyFlow ##
ENV PYFLOW_RELEASE=1.1.20
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
&& cd "pyflow-${PYFLOW_RELEASE}" \
&& apt-get install -y --no-install-recommends \ && apt-get install -y --no-install-recommends \
apt-transport-https \
build-essential \
ca-certificates \
gnupg2 \
ghostscript \
python2.7 \ python2.7 \
python3.7 \ && python2.7 setup.py build install \
wget \ && cd .. \
zip && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
ENV OCROPY_VERSION 1.3.3 ## Install ocropy ##
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" . ENV OCROPY_RELEASE 1.3.3
RUN tar -xzf "v${OCROPY_VERSION}.tar.gz" \ ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
&& cd "ocropy-${OCROPY_VERSION}" \ RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
&& apt-get update \ && cd "ocropy-${OCROPY_RELEASE}" \
&& apt-get install -y --no-install-recommends \ && apt-get install -y --no-install-recommends \
python-pil \ python-pil \
python-tk \ python-tk \
$(cat PACKAGES) \ $(cat PACKAGES) \
&& python2.7 setup.py install \ && python2.7 setup.py install \
&& cd .. \ && cd .. \
&& rm -rf \ && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
"ocropy-${OCROPY_VERSION}" \
"v${OCROPY_VERSION}.tar.gz"
ENV PYFLOW_VERSION=1.1.20 ## Install Tesseract OCR ##
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" . ENV TESSERACT_RELEASE=4.1.1
RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
&& cd "pyflow-${PYFLOW_VERSION}" \ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
&& python2.7 setup.py build install \ && cd "tesseract-${TESSERACT_RELEASE}" \
&& cd .. \
&& rm -rf \
"pyflow-${PYFLOW_VERSION}" \
"pyflow-${PYFLOW_VERSION}.tar.gz"
RUN echo "deb https://notesalexp.org/tesseract-ocr/buster/ buster main" >> /etc/apt/sources.list \
&& wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \
&& apt-get update \
&& apt-get install -y --no-install-recommends \ && apt-get install -y --no-install-recommends \
tesseract-ocr \ autoconf \
tesseract-ocr-deu \ automake \
tesseract-ocr-eng \ g++ \
tesseract-ocr-enm \ libjpeg62-turbo-dev \
tesseract-ocr-fra \ libleptonica-dev \
tesseract-ocr-frk \ libtiff5-dev \
tesseract-ocr-frm \ libtool \
tesseract-ocr-ita \ libpng-dev \
tesseract-ocr-por \ make \
tesseract-ocr-spa pkg-config \
zlib1g-dev \
&& ./autogen.sh \
&& ./configure \
&& make \
&& make install \
&& ldconfig \
&& cd - > /dev/null \
&& rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
ADD "https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/frk.traineddata" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata" \
"https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata" \
"/usr/local/share/tessdata/"
RUN chmod 644 /usr/local/share/tessdata/*.traineddata
RUN rm -rf /var/lib/apt/lists/* ## Install Pipeline ##
RUN apt-get install -y --no-install-recommends \
ghostscript \
python3.7 \
zip
COPY "hocrtotei" "ocr" "/usr/local/bin/"
COPY hocrtotei /usr/local/bin ## Cleanup ##
COPY ocr /usr/local/bin RUN rm -r /var/lib/apt/lists/*
ENTRYPOINT ["ocr"] ENTRYPOINT ["ocr"]

3
ocr
View File

@ -6,7 +6,8 @@
ocr ocr
Usage: For usage instructions run with option --help Usage: For usage instructions run with option --help
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de> Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
Stephan Porada <sporada@uni-bielefeld.de>
""" """