From 7cc8ebd666258b5ab1f08d7d092c654faaf8be2b Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Mon, 21 Sep 2020 14:46:03 +0200 Subject: [PATCH] compile tesseract in container --- .gitlab-ci.yml | 54 ++++++++++++++++++------- Dockerfile | 105 ++++++++++++++++++++++++++++--------------------- ocr | 3 +- 3 files changed, 102 insertions(+), 60 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ea08fd4..00d3567 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,36 +9,62 @@ variables: stages: - build - push + - clean before_script: - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY -Build: +build_image: script: - - docker build --pull -t $CI_REGISTRY_IMAGE:tmp . - - docker push $CI_REGISTRY_IMAGE:tmp + - docker build -t $INTERMEDIATE_IMAGE_TAG . + - docker push $INTERMEDIATE_IMAGE_TAG stage: build tags: - - docker + - docker + variables: + INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG -Push latest: +push_master: only: - master script: - - docker pull $CI_REGISTRY_IMAGE:tmp - - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest - - docker push $CI_REGISTRY_IMAGE:latest + - docker pull $INTERMEDIATE_IMAGE_TAG + - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG + - docker push $IMAGE_TAG stage: push tags: - - docker + - docker + variables: + IMAGE_TAG: $CI_REGISTRY_IMAGE:latest + INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG -Push tag: +push_other: + except: + - master only: + - branches - tags script: - - docker pull $CI_REGISTRY_IMAGE:tmp - - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME - - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME + - docker pull $INTERMEDIATE_IMAGE_TAG + - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG + - docker push $IMAGE_TAG stage: push tags: - - docker + - docker + variables: + IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME + INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG + +delete_image: + before_script: + - apk add 
--no-cache curl + - curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg + - echo "$REG_SHA256 /usr/local/bin/reg" | sha256sum -c - + - chmod a+x /usr/local/bin/reg + script: + - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG + stage: clean + variables: + INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG + REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228 + REG_VERSION: 0.16.1 diff --git a/Dockerfile b/Dockerfile index bc4ae93..3ed80ef 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,73 +1,88 @@ FROM debian:10-slim -LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de" +LABEL authors="Patrick Jentsch , Stephan Porada " ENV LANG=C.UTF-8 -RUN apt-get update \ +RUN apt-get update + + +## Install pyFlow ## +ENV PYFLOW_RELEASE=1.1.20 +ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" . +RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \ + && cd "pyflow-${PYFLOW_RELEASE}" \ && apt-get install -y --no-install-recommends \ - apt-transport-https \ - build-essential \ - ca-certificates \ - gnupg2 \ - ghostscript \ python2.7 \ - python3.7 \ - wget \ - zip + && python2.7 setup.py build install \ + && cd .. \ + && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz" -ENV OCROPY_VERSION 1.3.3 -ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" . -RUN tar -xzf "v${OCROPY_VERSION}.tar.gz" \ - && cd "ocropy-${OCROPY_VERSION}" \ - && apt-get update \ +## Install ocropy ## +ENV OCROPY_RELEASE 1.3.3 +ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" . +RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \ + && cd "ocropy-${OCROPY_RELEASE}" \ && apt-get install -y --no-install-recommends \ python-pil \ python-tk \ $(cat PACKAGES) \ && python2.7 setup.py install \ && cd .. 
\ - && rm -rf \ - "ocropy-${OCROPY_VERSION}" \ - "v${OCROPY_VERSION}.tar.gz" + && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz" -ENV PYFLOW_VERSION=1.1.20 -ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" . -RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ - && cd "pyflow-${PYFLOW_VERSION}" \ - && python2.7 setup.py build install \ - && cd .. \ - && rm -rf \ - "pyflow-${PYFLOW_VERSION}" \ - "pyflow-${PYFLOW_VERSION}.tar.gz" - - -RUN echo "deb https://notesalexp.org/tesseract-ocr/buster/ buster main" >> /etc/apt/sources.list \ - && wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \ - && apt-get update \ +## Install Tesseract OCR ## +ENV TESSERACT_RELEASE=4.1.1 +ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" . +RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \ + && cd "tesseract-${TESSERACT_RELEASE}" \ && apt-get install -y --no-install-recommends \ - tesseract-ocr \ - tesseract-ocr-deu \ - tesseract-ocr-eng \ - tesseract-ocr-enm \ - tesseract-ocr-fra \ - tesseract-ocr-frk \ - tesseract-ocr-frm \ - tesseract-ocr-ita \ - tesseract-ocr-por \ - tesseract-ocr-spa + autoconf \ + automake \ + g++ \ + libjpeg62-turbo-dev \ + libleptonica-dev \ + libtiff5-dev \ + libtool \ + libpng-dev \ + make \ + pkg-config \ + zlib1g-dev \ + && ./autogen.sh \ + && ./configure \ + && make \ + && make install \ + && ldconfig \ + && cd - > /dev/null \ + && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz" +ADD "https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata" \ + "https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata" \ + "https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata" \ + "https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata" \ + "https://github.com/tesseract-ocr/tessdata_best/raw/master/frk.traineddata" \ + 
"https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata" \ + "https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata" \ + "https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata" \ + "https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata" \ + "/usr/local/share/tessdata/" +RUN chmod 644 /usr/local/share/tessdata/*.traineddata -RUN rm -rf /var/lib/apt/lists/* +## Install Pipeline ## +RUN apt-get install -y --no-install-recommends \ + ghostscript \ + python3.7 \ + zip +COPY "hocrtotei" "ocr" "/usr/local/bin/" -COPY hocrtotei /usr/local/bin -COPY ocr /usr/local/bin +## Cleanup ## +RUN rm -r /var/lib/apt/lists/* ENTRYPOINT ["ocr"] diff --git a/ocr b/ocr index a857931..10ee021 100755 --- a/ocr +++ b/ocr @@ -6,7 +6,8 @@ ocr Usage: For usage instructions run with option --help -Author: Patrick Jentsch +Authors: Patrick Jentsch """