mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-13 22:30:33 +00:00
compile tesseract in container
This commit is contained in:
parent
82285a8e6c
commit
7cc8ebd666
@ -9,36 +9,62 @@ variables:
|
|||||||
stages:
|
stages:
|
||||||
- build
|
- build
|
||||||
- push
|
- push
|
||||||
|
- clean
|
||||||
|
|
||||||
before_script:
|
before_script:
|
||||||
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
|
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
|
||||||
|
|
||||||
Build:
|
build_image:
|
||||||
script:
|
script:
|
||||||
- docker build --pull -t $CI_REGISTRY_IMAGE:tmp .
|
- docker build -t $INTERMEDIATE_IMAGE_TAG .
|
||||||
- docker push $CI_REGISTRY_IMAGE:tmp
|
- docker push $INTERMEDIATE_IMAGE_TAG
|
||||||
stage: build
|
stage: build
|
||||||
tags:
|
tags:
|
||||||
- docker
|
- docker
|
||||||
|
variables:
|
||||||
|
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
|
||||||
|
|
||||||
Push latest:
|
push_master:
|
||||||
only:
|
only:
|
||||||
- master
|
- master
|
||||||
script:
|
script:
|
||||||
- docker pull $CI_REGISTRY_IMAGE:tmp
|
- docker pull $INTERMEDIATE_IMAGE_TAG
|
||||||
- docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest
|
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
|
||||||
- docker push $CI_REGISTRY_IMAGE:latest
|
- docker push $IMAGE_TAG
|
||||||
stage: push
|
stage: push
|
||||||
tags:
|
tags:
|
||||||
- docker
|
- docker
|
||||||
|
variables:
|
||||||
|
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
|
||||||
|
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
|
||||||
|
|
||||||
Push tag:
|
push_other:
|
||||||
|
except:
|
||||||
|
- master
|
||||||
only:
|
only:
|
||||||
|
- branches
|
||||||
- tags
|
- tags
|
||||||
script:
|
script:
|
||||||
- docker pull $CI_REGISTRY_IMAGE:tmp
|
- docker pull $INTERMEDIATE_IMAGE_TAG
|
||||||
- docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
|
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
|
||||||
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
|
- docker push $IMAGE_TAG
|
||||||
stage: push
|
stage: push
|
||||||
tags:
|
tags:
|
||||||
- docker
|
- docker
|
||||||
|
variables:
|
||||||
|
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
|
||||||
|
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
|
||||||
|
|
||||||
|
delete_image:
|
||||||
|
before_script:
|
||||||
|
- apk add --no-cache curl
|
||||||
|
- curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg
|
||||||
|
- echo "$REG_SHA256 /usr/local/bin/reg" | sha256sum -c -
|
||||||
|
- chmod a+x /usr/local/bin/reg
|
||||||
|
script:
|
||||||
|
- /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
|
||||||
|
stage: clean
|
||||||
|
variables:
|
||||||
|
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
|
||||||
|
REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228
|
||||||
|
REG_VERSION: 0.16.1
|
||||||
|
105
Dockerfile
105
Dockerfile
@ -1,73 +1,88 @@
|
|||||||
FROM debian:10-slim
|
FROM debian:10-slim
|
||||||
|
|
||||||
|
|
||||||
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
|
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"
|
||||||
|
|
||||||
|
|
||||||
ENV LANG=C.UTF-8
|
ENV LANG=C.UTF-8
|
||||||
|
|
||||||
|
|
||||||
RUN apt-get update \
|
RUN apt-get update
|
||||||
|
|
||||||
|
|
||||||
|
## Install pyFlow ##
|
||||||
|
ENV PYFLOW_RELEASE=1.1.20
|
||||||
|
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
|
||||||
|
RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
|
||||||
|
&& cd "pyflow-${PYFLOW_RELEASE}" \
|
||||||
&& apt-get install -y --no-install-recommends \
|
&& apt-get install -y --no-install-recommends \
|
||||||
apt-transport-https \
|
|
||||||
build-essential \
|
|
||||||
ca-certificates \
|
|
||||||
gnupg2 \
|
|
||||||
ghostscript \
|
|
||||||
python2.7 \
|
python2.7 \
|
||||||
python3.7 \
|
&& python2.7 setup.py build install \
|
||||||
wget \
|
&& cd .. \
|
||||||
zip
|
&& rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
|
||||||
|
|
||||||
|
|
||||||
ENV OCROPY_VERSION 1.3.3
|
## Install ocropy ##
|
||||||
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" .
|
ENV OCROPY_RELEASE 1.3.3
|
||||||
RUN tar -xzf "v${OCROPY_VERSION}.tar.gz" \
|
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
|
||||||
&& cd "ocropy-${OCROPY_VERSION}" \
|
RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
|
||||||
&& apt-get update \
|
&& cd "ocropy-${OCROPY_RELEASE}" \
|
||||||
&& apt-get install -y --no-install-recommends \
|
&& apt-get install -y --no-install-recommends \
|
||||||
python-pil \
|
python-pil \
|
||||||
python-tk \
|
python-tk \
|
||||||
$(cat PACKAGES) \
|
$(cat PACKAGES) \
|
||||||
&& python2.7 setup.py install \
|
&& python2.7 setup.py install \
|
||||||
&& cd .. \
|
&& cd .. \
|
||||||
&& rm -rf \
|
&& rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
|
||||||
"ocropy-${OCROPY_VERSION}" \
|
|
||||||
"v${OCROPY_VERSION}.tar.gz"
|
|
||||||
|
|
||||||
|
|
||||||
ENV PYFLOW_VERSION=1.1.20
|
## Install Tesseract OCR ##
|
||||||
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" .
|
ENV TESSERACT_RELEASE=4.1.1
|
||||||
RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
|
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
|
||||||
&& cd "pyflow-${PYFLOW_VERSION}" \
|
RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
|
||||||
&& python2.7 setup.py build install \
|
&& cd "tesseract-${TESSERACT_RELEASE}" \
|
||||||
&& cd .. \
|
|
||||||
&& rm -rf \
|
|
||||||
"pyflow-${PYFLOW_VERSION}" \
|
|
||||||
"pyflow-${PYFLOW_VERSION}.tar.gz"
|
|
||||||
|
|
||||||
|
|
||||||
RUN echo "deb https://notesalexp.org/tesseract-ocr/buster/ buster main" >> /etc/apt/sources.list \
|
|
||||||
&& wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \
|
|
||||||
&& apt-get update \
|
|
||||||
&& apt-get install -y --no-install-recommends \
|
&& apt-get install -y --no-install-recommends \
|
||||||
tesseract-ocr \
|
autoconf \
|
||||||
tesseract-ocr-deu \
|
automake \
|
||||||
tesseract-ocr-eng \
|
g++ \
|
||||||
tesseract-ocr-enm \
|
libjpeg62-turbo-dev \
|
||||||
tesseract-ocr-fra \
|
libleptonica-dev \
|
||||||
tesseract-ocr-frk \
|
libtiff5-dev \
|
||||||
tesseract-ocr-frm \
|
libtool \
|
||||||
tesseract-ocr-ita \
|
libpng-dev \
|
||||||
tesseract-ocr-por \
|
make \
|
||||||
tesseract-ocr-spa
|
pkg-config \
|
||||||
|
zlib1g-dev \
|
||||||
|
&& ./autogen.sh \
|
||||||
|
&& ./configure \
|
||||||
|
&& make \
|
||||||
|
&& make install \
|
||||||
|
&& ldconfig \
|
||||||
|
&& cd - > /dev/null \
|
||||||
|
&& rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
|
||||||
|
ADD "https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata" \
|
||||||
|
"https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata" \
|
||||||
|
"https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata" \
|
||||||
|
"https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata" \
|
||||||
|
"https://github.com/tesseract-ocr/tessdata_best/raw/master/frk.traineddata" \
|
||||||
|
"https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata" \
|
||||||
|
"https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata" \
|
||||||
|
"https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata" \
|
||||||
|
"https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata" \
|
||||||
|
"/usr/local/share/tessdata/"
|
||||||
|
RUN chmod 644 /usr/local/share/tessdata/*.traineddata
|
||||||
|
|
||||||
|
|
||||||
RUN rm -rf /var/lib/apt/lists/*
|
## Install Pipeline ##
|
||||||
|
RUN apt-get install -y --no-install-recommends \
|
||||||
|
ghostscript \
|
||||||
|
python3.7 \
|
||||||
|
zip
|
||||||
|
COPY "hocrtotei" "ocr" "/usr/local/bin/"
|
||||||
|
|
||||||
|
|
||||||
COPY hocrtotei /usr/local/bin
|
## Cleanup ##
|
||||||
COPY ocr /usr/local/bin
|
RUN rm -r /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|
||||||
ENTRYPOINT ["ocr"]
|
ENTRYPOINT ["ocr"]
|
||||||
|
3
ocr
3
ocr
@ -6,7 +6,8 @@
|
|||||||
ocr
|
ocr
|
||||||
|
|
||||||
Usage: For usage instructions run with option --help
|
Usage: For usage instructions run with option --help
|
||||||
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
|
Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
|
||||||
|
Stephan Porada <sporada@uni-bielefeld.de>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user