mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-07-01 10:20:36 +00:00
Update to Tesseract 5.0.0, Set version 0.1.0
This commit is contained in:
39
Dockerfile
39
Dockerfile
@ -9,8 +9,14 @@ ENV LANG=C.UTF-8
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install --no-install-recommends --yes \
|
||||
wget
|
||||
|
||||
ghostscript \
|
||||
procps \
|
||||
python3.7 \
|
||||
python3-pip \
|
||||
rename \
|
||||
wget \
|
||||
zip \
|
||||
&& python3 -m pip install lxml
|
||||
|
||||
# Install the OCR pipeline and its dependencies #
|
||||
## Install pyFlow ##
|
||||
@ -43,7 +49,7 @@ RUN wget --no-check-certificate --quiet \
|
||||
|
||||
|
||||
## Install Tesseract OCR ##
|
||||
ENV TESSERACT_VERSION=4.1.1
|
||||
ENV TESSERACT_VERSION=5.0.0
|
||||
RUN wget --no-check-certificate --quiet \
|
||||
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
|
||||
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \
|
||||
@ -61,37 +67,20 @@ RUN wget --no-check-certificate --quiet \
|
||||
pkg-config \
|
||||
zlib1g-dev \
|
||||
&& ./autogen.sh \
|
||||
&& ./configure \
|
||||
&& ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
|
||||
&& make \
|
||||
&& make install \
|
||||
&& ldconfig \
|
||||
&& cd - > /dev/null \
|
||||
&& rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
|
||||
|
||||
ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
|
||||
ENV TESSDATA_BEST_VERSION=4.1.0
|
||||
RUN wget --no-check-certificate --quiet \
|
||||
"https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
|
||||
&& tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
|
||||
&& for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \
|
||||
&& rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
|
||||
|
||||
|
||||
## Further dependencies ##
|
||||
RUN apt-get install --no-install-recommends --yes \
|
||||
procps \
|
||||
ghostscript \
|
||||
python3.7 \
|
||||
rename \
|
||||
zip
|
||||
|
||||
|
||||
## Install Pipeline ##
|
||||
COPY hocrtotei ocr /usr/local/bin/
|
||||
|
||||
|
||||
RUN rm -r /var/lib/apt/lists/*
|
||||
|
||||
|
||||
## Install Pipeline ##
|
||||
COPY hocr2tei hocr-combine ocr /usr/local/bin/
|
||||
|
||||
|
||||
ENTRYPOINT ["ocr"]
|
||||
CMD ["--help"]
|
||||
|
Reference in New Issue
Block a user