Update to Tesseract 5.0.0, Set version 0.1.0

This commit is contained in:
Patrick Jentsch
2022-01-04 11:42:55 +01:00
parent a0760487ae
commit e1b78b6ba4
7 changed files with 574 additions and 338 deletions

View File

@ -9,8 +9,14 @@ ENV LANG=C.UTF-8
RUN apt-get update \
&& apt-get install --no-install-recommends --yes \
wget
ghostscript \
procps \
python3.7 \
python3-pip \
rename \
wget \
zip \
&& python3 -m pip install lxml
# Install the OCR pipeline and its dependencies #
## Install pyFlow ##
@ -43,7 +49,7 @@ RUN wget --no-check-certificate --quiet \
## Install Tesseract OCR ##
ENV TESSERACT_VERSION=4.1.1
ENV TESSERACT_VERSION=5.0.0
RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \
@ -61,37 +67,20 @@ RUN wget --no-check-certificate --quiet \
pkg-config \
zlib1g-dev \
&& ./autogen.sh \
&& ./configure \
&& ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
&& make \
&& make install \
&& ldconfig \
&& cd - > /dev/null \
&& rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
# Comma-separated list of the tessdata_best models to install; the RUN step
# below splits this value on "," and installs one .traineddata file per entry.
ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
# Release of the tesseract-ocr/tessdata_best repository to download.
ENV TESSDATA_BEST_VERSION=4.1.0
# Download and unpack the tessdata_best release archive, move the
# "<model>.traineddata" file of every model named in TESSERACT_MODELS into
# /usr/local/share/tessdata/, then delete the unpacked tree and the archive
# within this same RUN step.
# NOTE(review): --no-check-certificate turns off TLS certificate verification
# for the download, and no checksum of the archive is verified here.
RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
&& tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
&& for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \
&& rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
## Further dependencies ##
RUN apt-get install --no-install-recommends --yes \
procps \
ghostscript \
python3.7 \
rename \
zip
## Install Pipeline ##
COPY hocrtotei ocr /usr/local/bin/
# Delete the apt package index files.
# NOTE(review): this is a separate RUN from the one that calls
# `apt-get update`, so the lists presumably still occupy space in that earlier
# layer; confirm whether the cleanup should move into the installing RUN.
RUN rm -r /var/lib/apt/lists/*
## Install Pipeline ##
COPY hocr2tei hocr-combine ocr /usr/local/bin/
# Start the `ocr` executable as the container's process (exec form, no shell).
# `--help` is only the default argument; arguments passed to `docker run`
# replace it.
ENTRYPOINT ["ocr"]
CMD ["--help"]