Compare commits


No commits in common. "master" and "1.0" have entirely different histories.
master ... 1.0

9 changed files with 755 additions and 968 deletions

.gitlab-ci.yml

@@ -1,68 +1,44 @@
-image: docker:19.03.13
+image: docker:stable
 services:
-  - docker:19.03.13-dind
+  - docker:stable-dind
+variables:
+  DOCKER_DRIVER: overlay2
 stages:
   - build
   - push
-variables:
-  DOCKER_TLS_CERTDIR: "/certs"
-  INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
-.reg_setup:
-  before_script:
-    - apk add --no-cache curl
-    - curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg
-    - echo "$REG_SHA256 /usr/local/bin/reg" | sha256sum -c -
-    - chmod a+x /usr/local/bin/reg
-  variables:
-    REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228
-    REG_VERSION: 0.16.1
-build_image:
-  script:
-    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
-    - docker build -t $INTERMEDIATE_IMAGE_TAG .
-    - docker push $INTERMEDIATE_IMAGE_TAG
+before_script:
+  - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+Build:
+  script:
+    - docker build --pull -t $CI_REGISTRY_IMAGE:tmp .
+    - docker push $CI_REGISTRY_IMAGE:tmp
   stage: build
   tags:
     - docker
-push_master:
-  extends:
-    - .reg_setup
+Push latest:
   only:
     - master
   script:
-    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
-    - docker pull $INTERMEDIATE_IMAGE_TAG
-    - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
-    - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
-    - docker push $IMAGE_TAG
+    - docker pull $CI_REGISTRY_IMAGE:tmp
+    - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest
+    - docker push $CI_REGISTRY_IMAGE:latest
   stage: push
   tags:
     - docker
-  variables:
-    IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
-push_other:
-  extends:
-    - .reg_setup
-  except:
-    - master
+Push tag:
   only:
-    - branches
     - tags
   script:
-    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
-    - docker pull $INTERMEDIATE_IMAGE_TAG
-    - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
-    - docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
-    - docker push $IMAGE_TAG
+    - docker pull $CI_REGISTRY_IMAGE:tmp
+    - docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
+    - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
   stage: push
   tags:
     - docker
-  variables:
-    IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME

Dockerfile

@@ -1,85 +1,73 @@
-FROM debian:buster-slim
+FROM debian:9-slim
-LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
+# Define image metadata
+LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
 ENV LANG=C.UTF-8
+# Install prerequisites
 RUN apt-get update \
-  && apt-get install --no-install-recommends --yes \
-      ghostscript \
-      procps \
-      python3.7 \
-      python3-pip \
-      rename \
+  && apt-get install -y --no-install-recommends \
+      apt-transport-https \
+      ca-certificates \
+      gnupg2 \
+      imagemagick \
+      poppler-utils \
+      python2.7 \
+      python3.5 \
       wget \
       zip \
-  && python3 -m pip install lxml
+  && rm -rf /var/lib/apt/lists/*
-# Install the OCR pipeline and it's dependencies
-## Install pyFlow ##
-ENV PYFLOW_VERSION=1.1.20
-RUN wget --no-check-certificate --quiet \
-      "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
-  && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
-  && cd "pyflow-${PYFLOW_VERSION}" \
-  && apt-get install --no-install-recommends --yes \
-      python2.7 \
-  && python2.7 setup.py build install \
-  && cd - > /dev/null \
-  && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
-## Install ocropy ##
-ENV OCROPY_VERSION=1.3.3
-RUN wget --no-check-certificate --quiet \
-      "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
-  && tar -xzf "v${OCROPY_VERSION}.tar.gz" \
+ENV OCROPY_VERSION 1.3.3
+ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" .
+RUN tar -xzf "v${OCROPY_VERSION}.tar.gz" \
   && cd "ocropy-${OCROPY_VERSION}" \
-  && apt-get install --no-install-recommends --yes \
-      python2.7 \
+  && apt-get update \
+  && apt-get install -y --no-install-recommends \
       python-pil \
       python-tk \
       $(cat PACKAGES) \
+  && rm -rf /var/lib/apt/lists/* \
   && python2.7 setup.py install \
-  && cd - > /dev/null \
-  && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
+  && cd .. \
+  && rm -rf \
+      "ocropy-${OCROPY_VERSION}" \
+      "v${OCROPY_VERSION}.tar.gz"
+ENV PYFLOW_VERSION=1.1.20
+ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" .
+RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
+  && cd "pyflow-${PYFLOW_VERSION}" \
+  && python2.7 setup.py build install \
+  && cd .. \
+  && rm -rf \
+      "pyflow-${PYFLOW_VERSION}" \
+      "pyflow-${PYFLOW_VERSION}.tar.gz"
+RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list \
+  && wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \
+  && apt-get update \
+  && apt-get install -y --no-install-recommends \
+      tesseract-ocr \
+      tesseract-ocr-deu \
+      tesseract-ocr-eng \
+      tesseract-ocr-enm \
+      tesseract-ocr-fra \
+      tesseract-ocr-frk \
+      tesseract-ocr-frm \
+      tesseract-ocr-ita \
+      tesseract-ocr-por \
+      tesseract-ocr-spa \
+  && rm -rf /var/lib/apt/lists/*
-## Install Tesseract OCR ##
-ENV TESSERACT_VERSION=5.0.0
-RUN wget --no-check-certificate --quiet \
-      "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
-  && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
-  && cd "tesseract-${TESSERACT_VERSION}" \
-  && apt-get install --no-install-recommends --yes \
-      autoconf \
-      automake \
-      g++ \
-      libjpeg62-turbo-dev \
-      libleptonica-dev \
-      libtiff5-dev \
-      libtool \
-      libpng-dev \
-      make \
-      pkg-config \
-      zlib1g-dev \
-  && ./autogen.sh \
-  && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
-  && make \
-  && make install \
-  && ldconfig \
-  && cd - > /dev/null \
-  && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
-RUN rm -r /var/lib/apt/lists/*
-## Install Pipeline ##
-COPY hocr2tei hocr-combine ocr /usr/local/bin/
+# Install OCR pipeline
+COPY hocrtotei /usr/local/bin
+COPY ocr /usr/local/bin
 ENTRYPOINT ["ocr"]
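Note on the two variants: the 1.0 image installs fixed tesseract-ocr language packages from the notesalexp repository, whereas the master image compiles Tesseract 5.0.0 from source and expects models to be supplied separately. Assuming an image has been built and tagged sfb1288inf/ocr:latest as in the README below (the tag is an example, not part of this diff), a quick sanity check of which language models ended up inside it is:

# list the traineddata models available to Tesseract inside the image
docker run --rm --entrypoint tesseract sfb1288inf/ocr:latest --list-langs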

LICENSE

@@ -1,21 +0,0 @@
MIT License

Copyright (c) 2021 Bielefeld University - CRC 1288 - INF

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

@@ -1,49 +1,96 @@
-# OCR - Optical Character Recognition
-This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
-## Software used in this pipeline implementation
-- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
-  - Software from Debian Buster's free repositories
-- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
-- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
-- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0
-## Installation
-1. Install Docker and Python
-2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
-3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr`
-4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
-5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
-6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
-## Use the Pipeline
-1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
-2. Clear your `/<my_data_location>/output` directory.
-3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
-```bash
-cd /<my_data_location>
-# <model_code> is the model filename without the ".traineddata" suffix
-ocr \
-  --input-dir input \
-  --output-dir output \
-  --model-file models/<model>
-  -m <model_code> <optional_pipeline_arguments>
-# More then one model
-ocr \
-  --input-dir input \
-  --output-dir output \
-  --model-file models/<model1>
-  --model-file models/<model2>
-  -m <model1_code>+<model2_code> <optional_pipeline_arguments>
-# Instead of multiple --model-file statements, you can also use
-ocr \
-  --input-dir input \
-  --output-dir output \
-  --model-file models/*
-  -m <model1_code>+<model2_code> <optional_pipeline_arguments>
-```
-4. Check your results in the `/<my_data_location>/output` directory.
+# OCR
+## Build image
+1. Clone this repository and navigate into it:
+```
+git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr
+```
+2. Build image:
+```
+docker build -t sfb1288inf/ocr:latest .
+```
+3. Alternatively build from the GitLab repository without cloning:
+1. Build image:
+```
+docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
+```
+## Download prebuilt image
+The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
+1. Download image:
+```
+docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest
+```
+## Run
+1. Create input and output directories for the OCR software:
+```
+mkdir -p /<mydatalocation>/files_for_ocr /<mydatalocation>/files_from_ocr
+```
+2. Place your files inside the `/<mydatalocation>/files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. Files should all contain text of the same language.
+3. Start the OCR process.
+```
+docker run \
+    --rm \
+    -it \
+    -u $(id -u $USER):$(id -g $USER) \
+    -v /<mydatalocation>/files_for_ocr:/input \
+    -v /<mydatalocation>/files_from_ocr:/output \
+    sfb1288inf/ocr:latest \
+        -i /input \
+        -l <languagecode> \
+        -o /output
+```
+The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part.
+If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`.
+4. Check your results in the `/<mydatalocation>/files_from_ocr` directory.
+### OCR arguments
+`-l languagecode`
+* Tells tesseract which language will be used.
+* options = deu (German), eng (English), enm (Middle englisch), fra (French), frk (German Fraktur), frm (Middle french), ita (Italian), por (Portuguese), spa (Spanish)
+* required = True
+`--keep-intermediates`
+* If set, all intermediate files created during the OCR process will be kept.
+* default = False
+* required = False
+`--nCores corenumber`
+* Sets the number of CPU cores being used during the OCR process.
+* default = min(4, multiprocessing.cpu_count())
+* required = False
+`--skip-binarisation`
+* Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
+* default = False
+Example with all arguments used:
+```
+docker run \
+    --rm \
+    -it \
+    -u $(id -u $USER):$(id -g $USER) \
+    -v "$HOME"/ocr/files_for_ocr:/input \
+    -v "$HOME"/ocr/files_from_ocr:/output \
+    sfb1288inf/ocr:latest \
+        -i /input \
+        -l eng \
+        -o /output \
+        --keep_intermediates \
+        --nCores 8 \
+        --skip-binarisation
+```

hocr-combine

@@ -1,44 +0,0 @@
#!/usr/bin/env python3.7
# coding=utf-8

''' Combine multiple hOCR files. '''

from argparse import ArgumentParser
from lxml import html

parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    nargs='+',
    required=True
)
parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
)
args = parser.parse_args()

input_files = []
for input_file in args.input_file:
    if input_file.startswith('@'):
        with open(input_file[1:], 'r') as f:
            input_files += [x for x in f.read().split("\n") if x != '']
    else:
        input_files.append(input_file)

if len(input_files) == 0:
    exit(1)

hocr = html.parse(input_files[0])
hocr_body = hocr.find('body')
for input_file in input_files[1:]:
    for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
        hocr_body.append(ocr_page)

with open(args.output_file, 'wb') as f:
    hocr.write(f, encoding='UTF-8', method='html')
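For context, the deleted hocr-combine helper simply appends the ocr_page divs of all further hOCR files to the first one. Going by the argparse interface above, a minimal invocation (file names are hypothetical) would be:

# combine two hOCR pages into one file
hocr-combine -i page-1.hocr page-2.hocr -o combined.hocr
# or list the input files in a text file and pass it with the "@" prefix handled above
hocr-combine -i @pages.txt -o combined.hocr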

hocr2tei

@@ -1,68 +0,0 @@
#!/usr/bin/env python3.7
# coding=utf-8

''' Convert hOCR to TEI XML. '''

from argparse import ArgumentParser
from lxml import html
from xml.sax.saxutils import escape
import re

parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    required=True
)
parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
)
args = parser.parse_args()

tei = ''
tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
tei += '  <teiHeader>\n'
tei += '    <fileDesc>\n'
tei += '      <titleStmt>\n'
tei += '        <title></title>\n'
tei += '      </titleStmt>\n'
tei += '      <publicationStmt>\n'
tei += '        <p></p>\n'
tei += '      </publicationStmt>\n'
tei += '      <sourceDesc>\n'
tei += '        <p></p>\n'
tei += '      </sourceDesc>\n'
tei += '    </fileDesc>\n'
tei += '  </teiHeader>\n'
tei += '  <text>\n'
tei += '    <body>\n'
hocr = html.parse(args.input_file)
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
    ocr_page_title_attrib = ocr_page.attrib.get('title')
    facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
    page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
    tei += f'      <pb facs="{facsimile}" n="{page_number}"/>\n'
    for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
        tei += '      <p>\n'
        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
            tei += '        <lb/>'
            is_first_word_in_line = True
            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
                if ocrx_word.text is not None:
                    if not is_first_word_in_line:
                        tei += ' '
                    tei += escape(ocrx_word.text)
                    is_first_word_in_line = False
            tei += '\n'
        tei += '      </p>\n'
tei += '    </body>\n'
tei += '  </text>\n'
tei += '</TEI>\n'

with open(args.output_file, 'w') as f:
    f.write(tei)
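Based on the argparse interface above, the deleted hocr2tei script turns one (typically combined) hOCR file into a bare TEI document; a sketch of its usage with hypothetical file names:

hocr2tei -i combined.hocr -o document.tei.xml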

hocrtotei (executable file)

@@ -0,0 +1,58 @@
#!/usr/bin/env python3.5
# coding=utf-8

from xml.sax.saxutils import escape
import argparse
import xml.etree.ElementTree as ET

parser = argparse.ArgumentParser(
    description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
)
parser.add_argument(
    'i',
    metavar='hOCR-sourcefile',
    nargs='+'
)
parser.add_argument(
    'o',
    metavar='TEI-destfile',
)
args = parser.parse_args()

output_file = open(args.o, 'w')
output_file.write(
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    + '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
    + '  <teiHeader>\n'
    + '    <fileDesc>\n'
    + '      <titleStmt/>\n'
    + '      <publicationStmt/>\n'
    + '      <sourceDesc/>\n'
    + '    </fileDesc>\n'
    + '    <encodingDesc/>\n'
    + '    <profileDesc/>\n'
    + '  </teiHeader>\n'
    + '  <text>\n'
    + '    <body>\n'
)
for index, input_file in enumerate(args.i):
    tree = ET.parse(input_file)
    output_file.write('      <pb n="%i"/>\n' % (index + 1))
    for para in tree.findall('.//*[@class="ocr_par"]'):
        output_file.write('      <p>\n')
        for line in para.findall('.//*[@class="ocr_line"]'):
            first_word_in_line = True
            for word in line.findall('.//*[@class="ocrx_word"]'):
                if word.text is not None:
                    output_file.write(('        ' if first_word_in_line else ' ') + escape(word.text.strip()))
                    first_word_in_line = False
            if not first_word_in_line:
                output_file.write('<lb/>\n')
        output_file.write('      </p>\n')
output_file.write(
    '    </body>\n'
    + '  </text>\n'
    + '</TEI>')
output_file.close()
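The new hocrtotei script replaces the -i/-o flags with positional arguments: any number of hOCR source files followed by one TEI destination file, merged in the order given. A minimal sketch with hypothetical file names:

hocrtotei page-1.hocr page-2.hocr page-3.hocr document.tei.xml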

ocr

File diff suppressed because it is too large.

wrapper/ocr

@@ -1,44 +1,39 @@
 #!/usr/bin/env python3
 # coding=utf-8
-from argparse import ArgumentParser
+import argparse
 import os
 import subprocess
-import sys
-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0'
-CONTAINER_INPUT_DIR = '/input'
-CONTAINER_OUTPUT_DIR = '/output'
-CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
-CONTAINER_LOG_DIR = '/logs'
-UID = str(os.getuid())
-GID = str(os.getgid())
-parser = ArgumentParser(add_help=False)
-parser.add_argument('-i', '--input-dir')
-parser.add_argument('-o', '--output-dir')
-parser.add_argument('-t', '--model-file', action='extend', nargs='+')
-parser.add_argument('--log-dir')
+container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest'
+container_input_dir = '/input'
+container_output_dir = '/output'
+uid = str(os.getuid())
+gid = str(os.getgid())
+parser = argparse.ArgumentParser(add_help=False)
+parser.add_argument(
+    '-i',
+    dest='input_dir',
+    required=False
+)
+parser.add_argument(
+    '-o',
+    dest='output_dir',
+    required=False
+)
 args, remaining_args = parser.parse_known_args()
-cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
+cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid]
 if args.input_dir is not None:
-    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
-    cmd += ['-v', mapping]
-    remaining_args += ['-i', CONTAINER_INPUT_DIR]
+    host_input_dir = os.path.abspath(args.input_dir)
+    cmd += ['-v', host_input_dir + ':' + container_input_dir]
+    remaining_args += ['-i', container_input_dir]
 if args.output_dir is not None:
-    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
-    cmd += ['-v', mapping]
-    remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
-if args.model_file is not None:
-    for model_file in args.model_file:
-        mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}'  # noqa
-        cmd += ['-v', mapping]
-if args.log_dir is not None:
-    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
-    cmd += ['-v', mapping]
-    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
-cmd.append(CONTAINER_IMAGE)
+    host_output_dir = os.path.abspath(args.output_dir)
+    cmd += ['-v', host_output_dir + ':' + container_output_dir]
+    remaining_args += ['-o', container_output_dir]
+cmd.append(container_image)
 cmd += remaining_args
-sys.exit(subprocess.run(cmd).returncode)
+subprocess.run(cmd)
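For illustration, running the 1.0 wrapper above as `ocr -i input -o output -l eng` would assemble roughly the following docker command (the user/group IDs and host paths are examples, and the trailing argument order follows how remaining_args is built up in the script):

docker run --rm -it -u 1000:1000 \
    -v /home/user/project/input:/input \
    -v /home/user/project/output:/output \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest \
    -l eng -i /input -o /output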