Compare commits

..

No commits in common. "master" and "1.0" have entirely different histories.
master ... 1.0

9 changed files with 755 additions and 968 deletions

View File

@ -1,68 +1,44 @@
image: docker:19.03.13
image: docker:stable
services:
- docker:19.03.13-dind
- docker:stable-dind
variables:
DOCKER_DRIVER: overlay2
stages:
- build
- push
variables:
DOCKER_TLS_CERTDIR: "/certs"
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
before_script:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
.reg_setup:
before_script:
- apk add --no-cache curl
- curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg
- echo "$REG_SHA256 /usr/local/bin/reg" | sha256sum -c -
- chmod a+x /usr/local/bin/reg
variables:
REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228
REG_VERSION: 0.16.1
build_image:
Build:
script:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
- docker build -t $INTERMEDIATE_IMAGE_TAG .
- docker push $INTERMEDIATE_IMAGE_TAG
- docker build --pull -t $CI_REGISTRY_IMAGE:tmp .
- docker push $CI_REGISTRY_IMAGE:tmp
stage: build
tags:
- docker
- docker
push_master:
extends:
- .reg_setup
Push latest:
only:
- master
script:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
- docker pull $INTERMEDIATE_IMAGE_TAG
- /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
- docker push $IMAGE_TAG
- docker pull $CI_REGISTRY_IMAGE:tmp
- docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest
- docker push $CI_REGISTRY_IMAGE:latest
stage: push
tags:
- docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
- docker
push_other:
extends:
- .reg_setup
except:
- master
Push tag:
only:
- branches
- tags
script:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
- docker pull $INTERMEDIATE_IMAGE_TAG
- /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
- docker push $IMAGE_TAG
- docker pull $CI_REGISTRY_IMAGE:tmp
- docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
stage: push
tags:
- docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
- docker

View File

@ -1,85 +1,73 @@
FROM debian:buster-slim
FROM debian:9-slim
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
# Define image metadata
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
ENV LANG=C.UTF-8
# Install prerequisites
RUN apt-get update \
&& apt-get install --no-install-recommends --yes \
ghostscript \
procps \
python3.7 \
python3-pip \
rename \
&& apt-get install -y --no-install-recommends \
apt-transport-https \
ca-certificates \
gnupg2 \
imagemagick \
poppler-utils \
python2.7 \
python3.5 \
wget \
zip \
&& python3 -m pip install lxml
&& rm -rf /var/lib/apt/lists/*
# Install the OCR pipeline and it's dependencies #
## Install pyFlow ##
ENV PYFLOW_VERSION=1.1.20
RUN wget --no-check-certificate --quiet \
"https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
&& cd "pyflow-${PYFLOW_VERSION}" \
&& apt-get install --no-install-recommends --yes \
python2.7 \
&& python2.7 setup.py build install \
&& cd - > /dev/null \
&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
## Install ocropy ##
ENV OCROPY_VERSION=1.3.3
RUN wget --no-check-certificate --quiet \
"https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
&& tar -xzf "v${OCROPY_VERSION}.tar.gz" \
ENV OCROPY_VERSION 1.3.3
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" .
RUN tar -xzf "v${OCROPY_VERSION}.tar.gz" \
&& cd "ocropy-${OCROPY_VERSION}" \
&& apt-get install --no-install-recommends --yes \
python2.7 \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
python-pil \
python-tk \
$(cat PACKAGES) \
&& rm -rf /var/lib/apt/lists/* \
&& python2.7 setup.py install \
&& cd - > /dev/null \
&& rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
&& cd .. \
&& rm -rf \
"ocropy-${OCROPY_VERSION}" \
"v${OCROPY_VERSION}.tar.gz"
ENV PYFLOW_VERSION=1.1.20
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" .
RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
&& cd "pyflow-${PYFLOW_VERSION}" \
&& python2.7 setup.py build install \
&& cd .. \
&& rm -rf \
"pyflow-${PYFLOW_VERSION}" \
"pyflow-${PYFLOW_VERSION}.tar.gz"
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list \
&& wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
tesseract-ocr \
tesseract-ocr-deu \
tesseract-ocr-eng \
tesseract-ocr-enm \
tesseract-ocr-fra \
tesseract-ocr-frk \
tesseract-ocr-frm \
tesseract-ocr-ita \
tesseract-ocr-por \
tesseract-ocr-spa \
&& rm -rf /var/lib/apt/lists/*
## Install Tesseract OCR ##
ENV TESSERACT_VERSION=5.0.0
RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \
&& cd "tesseract-${TESSERACT_VERSION}" \
&& apt-get install --no-install-recommends --yes \
autoconf \
automake \
g++ \
libjpeg62-turbo-dev \
libleptonica-dev \
libtiff5-dev \
libtool \
libpng-dev \
make \
pkg-config \
zlib1g-dev \
&& ./autogen.sh \
&& ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
&& make \
&& make install \
&& ldconfig \
&& cd - > /dev/null \
&& rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
RUN rm -r /var/lib/apt/lists/*
## Install Pipeline ##
COPY hocr2tei hocr-combine ocr /usr/local/bin/
# Install OCR pipeline
COPY hocrtotei /usr/local/bin
COPY ocr /usr/local/bin
ENTRYPOINT ["ocr"]

21
LICENSE
View File

@ -1,21 +0,0 @@
MIT License
Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

139
README.md
View File

@ -1,49 +1,96 @@
# OCR - Optical Character Recognition
# OCR
This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
## Build image
## Software used in this pipeline implementation
- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
- Software from Debian Buster's free repositories
- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0
## Installation
1. Install Docker and Python 3.
2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr`
4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
## Use the Pipeline
1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
2. Clear your `/<my_data_location>/output` directory.
3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
```bash
cd /<my_data_location>
# <model_code> is the model filename without the ".traineddata" suffix
ocr \
--input-dir input \
--output-dir output \
--model-file models/<model>
-m <model_code> <optional_pipeline_arguments>
# More then one model
ocr \
--input-dir input \
--output-dir output \
--model-file models/<model1>
--model-file models/<model2>
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
# Instead of multiple --model-file statements, you can also use
ocr \
--input-dir input \
--output-dir output \
--model-file models/*
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
1. Clone this repository and navigate into it:
```
git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr
```
2. Build image:
```
docker build -t sfb1288inf/ocr:latest .
```
Alternatively build from the GitLab repository without cloning:
1. Build image:
```
docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
```
## Download prebuilt image
The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
1. Download image:
```
docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest
```
## Run
1. Create input and output directories for the OCR software:
```
mkdir -p /<mydatalocation>/files_for_ocr /<mydatalocation>/files_from_ocr
```
2. Place your files inside the `/<mydatalocation>/files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. Files should all contain text of the same language.
3. Start the OCR process.
```
docker run \
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v /<mydatalocation>/files_for_ocr:/input \
-v /<mydatalocation>/files_from_ocr:/output \
sfb1288inf/ocr:latest \
-i /input \
-l <languagecode> \
-o /output
```
The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part.
If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`.
4. Check your results in the `/<mydatalocation>/files_from_ocr` directory.
### OCR arguments
`-l languagecode`
* Tells tesseract which language will be used.
* options = deu (German), eng (English), enm (Middle englisch), fra (French), frk (German Fraktur), frm (Middle french), ita (Italian), por (Portuguese), spa (Spanish)
* required = True
`--keep-intermediates`
* If set, all intermediate files created during the OCR process will be
kept.
* default = False
* required = False
`--nCores corenumber`
* Sets the number of CPU cores being used during the OCR process.
* default = min(4, multiprocessing.cpu_count())
* required = False
`--skip-binarisation`
* Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
* default = False
Example with all arguments used:
```
docker run \
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v "$HOME"/ocr/files_for_ocr:/input \
-v "$HOME"/ocr/files_from_ocr:/output \
sfb1288inf/ocr:latest \
-i /input \
-l eng \
-o /output \
--keep_intermediates \
--nCores 8 \
--skip-binarisation
```
4. Check your results in the `/<my_data_location>/output` directory.

View File

@ -1,44 +0,0 @@
#!/usr/bin/env python3.7
# coding=utf-8
''' Combine multiple hOCR files. '''
from argparse import ArgumentParser
from lxml import html
parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument(
'-i', '--input-file',
help='Input file',
nargs='+',
required=True
)
parser.add_argument(
'-o', '--output-file',
help='Output file',
required=True
)
args = parser.parse_args()
for input_file in args.input_file:
input_files = []
if input_file.startswith('@'):
with open(input_file[1:], 'r') as f:
input_files += [x for x in f.read().split("\n") if x != '']
else:
input_files.append(input_file)
if len(input_files) == 0:
exit(1)
hocr = html.parse(input_files[0])
hocr_body = hocr.find('body')
for input_file in input_files[1:]:
for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
hocr_body.append(ocr_page)
with open(args.output_file, 'wb') as f:
hocr.write(f, encoding='UTF-8', method='html')

View File

@ -1,68 +0,0 @@
#!/usr/bin/env python3.7
# coding=utf-8
''' Convert hOCR to TEI XML. '''
from argparse import ArgumentParser
from lxml import html
from xml.sax.saxutils import escape
import re
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument(
'-i', '--input-file',
help='Input file',
required=True
)
parser.add_argument(
'-o', '--output-file',
help='Output file',
required=True
)
args = parser.parse_args()
tei = ''
tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
tei += ' <teiHeader>\n'
tei += ' <fileDesc>\n'
tei += ' <titleStmt>\n'
tei += ' <title></title>\n'
tei += ' </titleStmt>\n'
tei += ' <publicationStmt>\n'
tei += ' <p></p>\n'
tei += ' </publicationStmt>\n'
tei += ' <sourceDesc>\n'
tei += ' <p></p>\n'
tei += ' </sourceDesc>\n'
tei += ' </fileDesc>\n'
tei += ' </teiHeader>\n'
tei += ' <text>\n'
tei += ' <body>\n'
hocr = html.parse(args.input_file)
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
ocr_page_title_attrib = ocr_page.attrib.get('title')
facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
tei += f' <pb facs="{facsimile}" n="{page_number}"/>\n'
for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
tei += ' <p>\n'
for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
tei += ' <lb/>'
is_first_word_in_line = True
for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
if ocrx_word.text is not None:
if not is_first_word_in_line:
tei += ' '
tei += escape(ocrx_word.text)
is_first_word_in_line = False
tei += '\n'
tei += ' </p>\n'
tei += ' </body>\n'
tei += ' </text>\n'
tei += '</TEI>\n'
with open(args.output_file, 'w') as f:
f.write(tei)

58
hocrtotei Executable file
View File

@ -0,0 +1,58 @@
#!/usr/bin/env python3.5
# coding=utf-8
from xml.sax.saxutils import escape
import argparse
import xml.etree.ElementTree as ET
parser = argparse.ArgumentParser(
description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
)
parser.add_argument(
'i',
metavar='hOCR-sourcefile',
nargs='+'
)
parser.add_argument(
'o',
metavar='TEI-destfile',
)
args = parser.parse_args()
output_file = open(args.o, 'w')
output_file.write(
'<?xml version="1.0" encoding="UTF-8"?>\n'
+ '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
+ ' <teiHeader>\n'
+ ' <fileDesc>\n'
+ ' <titleStmt/>\n'
+ ' <publicationStmt/>\n'
+ ' <sourceDesc/>\n'
+ ' </fileDesc>\n'
+ ' <encodingDesc/>\n'
+ ' <profileDesc/>\n'
+ ' </teiHeader>\n'
+ ' <text>\n'
+ ' <body>\n'
)
for index, input_file in enumerate(args.i):
tree = ET.parse(input_file)
output_file.write(' <pb n="%i"/>\n' % (index + 1))
for para in tree.findall('.//*[@class="ocr_par"]'):
output_file.write(' <p>\n')
for line in para.findall('.//*[@class="ocr_line"]'):
first_word_in_line = True
for word in line.findall('.//*[@class="ocrx_word"]'):
if word.text is not None:
output_file.write((' ' if first_word_in_line else ' ') + escape(word.text.strip()))
first_word_in_line = False
if not first_word_in_line:
output_file.write('<lb/>\n')
output_file.write(' </p>\n')
output_file.write(
' </body>\n'
+ ' </text>\n'
+ '</TEI>')
output_file.close()

1154
ocr

File diff suppressed because it is too large Load Diff

View File

@ -1,44 +1,39 @@
#!/usr/bin/env python3
# coding=utf-8
from argparse import ArgumentParser
import argparse
import os
import subprocess
import sys
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0'
CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output'
CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
CONTAINER_LOG_DIR = '/logs'
UID = str(os.getuid())
GID = str(os.getgid())
container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest'
container_input_dir = '/input'
container_output_dir = '/output'
uid = str(os.getuid())
gid = str(os.getgid())
parser = ArgumentParser(add_help=False)
parser.add_argument('-i', '--input-dir')
parser.add_argument('-o', '--output-dir')
parser.add_argument('-t', '--model-file', action='extend', nargs='+')
parser.add_argument('--log-dir')
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument(
'-i',
dest='input_dir',
required=False
)
parser.add_argument(
'-o',
dest='output_dir',
required=False
)
args, remaining_args = parser.parse_known_args()
cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid]
if args.input_dir is not None:
mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
cmd += ['-v', mapping]
remaining_args += ['-i', CONTAINER_INPUT_DIR]
host_input_dir = os.path.abspath(args.input_dir)
cmd += ['-v', host_input_dir + ':' + container_input_dir]
remaining_args += ['-i', container_input_dir]
if args.output_dir is not None:
mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
cmd += ['-v', mapping]
remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
if args.model_file is not None:
for model_file in args.model_file:
mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}' # noqa
cmd += ['-v', mapping]
if args.log_dir is not None:
mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
cmd += ['-v', mapping]
remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
cmd.append(CONTAINER_IMAGE)
host_output_dir = os.path.abspath(args.output_dir)
cmd += ['-v', host_output_dir + ':' + container_output_dir]
remaining_args += ['-o', container_output_dir]
cmd.append(container_image)
cmd += remaining_args
sys.exit(subprocess.run(cmd).returncode)
subprocess.run(cmd)