mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-26 05:24:18 +00:00
First work on version 1.0.0
This commit is contained in:
parent
07635dcdfa
commit
ca7df6d0ed
42
Dockerfile
42
Dockerfile
@ -1,7 +1,7 @@
|
|||||||
FROM debian:buster-slim
|
FROM debian:buster-slim
|
||||||
|
|
||||||
|
|
||||||
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>"
|
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>"
|
||||||
|
|
||||||
|
|
||||||
ENV LANG=C.UTF-8
|
ENV LANG=C.UTF-8
|
||||||
@ -16,26 +16,22 @@ ENV PYFLOW_RELEASE=1.1.20
|
|||||||
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
|
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
|
||||||
RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
|
RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
|
||||||
&& cd "pyflow-${PYFLOW_RELEASE}" \
|
&& cd "pyflow-${PYFLOW_RELEASE}" \
|
||||||
&& apt-get update \
|
|
||||||
&& apt-get install --no-install-recommends --yes \
|
&& apt-get install --no-install-recommends --yes \
|
||||||
python2.7 \
|
python2.7 \
|
||||||
&& rm -r /var/lib/apt/lists/* \
|
|
||||||
&& python2.7 setup.py build install \
|
&& python2.7 setup.py build install \
|
||||||
&& cd .. \
|
&& cd .. \
|
||||||
&& rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
|
&& rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
|
||||||
|
|
||||||
|
|
||||||
## Install ocropy ##
|
## Install ocropy ##
|
||||||
ENV OCROPY_RELEASE 1.3.3
|
ENV OCROPY_RELEASE=1.3.3
|
||||||
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
|
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
|
||||||
RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
|
RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
|
||||||
&& cd "ocropy-${OCROPY_RELEASE}" \
|
&& cd "ocropy-${OCROPY_RELEASE}" \
|
||||||
&& apt-get update \
|
|
||||||
&& apt-get install --no-install-recommends --yes \
|
&& apt-get install --no-install-recommends --yes \
|
||||||
python-pil \
|
python-pil \
|
||||||
python-tk \
|
python-tk \
|
||||||
$(cat PACKAGES) \
|
$(cat PACKAGES) \
|
||||||
&& rm -r /var/lib/apt/lists/* \
|
|
||||||
&& python2.7 setup.py install \
|
&& python2.7 setup.py install \
|
||||||
&& cd .. \
|
&& cd .. \
|
||||||
&& rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
|
&& rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
|
||||||
@ -46,7 +42,6 @@ ENV TESSERACT_RELEASE=4.1.1
|
|||||||
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
|
ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
|
||||||
RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
|
RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
|
||||||
&& cd "tesseract-${TESSERACT_RELEASE}" \
|
&& cd "tesseract-${TESSERACT_RELEASE}" \
|
||||||
&& apt-get update \
|
|
||||||
&& apt-get install --no-install-recommends --yes \
|
&& apt-get install --no-install-recommends --yes \
|
||||||
autoconf \
|
autoconf \
|
||||||
automake \
|
automake \
|
||||||
@ -59,7 +54,6 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
|
|||||||
make \
|
make \
|
||||||
pkg-config \
|
pkg-config \
|
||||||
zlib1g-dev \
|
zlib1g-dev \
|
||||||
&& rm -r /var/lib/apt/lists/* \
|
|
||||||
&& ./autogen.sh \
|
&& ./autogen.sh \
|
||||||
&& ./configure \
|
&& ./configure \
|
||||||
&& make \
|
&& make \
|
||||||
@ -67,30 +61,34 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
|
|||||||
&& ldconfig \
|
&& ldconfig \
|
||||||
&& cd - > /dev/null \
|
&& cd - > /dev/null \
|
||||||
&& rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
|
&& rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
|
||||||
ADD "https://github.com/tesseract-ocr/tessdata_best/raw/master/deu.traineddata" \
|
|
||||||
"https://github.com/tesseract-ocr/tessdata_best/raw/master/eng.traineddata" \
|
ENV TESSDATA_BEST_RELEASE=4.1.0
|
||||||
"https://github.com/tesseract-ocr/tessdata_best/raw/master/enm.traineddata" \
|
ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" .
|
||||||
"https://github.com/tesseract-ocr/tessdata_best/raw/master/fra.traineddata" \
|
RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \
|
||||||
"https://github.com/tesseract-ocr/tessdata_best/raw/master/frk.traineddata" \
|
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/deu.traineddata" "/usr/local/share/tessdata/" \
|
||||||
"https://github.com/tesseract-ocr/tessdata_best/raw/master/frm.traineddata" \
|
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/eng.traineddata" "/usr/local/share/tessdata/" \
|
||||||
"https://github.com/tesseract-ocr/tessdata_best/raw/master/ita.traineddata" \
|
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/enm.traineddata" "/usr/local/share/tessdata/" \
|
||||||
"https://github.com/tesseract-ocr/tessdata_best/raw/master/por.traineddata" \
|
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/fra.traineddata" "/usr/local/share/tessdata/" \
|
||||||
"https://github.com/tesseract-ocr/tessdata_best/raw/master/spa.traineddata" \
|
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frk.traineddata" "/usr/local/share/tessdata/" \
|
||||||
"/usr/local/share/tessdata/"
|
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frm.traineddata" "/usr/local/share/tessdata/" \
|
||||||
RUN chmod 644 /usr/local/share/tessdata/*.traineddata
|
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ita.traineddata" "/usr/local/share/tessdata/" \
|
||||||
|
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/por.traineddata" "/usr/local/share/tessdata/" \
|
||||||
|
&& mv "tessdata_best-${TESSDATA_BEST_RELEASE}/spa.traineddata" "/usr/local/share/tessdata/" \
|
||||||
|
&& rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz"
|
||||||
|
|
||||||
|
|
||||||
## Further dependencies ##
|
## Further dependencies ##
|
||||||
RUN apt-get update \
|
RUN apt-get install --no-install-recommends --yes \
|
||||||
&& apt-get install --no-install-recommends --yes \
|
|
||||||
ghostscript \
|
ghostscript \
|
||||||
python-pip \
|
python-pip \
|
||||||
python3.7 \
|
python3.7 \
|
||||||
zip \
|
zip \
|
||||||
&& rm -r /var/lib/apt/lists/* \
|
|
||||||
&& pip install natsort
|
&& pip install natsort
|
||||||
|
|
||||||
|
|
||||||
|
RUN rm -r /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|
||||||
## Install Pipeline ##
|
## Install Pipeline ##
|
||||||
COPY hocrtotei ocr /usr/local/bin/
|
COPY hocrtotei ocr /usr/local/bin/
|
||||||
|
|
||||||
|
85
README.md
85
README.md
@ -1,62 +1,49 @@
|
|||||||
# OCR
|
# OCR - Optical Character Recognition
|
||||||
|
|
||||||
## Build image
|
This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; for that purpose, a convenient wrapper script is provided.
|
||||||
|
|
||||||
1. Clone this repository and navigate into it:
|
## Software used in this pipeline implementation
|
||||||
```
|
- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
|
||||||
git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr
|
- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
|
||||||
|
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
|
||||||
|
- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
|
||||||
|
- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
|
||||||
|
|
||||||
|
|
||||||
|
## Use this image
|
||||||
|
|
||||||
|
1. Create input and output directories for the pipeline.
|
||||||
|
``` bash
|
||||||
|
mkdir -p /<my_data_location>/input /<my_data_location>/output
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Build image:
|
2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
|
||||||
```
|
|
||||||
docker build -t sfb1288inf/ocr:latest .
|
|
||||||
```
|
|
||||||
|
|
||||||
Alternatively build from the GitLab repository without cloning:
|
3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
|
||||||
|
|
||||||
1. Build image:
|
|
||||||
```
|
|
||||||
docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
|
|
||||||
```
|
```
|
||||||
|
# Option one: Use the wrapper script
|
||||||
|
## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/1.0.0/wrapper/ocr, make it executable and add it to your ${PATH}
|
||||||
|
cd /<my_data_location>
|
||||||
|
ocr -i input -l <language_code> -o output <pipeline_arguments>
|
||||||
|
|
||||||
## Download prebuilt image
|
# Option two: Classic Docker style
|
||||||
|
|
||||||
The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
|
|
||||||
|
|
||||||
1. Download image:
|
|
||||||
```
|
|
||||||
docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest
|
|
||||||
```
|
|
||||||
|
|
||||||
## Run
|
|
||||||
|
|
||||||
1. Create input and output directories for the OCR software:
|
|
||||||
```
|
|
||||||
mkdir -p /<mydatalocation>/files_for_ocr /<mydatalocation>/files_from_ocr
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Place your files inside the `/<mydatalocation>/files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. Files should all contain text of the same language.
|
|
||||||
|
|
||||||
3. Start the OCR process.
|
|
||||||
```
|
|
||||||
docker run \
|
docker run \
|
||||||
--rm \
|
--rm \
|
||||||
-it \
|
-it \
|
||||||
-u $(id -u $USER):$(id -g $USER) \
|
-u $(id -u $USER):$(id -g $USER) \
|
||||||
-v /<mydatalocation>/files_for_ocr:/input \
|
-v /<my_data_location>/input:/input \
|
||||||
-v /<mydatalocation>/files_from_ocr:/output \
|
-v /<my_data_location>/output:/output \
|
||||||
sfb1288inf/ocr:latest \
|
gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
|
||||||
-i /input \
|
-i /input \
|
||||||
-l <languagecode> \
|
-l <language_code> \
|
||||||
-o /output
|
-o /output \
|
||||||
|
<optional_pipeline_arguments>
|
||||||
```
|
```
|
||||||
The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part.
|
|
||||||
|
|
||||||
If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`.
|
4. Check your results in the `/<my_data_location>/output` directory.
|
||||||
|
```
|
||||||
|
|
||||||
4. Check your results in the `/<mydatalocation>/files_from_ocr` directory.
|
### Pipeline arguments
|
||||||
|
|
||||||
### OCR arguments
|
|
||||||
|
|
||||||
`-l languagecode`
|
`-l languagecode`
|
||||||
* Tells tesseract which language will be used.
|
* Tells tesseract which language will be used.
|
||||||
@ -78,15 +65,15 @@ kept.
|
|||||||
* Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
|
* Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
|
||||||
* default = False
|
* default = False
|
||||||
|
|
||||||
Example with all arguments used:
|
``` bash
|
||||||
```
|
# Example with all arguments used
|
||||||
docker run \
|
docker run \
|
||||||
--rm \
|
--rm \
|
||||||
-it \
|
-it \
|
||||||
-u $(id -u $USER):$(id -g $USER) \
|
-u $(id -u $USER):$(id -g $USER) \
|
||||||
-v "$HOME"/ocr/files_for_ocr:/input \
|
-v "$HOME"/ocr/input:/input \
|
||||||
-v "$HOME"/ocr/files_from_ocr:/output \
|
-v "$HOME"/ocr/output:/output \
|
||||||
sfb1288inf/ocr:latest \
|
gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
|
||||||
-i /input \
|
-i /input \
|
||||||
-l eng \
|
-l eng \
|
||||||
-o /output \
|
-o /output \
|
||||||
|
@ -1,11 +1,13 @@
|
|||||||
#!/usr/bin/env python3.7
|
#!/usr/bin/env python3.7
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
""""Merges hOCR files into a TEI file."""
|
||||||
|
|
||||||
from xml.sax.saxutils import escape
|
from xml.sax.saxutils import escape
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
parser = ArgumentParser(description='Merges hOCR files to one P5 file.')
|
parser = ArgumentParser(description='Merges hOCR files into a TEI file.')
|
||||||
parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+')
|
parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+')
|
||||||
parser.add_argument('o', metavar='TEI-destfile',)
|
parser.add_argument('o', metavar='TEI-destfile',)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
17
ocr
17
ocr
@ -1,13 +1,10 @@
|
|||||||
#!/usr/bin/env python2.7
|
#!/usr/bin/env python2.7
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
"""
|
"""An OCR pipeline for PDF file processing."""
|
||||||
ocr
|
|
||||||
|
|
||||||
Usage: For usage instructions run with option --help
|
__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>'
|
||||||
Authors: Patrick Jentsch <p.jentsch@uni-bielefeld.de
|
__version__ = '1.0.0'
|
||||||
Stephan Porada <sporada@uni-bielefeld.de>
|
|
||||||
"""
|
|
||||||
|
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from natsort import natsorted
|
from natsort import natsorted
|
||||||
@ -22,7 +19,10 @@ TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa
|
|||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = ArgumentParser(description='OCR Pipeline utilizing tesseract.')
|
parser = ArgumentParser(
|
||||||
|
description='An OCR pipeline for PDF file processing.',
|
||||||
|
prog='OCR pipeline'
|
||||||
|
)
|
||||||
parser.add_argument('-i', '--input-directory',
|
parser.add_argument('-i', '--input-directory',
|
||||||
help='Input directory (only PDF files get processed)',
|
help='Input directory (only PDF files get processed)',
|
||||||
required=True)
|
required=True)
|
||||||
@ -45,6 +45,9 @@ def parse_args():
|
|||||||
help='Zips all results in different archives depending'
|
help='Zips all results in different archives depending'
|
||||||
' on result types. Also zips everything into one '
|
' on result types. Also zips everything into one '
|
||||||
'archive.')
|
'archive.')
|
||||||
|
parser.add_argument('-v', '--version',
|
||||||
|
action='version',
|
||||||
|
version='%(prog)s {}'.format(__version__))
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,11 +1,14 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
"""A wrapper to execute the OCR pipeline in a Docker container"""
|
||||||
|
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest'
|
CONTAINER_IMAGE_TAG = '1.0.0'
|
||||||
|
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:{}'.format(CONTAINER_IMAGE_TAG) # noqa
|
||||||
CONTAINER_INPUT_DIR = '/input'
|
CONTAINER_INPUT_DIR = '/input'
|
||||||
CONTAINER_INTERMEDIATE_DIR = '/intermediate'
|
CONTAINER_INTERMEDIATE_DIR = '/intermediate'
|
||||||
CONTAINER_OUTPUT_DIR = '/output'
|
CONTAINER_OUTPUT_DIR = '/output'
|
||||||
|
Loading…
Reference in New Issue
Block a user