mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-04-05 11:24:21 +00:00
Compare commits
44 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
ca1803ab8a | ||
|
4518ca1c83 | ||
|
aeab9b7802 | ||
|
00c4b17018 | ||
|
c057d324cf | ||
|
f51a8c4546 | ||
|
c640d9743f | ||
|
e3fd679b38 | ||
|
8a3816121c | ||
|
e1b78b6ba4 | ||
|
a0760487ae | ||
|
a798457c43 | ||
|
e2da0fb839 | ||
|
e78f667438 | ||
|
41f70da8eb | ||
|
6db7f70446 | ||
|
947658a7d8 | ||
|
acbf61be05 | ||
|
104598039e | ||
|
da29659a9b | ||
|
613bceb4ff | ||
|
ca7df6d0ed | ||
|
07635dcdfa | ||
|
c0069d5453 | ||
|
e941f64ee4 | ||
|
cb68d6de2d | ||
|
4b84488fe6 | ||
|
7d52ad9f68 | ||
|
ac4b5c2fd8 | ||
|
6d90d43699 | ||
|
4bd0d3bb01 | ||
|
15061bfaaf | ||
|
7cc8ebd666 | ||
|
82285a8e6c | ||
|
7322a5bc7c | ||
|
2b63ba9e59 | ||
|
aee9628e5e | ||
|
ec5b4eb521 | ||
|
b77ca5914f | ||
|
018939ae55 | ||
|
64fe706126 | ||
|
a75b32ca1d | ||
|
364e3d626d | ||
|
36a86887b0 |
@ -1,44 +1,68 @@
|
|||||||
image: docker:stable
|
image: docker:19.03.13
|
||||||
|
|
||||||
services:
|
services:
|
||||||
- docker:stable-dind
|
- docker:19.03.13-dind
|
||||||
|
|
||||||
variables:
|
|
||||||
DOCKER_DRIVER: overlay2
|
|
||||||
|
|
||||||
stages:
|
stages:
|
||||||
- build
|
- build
|
||||||
- push
|
- push
|
||||||
|
|
||||||
before_script:
|
variables:
|
||||||
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
|
DOCKER_TLS_CERTDIR: "/certs"
|
||||||
|
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
|
||||||
|
|
||||||
Build:
|
.reg_setup:
|
||||||
|
before_script:
|
||||||
|
- apk add --no-cache curl
|
||||||
|
- curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg
|
||||||
|
- echo "$REG_SHA256 /usr/local/bin/reg" | sha256sum -c -
|
||||||
|
- chmod a+x /usr/local/bin/reg
|
||||||
|
variables:
|
||||||
|
REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228
|
||||||
|
REG_VERSION: 0.16.1
|
||||||
|
|
||||||
|
build_image:
|
||||||
script:
|
script:
|
||||||
- docker build --pull -t $CI_REGISTRY_IMAGE:tmp .
|
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
|
||||||
- docker push $CI_REGISTRY_IMAGE:tmp
|
- docker build -t $INTERMEDIATE_IMAGE_TAG .
|
||||||
|
- docker push $INTERMEDIATE_IMAGE_TAG
|
||||||
stage: build
|
stage: build
|
||||||
tags:
|
tags:
|
||||||
- docker
|
- docker
|
||||||
|
|
||||||
Push latest:
|
push_master:
|
||||||
|
extends:
|
||||||
|
- .reg_setup
|
||||||
only:
|
only:
|
||||||
- master
|
- master
|
||||||
script:
|
script:
|
||||||
- docker pull $CI_REGISTRY_IMAGE:tmp
|
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
|
||||||
- docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest
|
- docker pull $INTERMEDIATE_IMAGE_TAG
|
||||||
- docker push $CI_REGISTRY_IMAGE:latest
|
- /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
|
||||||
|
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
|
||||||
|
- docker push $IMAGE_TAG
|
||||||
stage: push
|
stage: push
|
||||||
tags:
|
tags:
|
||||||
- docker
|
- docker
|
||||||
|
variables:
|
||||||
|
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
|
||||||
|
|
||||||
Push tag:
|
push_other:
|
||||||
|
extends:
|
||||||
|
- .reg_setup
|
||||||
|
except:
|
||||||
|
- master
|
||||||
only:
|
only:
|
||||||
|
- branches
|
||||||
- tags
|
- tags
|
||||||
script:
|
script:
|
||||||
- docker pull $CI_REGISTRY_IMAGE:tmp
|
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
|
||||||
- docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
|
- docker pull $INTERMEDIATE_IMAGE_TAG
|
||||||
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
|
- /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
|
||||||
|
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
|
||||||
|
- docker push $IMAGE_TAG
|
||||||
stage: push
|
stage: push
|
||||||
tags:
|
tags:
|
||||||
- docker
|
- docker
|
||||||
|
variables:
|
||||||
|
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
|
||||||
|
116
Dockerfile
116
Dockerfile
@ -1,73 +1,85 @@
|
|||||||
FROM debian:9-slim
|
FROM debian:buster-slim
|
||||||
|
|
||||||
|
|
||||||
# Define image metadata
|
LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
|
||||||
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
|
|
||||||
|
|
||||||
|
|
||||||
ENV LANG=C.UTF-8
|
ENV LANG=C.UTF-8
|
||||||
|
|
||||||
|
|
||||||
# Install prerequisites
|
|
||||||
RUN apt-get update \
|
RUN apt-get update \
|
||||||
&& apt-get install -y --no-install-recommends \
|
&& apt-get install --no-install-recommends --yes \
|
||||||
apt-transport-https \
|
ghostscript \
|
||||||
ca-certificates \
|
procps \
|
||||||
gnupg2 \
|
python3.7 \
|
||||||
imagemagick \
|
python3-pip \
|
||||||
poppler-utils \
|
rename \
|
||||||
python2.7 \
|
|
||||||
python3.5 \
|
|
||||||
wget \
|
wget \
|
||||||
zip \
|
zip \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& python3 -m pip install lxml
|
||||||
|
|
||||||
ENV OCROPY_VERSION 1.3.3
|
# Install the OCR pipeline and its dependencies #
|
||||||
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" .
|
## Install pyFlow ##
|
||||||
RUN tar -xzf "v${OCROPY_VERSION}.tar.gz" \
|
ENV PYFLOW_VERSION=1.1.20
|
||||||
|
RUN wget --no-check-certificate --quiet \
|
||||||
|
"https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
|
||||||
|
&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
|
||||||
|
&& cd "pyflow-${PYFLOW_VERSION}" \
|
||||||
|
&& apt-get install --no-install-recommends --yes \
|
||||||
|
python2.7 \
|
||||||
|
&& python2.7 setup.py build install \
|
||||||
|
&& cd - > /dev/null \
|
||||||
|
&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
|
||||||
|
|
||||||
|
|
||||||
|
## Install ocropy ##
|
||||||
|
ENV OCROPY_VERSION=1.3.3
|
||||||
|
RUN wget --no-check-certificate --quiet \
|
||||||
|
"https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
|
||||||
|
&& tar -xzf "v${OCROPY_VERSION}.tar.gz" \
|
||||||
&& cd "ocropy-${OCROPY_VERSION}" \
|
&& cd "ocropy-${OCROPY_VERSION}" \
|
||||||
&& apt-get update \
|
&& apt-get install --no-install-recommends --yes \
|
||||||
&& apt-get install -y --no-install-recommends \
|
python2.7 \
|
||||||
python-pil \
|
python-pil \
|
||||||
python-tk \
|
python-tk \
|
||||||
$(cat PACKAGES) \
|
$(cat PACKAGES) \
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
|
||||||
&& python2.7 setup.py install \
|
&& python2.7 setup.py install \
|
||||||
&& cd .. \
|
&& cd - > /dev/null \
|
||||||
&& rm -rf \
|
&& rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
|
||||||
"ocropy-${OCROPY_VERSION}" \
|
|
||||||
"v${OCROPY_VERSION}.tar.gz"
|
|
||||||
|
|
||||||
ENV PYFLOW_VERSION=1.1.20
|
|
||||||
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" .
|
|
||||||
RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
|
|
||||||
&& cd "pyflow-${PYFLOW_VERSION}" \
|
|
||||||
&& python2.7 setup.py build install \
|
|
||||||
&& cd .. \
|
|
||||||
&& rm -rf \
|
|
||||||
"pyflow-${PYFLOW_VERSION}" \
|
|
||||||
"pyflow-${PYFLOW_VERSION}.tar.gz"
|
|
||||||
|
|
||||||
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list \
|
|
||||||
&& wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \
|
|
||||||
&& apt-get update \
|
|
||||||
&& apt-get install -y --no-install-recommends \
|
|
||||||
tesseract-ocr \
|
|
||||||
tesseract-ocr-deu \
|
|
||||||
tesseract-ocr-eng \
|
|
||||||
tesseract-ocr-enm \
|
|
||||||
tesseract-ocr-fra \
|
|
||||||
tesseract-ocr-frk \
|
|
||||||
tesseract-ocr-frm \
|
|
||||||
tesseract-ocr-ita \
|
|
||||||
tesseract-ocr-por \
|
|
||||||
tesseract-ocr-spa \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
|
|
||||||
# Install OCR pipeline
|
## Install Tesseract OCR ##
|
||||||
COPY hocrtotei /usr/local/bin
|
ENV TESSERACT_VERSION=5.0.0
|
||||||
COPY ocr /usr/local/bin
|
RUN wget --no-check-certificate --quiet \
|
||||||
|
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
|
||||||
|
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \
|
||||||
|
&& cd "tesseract-${TESSERACT_VERSION}" \
|
||||||
|
&& apt-get install --no-install-recommends --yes \
|
||||||
|
autoconf \
|
||||||
|
automake \
|
||||||
|
g++ \
|
||||||
|
libjpeg62-turbo-dev \
|
||||||
|
libleptonica-dev \
|
||||||
|
libtiff5-dev \
|
||||||
|
libtool \
|
||||||
|
libpng-dev \
|
||||||
|
make \
|
||||||
|
pkg-config \
|
||||||
|
zlib1g-dev \
|
||||||
|
&& ./autogen.sh \
|
||||||
|
&& ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
|
||||||
|
&& make \
|
||||||
|
&& make install \
|
||||||
|
&& ldconfig \
|
||||||
|
&& cd - > /dev/null \
|
||||||
|
&& rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
|
||||||
|
|
||||||
|
|
||||||
|
RUN rm -r /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|
||||||
|
## Install Pipeline ##
|
||||||
|
COPY hocr2tei hocr-combine ocr /usr/local/bin/
|
||||||
|
|
||||||
|
|
||||||
ENTRYPOINT ["ocr"]
|
ENTRYPOINT ["ocr"]
|
||||||
|
21
LICENSE
Normal file
21
LICENSE
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
139
README.md
139
README.md
@ -1,96 +1,49 @@
|
|||||||
# OCR
|
# OCR - Optical Character Recognition
|
||||||
|
|
||||||
## Build image
|
This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; a convenient wrapper script is provided for that purpose. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
|
||||||
|
|
||||||
1. Clone this repository and navigate into it:
|
## Software used in this pipeline implementation
|
||||||
```
|
|
||||||
git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr
|
- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
|
||||||
```
|
- Software from Debian Buster's free repositories
|
||||||
|
- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
|
||||||
2. Build image:
|
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
|
||||||
```
|
- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0
|
||||||
docker build -t sfb1288inf/ocr:latest .
|
|
||||||
```
|
## Installation
|
||||||
|
|
||||||
Alternatively build from the GitLab repository without cloning:
|
1. Install Docker and Python 3.
|
||||||
|
2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
|
||||||
1. Build image:
|
3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr`
|
||||||
```
|
4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
|
||||||
docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
|
5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
|
||||||
```
|
6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
|
||||||
|
|
||||||
## Download prebuilt image
|
## Use the Pipeline
|
||||||
|
|
||||||
The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers.
|
1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
|
||||||
|
2. Clear your `/<my_data_location>/output` directory.
|
||||||
1. Download image:
|
3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
|
||||||
```
|
```bash
|
||||||
docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest
|
cd /<my_data_location>
|
||||||
```
|
# <model_code> is the model filename without the ".traineddata" suffix
|
||||||
|
ocr \
|
||||||
## Run
|
--input-dir input \
|
||||||
|
--output-dir output \
|
||||||
1. Create input and output directories for the OCR software:
|
--model-file models/<model> \
|
||||||
```
|
-m <model_code> <optional_pipeline_arguments>
|
||||||
mkdir -p /<mydatalocation>/files_for_ocr /<mydatalocation>/files_from_ocr
|
# More than one model
|
||||||
```
|
ocr \
|
||||||
|
--input-dir input \
|
||||||
2. Place your files inside the `/<mydatalocation>/files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. Files should all contain text of the same language.
|
--output-dir output \
|
||||||
|
--model-file models/<model1> \
|
||||||
3. Start the OCR process.
|
--model-file models/<model2> \
|
||||||
```
|
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
|
||||||
docker run \
|
# Instead of multiple --model-file statements, you can also use
|
||||||
--rm \
|
ocr \
|
||||||
-it \
|
--input-dir input \
|
||||||
-u $(id -u $USER):$(id -g $USER) \
|
--output-dir output \
|
||||||
-v /<mydatalocation>/files_for_ocr:/input \
|
--model-file models/* \
|
||||||
-v /<mydatalocation>/files_from_ocr:/output \
|
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
|
||||||
sfb1288inf/ocr:latest \
|
|
||||||
-i /input \
|
|
||||||
-l <languagecode> \
|
|
||||||
-o /output
|
|
||||||
```
|
|
||||||
The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part.
|
|
||||||
|
|
||||||
If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`.
|
|
||||||
|
|
||||||
4. Check your results in the `/<mydatalocation>/files_from_ocr` directory.
|
|
||||||
|
|
||||||
### OCR arguments
|
|
||||||
|
|
||||||
`-l languagecode`
|
|
||||||
* Tells tesseract which language will be used.
|
|
||||||
* options = deu (German), eng (English), enm (Middle English), fra (French), frk (German Fraktur), frm (Middle French), ita (Italian), por (Portuguese), spa (Spanish)
|
|
||||||
* required = True
|
|
||||||
|
|
||||||
`--keep-intermediates`
|
|
||||||
* If set, all intermediate files created during the OCR process will be
|
|
||||||
kept.
|
|
||||||
* default = False
|
|
||||||
* required = False
|
|
||||||
|
|
||||||
`--nCores corenumber`
|
|
||||||
* Sets the number of CPU cores being used during the OCR process.
|
|
||||||
* default = min(4, multiprocessing.cpu_count())
|
|
||||||
* required = False
|
|
||||||
|
|
||||||
`--skip-binarisation`
|
|
||||||
* Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
|
|
||||||
* default = False
|
|
||||||
|
|
||||||
Example with all arguments used:
|
|
||||||
```
|
|
||||||
docker run \
|
|
||||||
--rm \
|
|
||||||
-it \
|
|
||||||
-u $(id -u $USER):$(id -g $USER) \
|
|
||||||
-v "$HOME"/ocr/files_for_ocr:/input \
|
|
||||||
-v "$HOME"/ocr/files_from_ocr:/output \
|
|
||||||
sfb1288inf/ocr:latest \
|
|
||||||
-i /input \
|
|
||||||
-l eng \
|
|
||||||
-o /output \
|
|
||||||
--keep-intermediates \
|
|
||||||
--nCores 8 \
|
|
||||||
--skip-binarisation
|
|
||||||
```
|
```
|
||||||
|
4. Check your results in the `/<my_data_location>/output` directory.
|
||||||
|
44
hocr-combine
Executable file
44
hocr-combine
Executable file
@ -0,0 +1,44 @@
|
|||||||
|
#!/usr/bin/env python3.7
|
||||||
|
# coding=utf-8
|
||||||
|
|
||||||
|
''' Combine multiple hOCR files. '''
|
||||||
|
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from lxml import html
|
||||||
|
|
||||||
|
|
||||||
|
parser = ArgumentParser(description='Combine multiple hOCR files.')
|
||||||
|
parser.add_argument(
|
||||||
|
'-i', '--input-file',
|
||||||
|
help='Input file',
|
||||||
|
nargs='+',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'-o', '--output-file',
|
||||||
|
help='Output file',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
for input_file in args.input_file:
|
||||||
|
input_files = []
|
||||||
|
if input_file.startswith('@'):
|
||||||
|
with open(input_file[1:], 'r') as f:
|
||||||
|
input_files += [x for x in f.read().split("\n") if x != '']
|
||||||
|
else:
|
||||||
|
input_files.append(input_file)
|
||||||
|
if len(input_files) == 0:
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
hocr = html.parse(input_files[0])
|
||||||
|
hocr_body = hocr.find('body')
|
||||||
|
for input_file in input_files[1:]:
|
||||||
|
for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
|
||||||
|
hocr_body.append(ocr_page)
|
||||||
|
|
||||||
|
|
||||||
|
with open(args.output_file, 'wb') as f:
|
||||||
|
hocr.write(f, encoding='UTF-8', method='html')
|
68
hocr2tei
Executable file
68
hocr2tei
Executable file
@ -0,0 +1,68 @@
|
|||||||
|
#!/usr/bin/env python3.7
|
||||||
|
# coding=utf-8
|
||||||
|
|
||||||
|
''' Convert hOCR to TEI XML. '''
|
||||||
|
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from lxml import html
|
||||||
|
from xml.sax.saxutils import escape
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
|
||||||
|
parser.add_argument(
|
||||||
|
'-i', '--input-file',
|
||||||
|
help='Input file',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'-o', '--output-file',
|
||||||
|
help='Output file',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
tei = ''
|
||||||
|
tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
|
||||||
|
tei += ' <teiHeader>\n'
|
||||||
|
tei += ' <fileDesc>\n'
|
||||||
|
tei += ' <titleStmt>\n'
|
||||||
|
tei += ' <title></title>\n'
|
||||||
|
tei += ' </titleStmt>\n'
|
||||||
|
tei += ' <publicationStmt>\n'
|
||||||
|
tei += ' <p></p>\n'
|
||||||
|
tei += ' </publicationStmt>\n'
|
||||||
|
tei += ' <sourceDesc>\n'
|
||||||
|
tei += ' <p></p>\n'
|
||||||
|
tei += ' </sourceDesc>\n'
|
||||||
|
tei += ' </fileDesc>\n'
|
||||||
|
tei += ' </teiHeader>\n'
|
||||||
|
tei += ' <text>\n'
|
||||||
|
tei += ' <body>\n'
|
||||||
|
hocr = html.parse(args.input_file)
|
||||||
|
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
|
||||||
|
ocr_page_title_attrib = ocr_page.attrib.get('title')
|
||||||
|
facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
|
||||||
|
page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
|
||||||
|
tei += f' <pb facs="{facsimile}" n="{page_number}"/>\n'
|
||||||
|
for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
|
||||||
|
tei += ' <p>\n'
|
||||||
|
for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
|
||||||
|
tei += ' <lb/>'
|
||||||
|
is_first_word_in_line = True
|
||||||
|
for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
|
||||||
|
if ocrx_word.text is not None:
|
||||||
|
if not is_first_word_in_line:
|
||||||
|
tei += ' '
|
||||||
|
tei += escape(ocrx_word.text)
|
||||||
|
is_first_word_in_line = False
|
||||||
|
tei += '\n'
|
||||||
|
tei += ' </p>\n'
|
||||||
|
tei += ' </body>\n'
|
||||||
|
tei += ' </text>\n'
|
||||||
|
tei += '</TEI>\n'
|
||||||
|
|
||||||
|
|
||||||
|
with open(args.output_file, 'w') as f:
|
||||||
|
f.write(tei)
|
58
hocrtotei
58
hocrtotei
@ -1,58 +0,0 @@
|
|||||||
#!/usr/bin/env python3.5
|
|
||||||
# coding=utf-8
|
|
||||||
|
|
||||||
from xml.sax.saxutils import escape
|
|
||||||
import argparse
|
|
||||||
import xml.etree.ElementTree as ET
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'i',
|
|
||||||
metavar='hOCR-sourcefile',
|
|
||||||
nargs='+'
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'o',
|
|
||||||
metavar='TEI-destfile',
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
output_file = open(args.o, 'w')
|
|
||||||
|
|
||||||
output_file.write(
|
|
||||||
'<?xml version="1.0" encoding="UTF-8"?>\n'
|
|
||||||
+ '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
|
|
||||||
+ ' <teiHeader>\n'
|
|
||||||
+ ' <fileDesc>\n'
|
|
||||||
+ ' <titleStmt/>\n'
|
|
||||||
+ ' <publicationStmt/>\n'
|
|
||||||
+ ' <sourceDesc/>\n'
|
|
||||||
+ ' </fileDesc>\n'
|
|
||||||
+ ' <encodingDesc/>\n'
|
|
||||||
+ ' <profileDesc/>\n'
|
|
||||||
+ ' </teiHeader>\n'
|
|
||||||
+ ' <text>\n'
|
|
||||||
+ ' <body>\n'
|
|
||||||
)
|
|
||||||
for index, input_file in enumerate(args.i):
|
|
||||||
tree = ET.parse(input_file)
|
|
||||||
output_file.write(' <pb n="%i"/>\n' % (index + 1))
|
|
||||||
for para in tree.findall('.//*[@class="ocr_par"]'):
|
|
||||||
output_file.write(' <p>\n')
|
|
||||||
for line in para.findall('.//*[@class="ocr_line"]'):
|
|
||||||
first_word_in_line = True
|
|
||||||
for word in line.findall('.//*[@class="ocrx_word"]'):
|
|
||||||
if word.text is not None:
|
|
||||||
output_file.write((' ' if first_word_in_line else ' ') + escape(word.text.strip()))
|
|
||||||
first_word_in_line = False
|
|
||||||
if not first_word_in_line:
|
|
||||||
output_file.write('<lb/>\n')
|
|
||||||
output_file.write(' </p>\n')
|
|
||||||
output_file.write(
|
|
||||||
' </body>\n'
|
|
||||||
+ ' </text>\n'
|
|
||||||
+ '</TEI>')
|
|
||||||
|
|
||||||
output_file.close()
|
|
57
wrapper/ocr
57
wrapper/ocr
@ -1,39 +1,44 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
import argparse
|
from argparse import ArgumentParser
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest'
|
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0'
|
||||||
container_input_dir = '/input'
|
CONTAINER_INPUT_DIR = '/input'
|
||||||
container_output_dir = '/output'
|
CONTAINER_OUTPUT_DIR = '/output'
|
||||||
uid = str(os.getuid())
|
CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
|
||||||
gid = str(os.getgid())
|
CONTAINER_LOG_DIR = '/logs'
|
||||||
|
UID = str(os.getuid())
|
||||||
|
GID = str(os.getgid())
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(add_help=False)
|
parser = ArgumentParser(add_help=False)
|
||||||
parser.add_argument(
|
parser.add_argument('-i', '--input-dir')
|
||||||
'-i',
|
parser.add_argument('-o', '--output-dir')
|
||||||
dest='input_dir',
|
parser.add_argument('-t', '--model-file', action='extend', nargs='+')
|
||||||
required=False
|
parser.add_argument('--log-dir')
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'-o',
|
|
||||||
dest='output_dir',
|
|
||||||
required=False
|
|
||||||
)
|
|
||||||
args, remaining_args = parser.parse_known_args()
|
args, remaining_args = parser.parse_known_args()
|
||||||
|
|
||||||
cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid]
|
cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
|
||||||
if args.input_dir is not None:
|
if args.input_dir is not None:
|
||||||
host_input_dir = os.path.abspath(args.input_dir)
|
mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
|
||||||
cmd += ['-v', host_input_dir + ':' + container_input_dir]
|
cmd += ['-v', mapping]
|
||||||
remaining_args += ['-i', container_input_dir]
|
remaining_args += ['-i', CONTAINER_INPUT_DIR]
|
||||||
if args.output_dir is not None:
|
if args.output_dir is not None:
|
||||||
host_output_dir = os.path.abspath(args.output_dir)
|
mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
|
||||||
cmd += ['-v', host_output_dir + ':' + container_output_dir]
|
cmd += ['-v', mapping]
|
||||||
remaining_args += ['-o', container_output_dir]
|
remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
|
||||||
cmd.append(container_image)
|
if args.model_file is not None:
|
||||||
|
for model_file in args.model_file:
|
||||||
|
mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}' # noqa
|
||||||
|
cmd += ['-v', mapping]
|
||||||
|
if args.log_dir is not None:
|
||||||
|
mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
|
||||||
|
cmd += ['-v', mapping]
|
||||||
|
remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
|
||||||
|
cmd.append(CONTAINER_IMAGE)
|
||||||
cmd += remaining_args
|
cmd += remaining_args
|
||||||
|
|
||||||
subprocess.run(cmd)
|
sys.exit(subprocess.run(cmd).returncode)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user