Compare commits

...

44 Commits
1.0 ... master

Author SHA1 Message Date
Patrick Jentsch
ca1803ab8a Mark required arguments in scripts as required 2022-02-03 10:40:50 +01:00
Patrick Jentsch
4518ca1c83 Codestyle enhacements 2022-01-27 13:40:23 +01:00
Patrick Jentsch
aeab9b7802 Fix enumeration in readme 2022-01-18 13:46:52 +01:00
Patrick Jentsch
00c4b17018 Codestyle update 2022-01-18 13:45:17 +01:00
Patrick Jentsch
c057d324cf Cleanup and change some output options 2022-01-17 15:07:46 +01:00
Patrick Jentsch
f51a8c4546 Change output files file format 2022-01-14 10:56:16 +01:00
Patrick Jentsch
c640d9743f Add output_files.json (lists all output files) generation. 2022-01-05 11:25:00 +01:00
Patrick Jentsch
e3fd679b38 Mark all scripts as executeable 2022-01-04 13:21:38 +01:00
Patrick Jentsch
8a3816121c fix image tag 2022-01-04 12:10:26 +01:00
Patrick Jentsch
e1b78b6ba4 Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00
Patrick Jentsch
a0760487ae Don't process files in subdirectories 2021-04-12 13:22:28 +02:00
Patrick Jentsch
a798457c43 Add mising --log-dir argument to wrapper script 2021-04-12 09:53:59 +02:00
Patrick Jentsch
e2da0fb839 Tweak the README and pipeline help. 2021-03-26 10:03:59 +01:00
Patrick Jentsch
e78f667438 Use more descriptive argument names then i and o (now: input and output) 2021-03-18 10:32:55 +01:00
Patrick Jentsch
41f70da8eb Update the hocrtotei script 2021-03-17 16:58:13 +01:00
Patrick Jentsch
6db7f70446 Add back german language models 2021-03-17 14:26:24 +01:00
Patrick Jentsch
947658a7d8 Change intermediate image name in order to fix issues with building multiple branches/tags at the same time 2021-03-15 14:11:23 +01:00
Patrick Jentsch
acbf61be05 Cleanup and make use of globbing for input files for binarization and ocr 2021-03-15 12:45:05 +01:00
Patrick Jentsch
104598039e Dockerfile codestyle 2021-02-24 15:28:04 +01:00
Patrick Jentsch
da29659a9b Add back missing author mention 2021-02-24 15:17:42 +01:00
Patrick Jentsch
613bceb4ff Add new models 2021-02-23 11:11:50 +01:00
Patrick Jentsch
ca7df6d0ed First work on version 1.0.0 2021-02-19 13:04:03 +01:00
Patrick Jentsch
07635dcdfa Use "buster" instead of "10" in FROM 2020-10-08 23:17:48 +02:00
Patrick Jentsch
c0069d5453 Use new Dockerfile structure 2020-10-08 23:09:10 +02:00
Patrick Jentsch
e941f64ee4 test new ci config 2020-10-07 16:44:38 +02:00
Stephan Porada
cb68d6de2d One thread per page ocr patch 2020-10-07 13:46:22 +02:00
Patrick Jentsch
4b84488fe6 fix gitlab ci 2020-09-23 16:58:07 +02:00
Patrick Jentsch
7d52ad9f68 Update 2020-09-23 15:52:24 +02:00
Patrick Jentsch
ac4b5c2fd8 Add possibility to use an intermediate dir 2020-09-22 17:44:32 +02:00
Patrick Jentsch
6d90d43699 fix cleanup attempt 2020-09-21 15:36:03 +02:00
Patrick Jentsch
4bd0d3bb01 Use commit_sha for intermediate image 2020-09-21 15:02:04 +02:00
Patrick Jentsch
15061bfaaf add tag to clean stage 2020-09-21 15:00:09 +02:00
Patrick Jentsch
7cc8ebd666 compile tesseract in container 2020-09-21 14:46:03 +02:00
Patrick Jentsch
82285a8e6c better multithreading 2020-07-02 11:49:35 +02:00
Patrick Jentsch
7322a5bc7c More GhostScript, less dependencies! 2020-07-02 11:47:43 +02:00
Patrick Jentsch
2b63ba9e59 Remove unused dependencies and use ghostscript for image split 2020-07-01 11:03:34 +02:00
Patrick Jentsch
aee9628e5e fix pipeline 2020-06-23 15:19:27 +02:00
Stephan Porada
ec5b4eb521 Add PDF compression 2020-06-16 09:31:34 +02:00
Stephan Porada
b77ca5914f Set relative file paths in hocr 2020-06-10 11:48:58 +02:00
Stephan Porada
018939ae55 Add PoCo zips part 1 2020-06-09 16:58:22 +02:00
Patrick Jentsch
64fe706126 Keep uncompressed output files after zip jobs. 2020-05-13 09:11:01 +02:00
Patrick Jentsch
a75b32ca1d Bump versions 2020-04-06 09:21:52 +02:00
Patrick Jentsch
364e3d626d Fix zip creation 2020-04-04 15:37:21 +02:00
Patrick Jentsch
36a86887b0 Update OCR Pipeline 2020-04-03 17:35:30 +02:00
9 changed files with 981 additions and 768 deletions

View File

@ -1,44 +1,68 @@
image: docker:stable image: docker:19.03.13
services: services:
- docker:stable-dind - docker:19.03.13-dind
variables:
DOCKER_DRIVER: overlay2
stages: stages:
- build - build
- push - push
before_script: variables:
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY DOCKER_TLS_CERTDIR: "/certs"
INTERMEDIATE_IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME-$CI_COMMIT_SHA
Build: .reg_setup:
before_script:
- apk add --no-cache curl
- curl --fail --show-error --location "https://github.com/genuinetools/reg/releases/download/v$REG_VERSION/reg-linux-amd64" --output /usr/local/bin/reg
- echo "$REG_SHA256 /usr/local/bin/reg" | sha256sum -c -
- chmod a+x /usr/local/bin/reg
variables:
REG_SHA256: ade837fc5224acd8c34732bf54a94f579b47851cc6a7fd5899a98386b782e228
REG_VERSION: 0.16.1
build_image:
script: script:
- docker build --pull -t $CI_REGISTRY_IMAGE:tmp . - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
- docker push $CI_REGISTRY_IMAGE:tmp - docker build -t $INTERMEDIATE_IMAGE_TAG .
- docker push $INTERMEDIATE_IMAGE_TAG
stage: build stage: build
tags: tags:
- docker - docker
Push latest: push_master:
extends:
- .reg_setup
only: only:
- master - master
script: script:
- docker pull $CI_REGISTRY_IMAGE:tmp - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
- docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:latest - docker pull $INTERMEDIATE_IMAGE_TAG
- docker push $CI_REGISTRY_IMAGE:latest - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
- docker push $IMAGE_TAG
stage: push stage: push
tags: tags:
- docker - docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:latest
Push tag: push_other:
extends:
- .reg_setup
except:
- master
only: only:
- branches
- tags - tags
script: script:
- docker pull $CI_REGISTRY_IMAGE:tmp - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
- docker tag $CI_REGISTRY_IMAGE:tmp $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME - docker pull $INTERMEDIATE_IMAGE_TAG
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME - /usr/local/bin/reg rm -d --auth-url $CI_REGISTRY -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $INTERMEDIATE_IMAGE_TAG
- docker tag $INTERMEDIATE_IMAGE_TAG $IMAGE_TAG
- docker push $IMAGE_TAG
stage: push stage: push
tags: tags:
- docker - docker
variables:
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME

View File

@ -1,73 +1,85 @@
FROM debian:9-slim FROM debian:buster-slim
# Define image metadata LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <porada@posteo.de>"
LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
ENV LANG=C.UTF-8 ENV LANG=C.UTF-8
# Install prerequisites
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y --no-install-recommends \ && apt-get install --no-install-recommends --yes \
apt-transport-https \ ghostscript \
ca-certificates \ procps \
gnupg2 \ python3.7 \
imagemagick \ python3-pip \
poppler-utils \ rename \
python2.7 \
python3.5 \
wget \ wget \
zip \ zip \
&& rm -rf /var/lib/apt/lists/* && python3 -m pip install lxml
ENV OCROPY_VERSION 1.3.3 # Install the OCR pipeline and its dependencies #
ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" . ## Install pyFlow ##
RUN tar -xzf "v${OCROPY_VERSION}.tar.gz" \ ENV PYFLOW_VERSION=1.1.20
RUN wget --no-check-certificate --quiet \
"https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
&& tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
&& cd "pyflow-${PYFLOW_VERSION}" \
&& apt-get install --no-install-recommends --yes \
python2.7 \
&& python2.7 setup.py build install \
&& cd - > /dev/null \
&& rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
## Install ocropy ##
ENV OCROPY_VERSION=1.3.3
RUN wget --no-check-certificate --quiet \
"https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
&& tar -xzf "v${OCROPY_VERSION}.tar.gz" \
&& cd "ocropy-${OCROPY_VERSION}" \ && cd "ocropy-${OCROPY_VERSION}" \
&& apt-get update \ && apt-get install --no-install-recommends --yes \
&& apt-get install -y --no-install-recommends \ python2.7 \
python-pil \ python-pil \
python-tk \ python-tk \
$(cat PACKAGES) \ $(cat PACKAGES) \
&& rm -rf /var/lib/apt/lists/* \
&& python2.7 setup.py install \ && python2.7 setup.py install \
&& cd .. \ && cd - > /dev/null \
&& rm -rf \ && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
"ocropy-${OCROPY_VERSION}" \
"v${OCROPY_VERSION}.tar.gz"
ENV PYFLOW_VERSION=1.1.20
ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" .
RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
&& cd "pyflow-${PYFLOW_VERSION}" \
&& python2.7 setup.py build install \
&& cd .. \
&& rm -rf \
"pyflow-${PYFLOW_VERSION}" \
"pyflow-${PYFLOW_VERSION}.tar.gz"
RUN echo "deb https://notesalexp.org/tesseract-ocr/stretch/ stretch main" >> /etc/apt/sources.list \
&& wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
tesseract-ocr \
tesseract-ocr-deu \
tesseract-ocr-eng \
tesseract-ocr-enm \
tesseract-ocr-fra \
tesseract-ocr-frk \
tesseract-ocr-frm \
tesseract-ocr-ita \
tesseract-ocr-por \
tesseract-ocr-spa \
&& rm -rf /var/lib/apt/lists/*
# Install OCR pipeline ## Install Tesseract OCR ##
COPY hocrtotei /usr/local/bin ENV TESSERACT_VERSION=5.0.0
COPY ocr /usr/local/bin RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \
&& cd "tesseract-${TESSERACT_VERSION}" \
&& apt-get install --no-install-recommends --yes \
autoconf \
automake \
g++ \
libjpeg62-turbo-dev \
libleptonica-dev \
libtiff5-dev \
libtool \
libpng-dev \
make \
pkg-config \
zlib1g-dev \
&& ./autogen.sh \
&& ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
&& make \
&& make install \
&& ldconfig \
&& cd - > /dev/null \
&& rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
RUN rm -r /var/lib/apt/lists/*
## Install Pipeline ##
COPY hocr2tei hocr-combine ocr /usr/local/bin/
ENTRYPOINT ["ocr"] ENTRYPOINT ["ocr"]

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

139
README.md
View File

@ -1,96 +1,49 @@
# OCR # OCR - Optical Character Recognition
## Build image This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
1. Clone this repository and navigate into it: ## Software used in this pipeline implementation
```
git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git && cd ocr - Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
``` - Software from Debian Buster's free repositories
- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
2. Build image: - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
``` - Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0
docker build -t sfb1288inf/ocr:latest .
``` ## Installation
Alternatively build from the GitLab repository without cloning: 1. Install Docker and Python 3.
2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
1. Build image: 3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr`
``` 4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
docker build -t sfb1288inf/ocr:latest https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git 5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
``` 6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
## Download prebuilt image ## Use the Pipeline
The GitLab registry provides a prebuilt image. It is automatically created, utilizing the conquaire build servers. 1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
2. Clear your `/<my_data_location>/output` directory.
1. Download image: 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
``` ```bash
docker pull gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest cd /<my_data_location>
``` # <model_code> is the model filename without the ".traineddata" suffix
ocr \
## Run --input-dir input \
--output-dir output \
1. Create input and output directories for the OCR software: --model-file models/<model> \
``` -m <model_code> <optional_pipeline_arguments>
mkdir -p /<mydatalocation>/files_for_ocr /<mydatalocation>/files_from_ocr # More than one model
``` ocr \
--input-dir input \
2. Place your files inside the `/<mydatalocation>/files_for_ocr` directory. Files can either be PDF (.pdf) or multipage TIFF (.tiff, .tif) files. Files should all contain text of the same language. --output-dir output \
--model-file models/<model1> \
3. Start the OCR process. --model-file models/<model2> \
``` -m <model1_code>+<model2_code> <optional_pipeline_arguments>
docker run \ # Instead of multiple --model-file statements, you can also use
--rm \ ocr \
-it \ --input-dir input \
-u $(id -u $USER):$(id -g $USER) \ --output-dir output \
-v /<mydatalocation>/files_for_ocr:/input \ --model-file models/* \
-v /<mydatalocation>/files_from_ocr:/output \ -m <model1_code>+<model2_code> <optional_pipeline_arguments>
sfb1288inf/ocr:latest \
-i /input \
-l <languagecode> \
-o /output
```
The arguments below `sfb1288inf/ocr:latest` are described in the [OCR arguments](#ocr-arguments) part.
If you want to use the prebuilt image, replace `sfb1288inf/ocr:latest` with `gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest`.
4. Check your results in the `/<mydatalocation>/files_from_ocr` directory.
### OCR arguments
`-l languagecode`
* Tells tesseract which language will be used.
* options = deu (German), eng (English), enm (Middle englisch), fra (French), frk (German Fraktur), frm (Middle french), ita (Italian), por (Portuguese), spa (Spanish)
* required = True
`--keep-intermediates`
* If set, all intermediate files created during the OCR process will be
kept.
* default = False
* required = False
`--nCores corenumber`
* Sets the number of CPU cores being used during the OCR process.
* default = min(4, multiprocessing.cpu_count())
* required = False
`--skip-binarisation`
* Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
* default = False
Example with all arguments used:
```
docker run \
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v "$HOME"/ocr/files_for_ocr:/input \
-v "$HOME"/ocr/files_from_ocr:/output \
sfb1288inf/ocr:latest \
-i /input \
-l eng \
-o /output \
--keep_intermediates \
--nCores 8 \
--skip-binarisation
``` ```
4. Check your results in the `/<my_data_location>/output` directory.

44
hocr-combine Executable file
View File

@ -0,0 +1,44 @@
#!/usr/bin/env python3.7
# coding=utf-8
''' Combine multiple hOCR files. '''
from argparse import ArgumentParser
from lxml import html
# Command line interface: one or more input files (or "@list" files) and a
# single output file.
parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    nargs='+',
    required=True
)
parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
)
args = parser.parse_args()


# Build the ordered list of hOCR files to combine. The list is created once,
# before the loop, so that every --input-file value contributes to it
# (creating it inside the loop would discard all but the last argument).
input_files = []
for input_file in args.input_file:
    if input_file.startswith('@'):
        # "@path" refers to a text file that lists one input file per line;
        # empty lines are ignored.
        with open(input_file[1:], 'r') as f:
            input_files += [x for x in f.read().split("\n") if x != '']
    else:
        input_files.append(input_file)
if len(input_files) == 0:
    # Nothing to combine (e.g. only empty "@list" files were given).
    raise SystemExit(1)


# The first file serves as the base document; the "ocr_page" divs of all
# remaining files are appended to its <body> in the given order.
hocr = html.parse(input_files[0])
hocr_body = hocr.find('body')
for input_file in input_files[1:]:
    for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
        hocr_body.append(ocr_page)


# Serialize the combined document as UTF-8 encoded HTML.
with open(args.output_file, 'wb') as f:
    hocr.write(f, encoding='UTF-8', method='html')

68
hocr2tei Executable file
View File

@ -0,0 +1,68 @@
#!/usr/bin/env python3.7
# coding=utf-8
''' Convert hOCR to TEI XML. '''
from argparse import ArgumentParser
from lxml import html
from xml.sax.saxutils import escape
import re
# Command line interface: a single hOCR input file and a single TEI output
# file.
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    required=True
)
parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
)
args = parser.parse_args()


# The TEI document is collected as a list of fragments and joined once at the
# end, which avoids repeated string concatenation for large documents.
tei_parts = [
    '',
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n',
    ' <teiHeader>\n',
    ' <fileDesc>\n',
    ' <titleStmt>\n',
    ' <title></title>\n',
    ' </titleStmt>\n',
    ' <publicationStmt>\n',
    ' <p></p>\n',
    ' </publicationStmt>\n',
    ' <sourceDesc>\n',
    ' <p></p>\n',
    ' </sourceDesc>\n',
    ' </fileDesc>\n',
    ' </teiHeader>\n',
    ' <text>\n',
    ' <body>\n'
]

hocr = html.parse(args.input_file)
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
    # The page's title attribute carries the source image path and the
    # physical page number, which become the <pb/> attributes.
    ocr_page_title_attrib = ocr_page.attrib.get('title')
    facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
    page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
    # The image path ends up inside an XML attribute, so "&", "<", ">" and
    # the double quote have to be escaped to keep the output well-formed.
    facsimile = escape(facsimile, {'"': '&quot;'})
    tei_parts.append(f' <pb facs="{facsimile}" n="{page_number}"/>\n')
    for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
        tei_parts.append(' <p>\n')
        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
            tei_parts.append(' <lb/>')
            is_first_word_in_line = True
            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
                # Word spans without text content are skipped.
                if ocrx_word.text is not None:
                    # Words of one line are separated by a single space.
                    if not is_first_word_in_line:
                        tei_parts.append(' ')
                    tei_parts.append(escape(ocrx_word.text))
                    is_first_word_in_line = False
            tei_parts.append('\n')
        tei_parts.append(' </p>\n')
tei_parts.append(' </body>\n')
tei_parts.append(' </text>\n')
tei_parts.append('</TEI>\n')
tei = ''.join(tei_parts)


# The document has no XML declaration, so XML parsers will assume UTF-8;
# write it as UTF-8 regardless of the locale's preferred encoding.
with open(args.output_file, 'w', encoding='utf-8') as f:
    f.write(tei)

View File

@ -1,58 +0,0 @@
#!/usr/bin/env python3.5
# coding=utf-8
from xml.sax.saxutils import escape
import argparse
import xml.etree.ElementTree as ET
parser = argparse.ArgumentParser(
description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
)
parser.add_argument(
'i',
metavar='hOCR-sourcefile',
nargs='+'
)
parser.add_argument(
'o',
metavar='TEI-destfile',
)
args = parser.parse_args()
output_file = open(args.o, 'w')
output_file.write(
'<?xml version="1.0" encoding="UTF-8"?>\n'
+ '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
+ ' <teiHeader>\n'
+ ' <fileDesc>\n'
+ ' <titleStmt/>\n'
+ ' <publicationStmt/>\n'
+ ' <sourceDesc/>\n'
+ ' </fileDesc>\n'
+ ' <encodingDesc/>\n'
+ ' <profileDesc/>\n'
+ ' </teiHeader>\n'
+ ' <text>\n'
+ ' <body>\n'
)
for index, input_file in enumerate(args.i):
tree = ET.parse(input_file)
output_file.write(' <pb n="%i"/>\n' % (index + 1))
for para in tree.findall('.//*[@class="ocr_par"]'):
output_file.write(' <p>\n')
for line in para.findall('.//*[@class="ocr_line"]'):
first_word_in_line = True
for word in line.findall('.//*[@class="ocrx_word"]'):
if word.text is not None:
output_file.write((' ' if first_word_in_line else ' ') + escape(word.text.strip()))
first_word_in_line = False
if not first_word_in_line:
output_file.write('<lb/>\n')
output_file.write(' </p>\n')
output_file.write(
' </body>\n'
+ ' </text>\n'
+ '</TEI>')
output_file.close()

1180
ocr

File diff suppressed because it is too large Load Diff

View File

@ -1,39 +1,44 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# coding=utf-8 # coding=utf-8
import argparse from argparse import ArgumentParser
import os import os
import subprocess import subprocess
import sys
container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest' CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0'
container_input_dir = '/input' CONTAINER_INPUT_DIR = '/input'
container_output_dir = '/output' CONTAINER_OUTPUT_DIR = '/output'
uid = str(os.getuid()) CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
gid = str(os.getgid()) CONTAINER_LOG_DIR = '/logs'
UID = str(os.getuid())
GID = str(os.getgid())
parser = argparse.ArgumentParser(add_help=False) parser = ArgumentParser(add_help=False)
parser.add_argument( parser.add_argument('-i', '--input-dir')
'-i', parser.add_argument('-o', '--output-dir')
dest='input_dir', parser.add_argument('-t', '--model-file', action='extend', nargs='+')
required=False parser.add_argument('--log-dir')
)
parser.add_argument(
'-o',
dest='output_dir',
required=False
)
args, remaining_args = parser.parse_known_args() args, remaining_args = parser.parse_known_args()
cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid] cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
if args.input_dir is not None: if args.input_dir is not None:
host_input_dir = os.path.abspath(args.input_dir) mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
cmd += ['-v', host_input_dir + ':' + container_input_dir] cmd += ['-v', mapping]
remaining_args += ['-i', container_input_dir] remaining_args += ['-i', CONTAINER_INPUT_DIR]
if args.output_dir is not None: if args.output_dir is not None:
host_output_dir = os.path.abspath(args.output_dir) mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
cmd += ['-v', host_output_dir + ':' + container_output_dir] cmd += ['-v', mapping]
remaining_args += ['-o', container_output_dir] remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
cmd.append(container_image) if args.model_file is not None:
for model_file in args.model_file:
mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}' # noqa
cmd += ['-v', mapping]
if args.log_dir is not None:
    # Mount the host log directory into the container. The f-prefix is
    # required: without it Docker would receive the literal placeholder text
    # instead of "<absolute host path>:<container log dir>".
    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
    cmd += ['-v', mapping]
    # Inside the container the pipeline must use the mounted path.
    remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
cmd.append(CONTAINER_IMAGE)
cmd += remaining_args cmd += remaining_args
subprocess.run(cmd) sys.exit(subprocess.run(cmd).returncode)