Update to Tesseract 5.0.0, Set version 0.1.0

This commit is contained in:
Patrick Jentsch 2022-01-04 11:42:55 +01:00
parent a0760487ae
commit e1b78b6ba4
7 changed files with 574 additions and 338 deletions


@@ -9,8 +9,14 @@ ENV LANG=C.UTF-8
RUN apt-get update \
&& apt-get install --no-install-recommends --yes \
wget
ghostscript \
procps \
python3.7 \
python3-pip \
rename \
wget \
zip \
&& python3 -m pip install lxml
# Install the OCR pipeline and its dependencies #
## Install pyFlow ##
@@ -43,7 +49,7 @@ RUN wget --no-check-certificate --quiet \
## Install Tesseract OCR ##
ENV TESSERACT_VERSION=4.1.1
ENV TESSERACT_VERSION=5.0.0
RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \
@@ -61,37 +67,20 @@ RUN wget --no-check-certificate --quiet \
pkg-config \
zlib1g-dev \
&& ./autogen.sh \
&& ./configure \
&& ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
&& make \
&& make install \
&& ldconfig \
&& cd - > /dev/null \
&& rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
ENV TESSDATA_BEST_VERSION=4.1.0
RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
&& tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
&& for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \
&& rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
## Further dependencies ##
RUN apt-get install --no-install-recommends --yes \
procps \
ghostscript \
python3.7 \
rename \
zip
## Install Pipeline ##
COPY hocrtotei ocr /usr/local/bin/
RUN rm -r /var/lib/apt/lists/*
## Install Pipeline ##
COPY hocr2tei hocr-combine ocr /usr/local/bin/
ENTRYPOINT ["ocr"]
CMD ["--help"]

LICENSE (new file, 21 lines)

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@@ -1,6 +1,6 @@
# OCR - Optical Character Recognition
This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; a convenient wrapper script is provided for that purpose.
This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; a convenient wrapper script is provided for that purpose. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
## Software used in this pipeline implementation
@@ -8,37 +8,26 @@ This software implements a heavily parallelized pipeline to recognize text in PD
- Software from Debian Buster's free repositories
- ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
- pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0
## Use this image
## Installation
1. Create input and output directories for the pipeline.
``` bash
mkdir -p /<my_data_location>/input /<my_data_location>/output
```
1. Install Docker and Python 3.
2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0 ocr`
4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`; see the example below.
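For example, a model can be fetched from the tessdata_best collection that earlier versions of this image bundled (a hypothetical fetch; any `.traineddata` file compatible with Tesseract 5 works):
```bash
# Hypothetical example: download the English tessdata_best model
# into the models directory created in the previous step.
wget -P /<my_data_location>/models \
    "https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata"
```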
2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
## Use the Pipeline
1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
2. Clear your `/<my_data_location>/output` directory.
3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
```
# Option one: Use the wrapper script
## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/development/wrapper/ocr, make it executable and add it to your ${PATH}
```bash
cd /<my_data_location>
ocr -i input -l <language_code> -o output <optional_pipeline_arguments>
# Option two: Classic Docker style
docker run \
--rm \
-it \
-u $(id -u $USER):$(id -g $USER) \
-v /<my_data_location>/input:/input \
-v /<my_data_location>/output:/output \
gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:development \
-i /ocr_pipeline/input \
-l <language_code> \
-o /ocr_pipeline/output \
<optional_pipeline_arguments>
ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments>
# or
ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments>
```
4. Check your results in the `/<my_data_location>/output` directory.
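Based on the pipeline code, each input PDF gets a directory of its own inside `output`; a rough sketch of the expected layout (file names are illustrative):
```
output/
└── example.pdf/            # one directory per input file
    ├── images/             # per-page PNG files rendered by Ghostscript
    ├── example.hocr        # combined hOCR
    ├── example.pdf         # combined, OCRed PDF
    ├── example.txt         # combined plain text
    └── example.xml         # TEI XML
```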

hocr-combine (new file, 35 lines)

@@ -0,0 +1,35 @@
#!/usr/bin/env python3.7
# coding=utf-8
""""Combine multiple hOCR files."""
from argparse import ArgumentParser
from lxml import html
parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument('file', help='Input file(s)', nargs='+')
parser.add_argument('-o', '--output-file', help='Output file', required=True)
args = parser.parse_args()
files = []
for file in args.file:
    if file.startswith('@'):
        with open(file[1:], 'r') as f:
            files += [x for x in f.read().split("\n") if x != '']
    else:
        files.append(file)
if len(files) == 0:
    exit(1)
hocr = html.parse(files[0])
hocr_body = hocr.find('body')
for file in files[1:]:
    for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
        hocr_body.append(ocr_page)
with open(args.output_file, 'wb') as f:
    hocr.write(f, encoding='UTF-8', method='html')
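A usage sketch with hypothetical file names; the `@` form reads one input path per line, which is how the pipeline's `hocr_combine` task feeds it an `inputs.txt`:
```bash
# Pass page-level hOCR files directly...
hocr-combine page-1.hocr page-2.hocr -o combined.hocr
# ...or via a list file (one path per line), as the pipeline does
ls -dv pages/* > inputs.txt
hocr-combine @inputs.txt -o combined.hocr
```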

hocrtotei → hocr2tei (renamed; executable file → normal file, 39 lines)

@@ -3,16 +3,18 @@
""""Convert hOCR to TEI XML."""
from xml.sax.saxutils import escape
from argparse import ArgumentParser
from lxml import html
from xml.sax.saxutils import escape
import re
import xml.etree.ElementTree as ET
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument('input', metavar='Path to hOCR input file')
parser.add_argument('output', metavar='Path to TEI output file')
parser.add_argument('file', help='Input file')
parser.add_argument('-o', '--output-file', help='Output file', required=True)
args = parser.parse_args()
tei = ''
tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
tei += ' <teiHeader>\n'
@@ -30,28 +32,27 @@ tei += ' </fileDesc>\n'
tei += ' </teiHeader>\n'
tei += ' <text>\n'
tei += ' <body>\n'
# Conversion start
hocr = ET.parse(args.input)
for page in hocr.findall('.//*[@class="ocr_page"]'):
page_properties = page.attrib.get('title')
facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1)
page_number = re.search(r'ppageno (\d+)', page_properties).group(1)
tei += ' <pb facs="{}" n="{}"/>\n'.format(facsimile, page_number)
for para in page.findall('.//*[@class="ocr_par"]'):
hocr = html.parse(args.file)
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
ocr_page_title_attrib = ocr_page.attrib.get('title')
facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
tei += f' <pb facs="{facsimile}" n="{page_number}"/>\n'
for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
tei += ' <p>\n'
for line in para.findall('.//*[@class="ocr_line"]'):
for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
tei += ' <lb/>'
indent = ''
for word in line.findall('.//*[@class="ocrx_word"]'):
if word.text is not None:
tei += indent + escape(word.text.strip())
for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
if ocrx_word.text is not None:
tei += indent + escape(ocrx_word.text)
indent = ' '
tei += '\n'
tei += ' </p>\n'
# Conversion end
tei += ' </body>\n'
tei += ' </text>\n'
tei += '</TEI>\n'
with open(args.output, 'w') as tei_file:
tei_file.write(tei)
with open(args.output_file, 'w') as f:
f.write(tei)
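A usage sketch (hypothetical file names), mirroring how the pipeline's `hocr2tei` task invokes the script:
```bash
hocr2tei document.hocr --output-file document.xml
```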

ocr (719 lines changed)

@@ -1,11 +1,9 @@
#!/usr/bin/env python2.7
# coding=utf-8
"""OCR pipeline for PDF file processing."""
''' OCR pipeline for PDF file processing. '''
__version__ = '0.1.0'
__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
'Stephan Porada <porada@posteo.de>'
__version__ = '1.0.0'
from argparse import ArgumentParser
from pyflow import WorkflowRunner
@@ -14,145 +12,402 @@ import os
import sys
class OCRPipelineJob:
"""An OCR pipeline job class
class PipelineJob:
'''
OCR pipeline job class.
Each input file of the pipeline is represented as an OCR pipeline job,
which holds all necessary information for the pipeline to process it.
Arguments:
file -- Path to the file
output_dir -- Path to a directory, where job results a stored
"""
output_dir -- Path to a directory where job results are stored
'''
def __init__(self, file, output_dir):
self.file = file
self.name = os.path.basename(file).rsplit('.', 1)[0]
self.name = os.path.basename(file)[:-4]
self.output_dir = output_dir
self.page_dir = os.path.join(output_dir, 'pages')
self.tmp_dir = os.path.join(output_dir, 'tmp')
class OCRPipeline(WorkflowRunner):
def __init__(self, input_dir, lang, output_dir, binarize, zip):
class SplitInputWorkflow(WorkflowRunner):
def __init__(self, job):
self.job = job
def workflow(self):
'''
' ##################################################
' # gs #
' ##################################################
'''
n_cores = min(2, self.getNCores())
mem_mb = min(n_cores * 512, self.getMemMb())
cmd = 'gs'
cmd += ' -dBATCH'
cmd += ' -dNOPAUSE'
cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000)
cmd += ' -dNumRenderingThreads={}'.format(n_cores)
cmd += ' -dQUIET'
cmd += ' -r300'
cmd += ' -sDEVICE=png16m'
cmd += ' -sOutputFile="{}/page-%d.png"'.format(
os.path.join(self.job.tmp_dir, 'images')
)
cmd += ' "{}"'.format(self.job.file)
self.addTask(
'gs',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
class BinarizationWorkflow(WorkflowRunner):
def __init__(self, job):
self.job = job
def workflow(self):
'''
' ##################################################
' # ocropus-nlbin #
' ##################################################
'''
# TODO: Update to newer ocropus-nlbin and start one task per page
n_cores = self.getNCores()
mem_mb = min(512 * n_cores, self.getMemMb())
cmd = 'ls -dv "{}/"* > "{}"'.format(
os.path.join(self.job.tmp_dir, 'images'),
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
)
cmd += ' && '
cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa
cmd += ' --nocheck'
cmd += ' --output "{}"'.format(
os.path.join(self.job.tmp_dir, 'images'))
cmd += ' --parallel "{}"'.format(n_cores)
cmd += ' && '
cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa
ocropus_nlbin_task = self.addTask(
'ocropus_nlbin',
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
'''
' ##################################################
' # cleanup #
' ##################################################
'''
n_cores = 1
mem_mb = min(128, self.getMemMb())
cmd = 'cd "{}"'.format(os.path.join(self.job.tmp_dir, 'images'))
cmd += ' && '
cmd += 'mkdir tmp'
cmd += ' && '
cmd += 'mv *.bin.png tmp'
cmd += ' && '
cmd += 'rm *.png'
cmd += ' && '
cmd += 'mv tmp/* .'
cmd += ' && '
cmd += 'rmdir tmp'
cmd += ' && '
cmd += 'rename \'s/^0*/page-/\' *'
cmd += ' && '
cmd += 'rename \'s/.bin.png$/.png/\' *'
cmd += ' && '
cmd += 'cd -'
self.addTask(
'cleanup',
command=cmd,
dependencies=ocropus_nlbin_task,
memMb=mem_mb,
nCores=n_cores
)
class OCRWorkflow(WorkflowRunner):
def __init__(self, job, lang):
self.job = job
self.lang = lang
def workflow(self):
'''
' ##################################################
' # tesseract #
' ##################################################
'''
tesseract_tasks = []
n_cores = 1
mem_mb = min(512, self.getMemMb())
for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'images'))): # noqa
cmd = 'tesseract "{}" "{}"'.format(
os.path.join(self.job.tmp_dir, 'images', file),
os.path.join(self.job.tmp_dir, file[:-4])
)
cmd += ' -l "{}"'.format(self.lang)
cmd += ' hocr pdf txt'
cmd += ' || '
cmd += 'echo "${?}"'
task = self.addTask(
'tesseract_-_{}'.format(i),
command=cmd,
env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
memMb=mem_mb,
nCores=n_cores
)
tesseract_tasks.append(task)
'''
' ##################################################
' # move_files #
' ##################################################
'''
n_cores = 1
mem_mb = min(128, self.getMemMb())
for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
cmd = 'mv "{}/"*.{} "{}"'.format(
self.job.tmp_dir,
file_extension,
os.path.join(self.job.tmp_dir, file_extension)
)
self.addTask(
'move_{}_files'.format(file_extension),
command=cmd,
dependencies=tesseract_tasks,
memMb=mem_mb,
nCores=n_cores
)
cmd = 'mv "{}" "{}"'.format(
os.path.join(self.job.tmp_dir, 'images'),
os.path.join(self.job.output_dir)
)
self.addTask(
'move_image_files',
command=cmd,
dependencies=tesseract_tasks,
memMb=mem_mb,
nCores=n_cores
)
class CreateHOCRWorkflow(WorkflowRunner):
def __init__(self, job):
self.job = job
def workflow(self):
'''
' ##################################################
' # fix-hocr #
' ##################################################
'''
fix_hocr_tasks = []
n_cores = 1
mem_mb = min(256, self.getMemMb())
for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'hocr'))): # noqa
cmd = 'sed -i \'s>{}>images>g\' "{}"'.format(
os.path.join(self.job.tmp_dir, 'images'),
os.path.join(self.job.tmp_dir, 'hocr', file)
)
cmd += ' && '
cmd += 'sed -i \'s>ppageno [0-9]\\+>ppageno {}>g\' "{}"'.format(
file[5:-5],
os.path.join(self.job.tmp_dir, 'hocr', file)
)
cmd += ' && '
cmd += 'sed -i \'s>page_[0-9]\\+>page_{}>g\' "{}"'.format(
file[5:-5],
os.path.join(self.job.tmp_dir, 'hocr', file)
)
cmd += ' && '
cmd += 'sed -i \'s>block_[0-9]\\+>block_{}>g\' "{}"'.format(
file[5:-5],
os.path.join(self.job.tmp_dir, 'hocr', file)
)
cmd += ' && '
cmd += 'sed -i \'s>par_[0-9]\\+>par_{}>g\' "{}"'.format(
file[5:-5],
os.path.join(self.job.tmp_dir, 'hocr', file)
)
cmd += ' && '
cmd += 'sed -i \'s>line_[0-9]\\+>line_{}>g\' "{}"'.format(
file[5:-5],
os.path.join(self.job.tmp_dir, 'hocr', file)
)
cmd += ' && '
cmd += 'sed -i \'s>word_[0-9]\\+>word_{}>g\' "{}"'.format(
file[5:-5],
os.path.join(self.job.tmp_dir, 'hocr', file)
)
task = self.addTask(
'fix-hocr_-_{}'.format(i),
command=cmd,
memMb=mem_mb,
nCores=n_cores
)
fix_hocr_tasks.append(task)
'''
' ##################################################
' # hocr-combine #
' ##################################################
'''
n_cores = 1
mem_mb = min(512, self.getMemMb())
cmd = 'ls -dv "{}/"* > "{}"'.format(
os.path.join(self.job.tmp_dir, 'hocr'),
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
)
cmd += ' && '
cmd += 'hocr-combine "@{}"'.format(
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
)
cmd += ' --output-file "{}.hocr"'.format(
os.path.join(self.job.output_dir, self.job.name)
)
cmd += ' && '
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'hocr'))
self.addTask(
'hocr_combine',
command=cmd,
dependencies=fix_hocr_tasks,
memMb=mem_mb,
nCores=n_cores
)
class CreatePDFWorkflow(WorkflowRunner):
def __init__(self, job):
self.job = job
def workflow(self):
'''
' ##################################################
' # pdf_combine #
' ##################################################
'''
n_cores = min(2, self.getNCores())
mem_mb = min(n_cores * 256, self.getMemMb())
cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'pdf'))
cmd += ' | '
cmd += 'xargs gs'
cmd += ' -dBATCH'
cmd += ' -dNOPAUSE'
cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000)
cmd += ' -dNumRenderingThreads={}'.format(n_cores)
cmd += ' -dPDFSETTINGS=/ebook'
cmd += ' -dQUIET'
cmd += ' -sDEVICE=pdfwrite'
cmd += ' -sOutputFile="{}.pdf"'.format(
os.path.join(self.job.output_dir, self.job.name)
)
cmd += ' && '
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
class CreateTEIWorkflow(WorkflowRunner):
def __init__(self, job):
self.job = job
def workflow(self):
'''
' ##################################################
' # hocr2tei #
' ##################################################
'''
n_cores = 1
mem_mb = min(512, self.getMemMb())
cmd = 'hocr2tei "{}.hocr"'.format(
os.path.join(self.job.output_dir, self.job.name)
)
cmd += ' --output-file "{}.xml"'.format(
os.path.join(self.job.output_dir, self.job.name)
)
self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
class CreateTxtWorkflow(WorkflowRunner):
def __init__(self, job):
self.job = job
def workflow(self):
'''
' ##################################################
' # txt_combine #
' ##################################################
'''
n_cores = 1
mem_mb = min(512, self.getMemMb())
cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'txt'))
cmd += ' | '
cmd += 'xargs cat'
cmd += ' > '
cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name)) # noqa
cmd += ' && '
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
class MainWorkflow(WorkflowRunner):
def __init__(self, input_dir, lang, output_dir, binarize):
self.input_dir = input_dir
self.lang = lang
self.output_dir = output_dir
self.binarize = binarize
self.zip = zip
self.jobs = collect_jobs(self.input_dir, self.output_dir)
self.jobs = self.collect_jobs()
def collect_jobs(self):
jobs = []
for file in os.listdir(self.input_dir):
if os.path.isdir(os.path.join(self.input_dir, file)):
continue
if file.lower().endswith('.pdf'):
job = PipelineJob(
os.path.join(self.input_dir, file),
os.path.join(self.output_dir, file)
)
jobs.append(job)
return jobs
def workflow(self):
if not self.jobs:
return
'''
' ##################################################
' # setup output directory #
' ##################################################
'''
setup_output_directory_tasks = []
for i, job in enumerate(self.jobs):
cmd = 'mkdir -p "{}"'.format(job.page_dir)
lbl = 'setup_output_directory_-_{}'.format(i)
task = self.addTask(command=cmd, label=lbl)
setup_output_directory_tasks.append(task)
# Create output and temporary directories
for job in self.jobs:
os.mkdir(job.output_dir)
os.mkdir(job.tmp_dir)
os.mkdir(os.path.join(job.tmp_dir, 'hocr'))
os.mkdir(os.path.join(job.tmp_dir, 'pdf'))
os.mkdir(os.path.join(job.tmp_dir, 'images'))
os.mkdir(os.path.join(job.tmp_dir, 'txt'))
'''
' ##################################################
' # split input #
' # split-input #
' ##################################################
'''
split_input_tasks = []
n_cores = max(1, int(self.getNCores() / len(self.jobs)))
for i, job in enumerate(self.jobs):
input_file = job.file
output_file = '{}/page-%d.tif'.format(job.page_dir)
cmd = 'gs'
cmd += ' -dBATCH'
cmd += ' -dNOPAUSE'
cmd += ' -dNumRenderingThreads={}'.format(n_cores)
cmd += ' -dQUIET'
cmd += ' -r300'
cmd += ' -sDEVICE=tiff24nc'
cmd += ' -sCompression=lzw'
cmd += ' "-sOutputFile={}"'.format(output_file)
cmd += ' "{}"'.format(input_file)
deps = 'setup_output_directory_-_{}'.format(i)
lbl = 'split_input_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl,
nCores=n_cores)
split_input_tasks.append(task)
self.addWorkflowTask(
'split_input_-_{}'.format(i),
SplitInputWorkflow(job)
)
if self.binarize:
'''
' ##################################################
' # pre binarization #
' ##################################################
'''
pre_binarization_tasks = []
for i, job in enumerate(self.jobs):
input_file = os.path.join(job.output_dir, 'binarization_input_files.txt') # noqa
cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
deps = 'split_input_-_{}'.format(i)
lbl = 'pre_binarization_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
pre_binarization_tasks.append(task)
'''
' ##################################################
' # binarization #
' ##################################################
'''
binarization_tasks = []
n_cores = self.getNCores()
mem_mb = self.getMemMb()
for i, job in enumerate(self.jobs):
input_file = os.path.join(job.output_dir, 'binarization_input_files.txt') # noqa
cmd = 'ocropus-nlbin "@{}"'.format(input_file)
cmd += ' --nocheck'
cmd += ' --output "{}"'.format(job.page_dir)
cmd += ' --parallel "{}"'.format(n_cores)
deps = 'pre_binarization_-_{}'.format(i)
lbl = 'binarization_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl,
memMb=mem_mb, nCores=n_cores)
binarization_tasks.append(task)
'''
' ##################################################
' # post binarization #
' ##################################################
'''
post_binarization_tasks = []
for i, job in enumerate(self.jobs):
input_file = os.path.join(job.output_dir, 'binarization_input_files.txt') # noqa
cmd = 'rm "{}"'.format(input_file)
cmd += ' && '
cmd += 'cd "{}"'.format(job.page_dir)
cmd += ' && '
cmd += 'rm *.{nrm.png,tif}'
cmd += ' && '
cmd += 'rename \'s/^0*/page-/\' *'
cmd += ' && '
cmd += 'cd -'
deps = 'binarization_-_{}'.format(i)
lbl = 'post_binarization_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
post_binarization_tasks.append(task)
'''
' ##################################################
' # pre ocr #
' ##################################################
'''
pre_ocr_tasks = []
for i, job in enumerate(self.jobs):
input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
deps = 'post_binarization_-_{}'.format(i) if self.binarize else 'split_input_-_{}'.format(i) # noqa
lbl = 'pre_ocr_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
pre_ocr_tasks.append(task)
self.addWorkflowTask(
'binarization_-_{}'.format(i),
BinarizationWorkflow(job),
dependencies='split_input_-_{}'.format(i)
)
'''
' ##################################################
@@ -160,175 +415,117 @@ class OCRPipeline(WorkflowRunner):
' ##################################################
'''
ocr_tasks = []
n_cores = min(4, self.getNCores())
mem_mb = min(n_cores * 2048, self.getMemMb())
for i, job in enumerate(self.jobs):
input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
output_file_base = os.path.join(job.output_dir, job.name)
cmd = 'tesseract "{}" "{}"'.format(input_file, output_file_base)
cmd += ' -l "{}"'.format(self.lang)
cmd += ' hocr pdf txt'
deps = 'pre_ocr_-_{}'.format(i)
lbl = 'ocr_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps,
env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
label=lbl, memMb=mem_mb, nCores=n_cores)
if self.binarize:
deps = 'binarization_-_{}'.format(i)
else:
deps = 'split_input_-_{}'.format(i)
task = self.addWorkflowTask(
'ocr_-_{}'.format(i),
OCRWorkflow(job, self.lang),
dependencies=deps
)
ocr_tasks.append(task)
'''
' ##################################################
' # post ocr #
' # create-hocr #
' ##################################################
'''
post_ocr_tasks = []
create_hocr_tasks = []
for i, job in enumerate(self.jobs):
input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
output_file_base = os.path.join(job.output_dir, job.name)
cmd = 'rm "{}"'.format(input_file)
cmd += ' && '
cmd += 'sed -i \'s+{}+pages+g\' "{}.hocr"'.format(job.page_dir, output_file_base) # noqa
deps = 'ocr_-_{}'.format(i)
lbl = 'post_ocr_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
post_ocr_tasks.append(task)
task = self.addWorkflowTask(
'create_hocr_-_{}'.format(i),
CreateHOCRWorkflow(job),
dependencies='ocr_-_{}'.format(i)
)
create_hocr_tasks.append(task)
'''
' ##################################################
' # hocr to tei #
' # create-pdf #
' ##################################################
'''
hocr_to_tei_tasks = []
create_pdf_tasks = []
for i, job in enumerate(self.jobs):
output_file_base = os.path.join(job.output_dir, job.name)
cmd = 'hocrtotei "{}.hocr" "{}.xml"'.format(output_file_base, output_file_base) # noqa
deps = 'post_ocr_-_{}'.format(i)
lbl = 'hocr_to_tei_-_{}'.format(i)
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
hocr_to_tei_tasks.append(task)
task = self.addWorkflowTask(
'create_pdf_-_{}'.format(i),
CreatePDFWorkflow(job),
dependencies='ocr_-_{}'.format(i)
)
create_pdf_tasks.append(task)
'''
' ##################################################
' # zip creation #
' # create-tei #
' ##################################################
'''
zip_creation_tasks = []
if self.zip is not None:
# zip all files
cmd = 'cd "{}"'.format(self.output_dir)
cmd += ' && '
cmd += 'zip'
cmd += ' -r'
cmd += ' "{}.all.zip" .'.format(self.zip)
cmd += ' -x "pyflow.data*" "*tmp*"'
cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif') # noqa
cmd += ' && '
cmd += 'cd -'
deps = hocr_to_tei_tasks
lbl = 'zip_creation_-_all'
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
zip_creation_tasks.append(task)
# zip PDF files
cmd = 'cd "{}"'.format(self.output_dir)
cmd += ' && '
cmd += 'zip'
cmd += ' -r'
cmd += ' "{}.pdf.zip" .'.format(self.zip)
cmd += ' -x "pyflow.data*" "*tmp*"'
cmd += ' -i "*.pdf"'
cmd += ' && '
cmd += 'cd -'
deps = ocr_tasks
lbl = 'zip_creation_-_pdf'
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
zip_creation_tasks.append(task)
# zip TXT files
cmd = 'cd "{}"'.format(self.output_dir)
cmd += ' && '
cmd += 'zip'
cmd += ' -r'
cmd += ' "{}.txt.zip" .'.format(self.zip)
cmd += ' -x "pyflow.data*" "*tmp*"'
cmd += ' -i "*.txt"'
cmd += ' && '
cmd += 'cd -'
deps = ocr_tasks
lbl = 'zip_creation_-_txt'
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
zip_creation_tasks.append(task)
# zip XML files
cmd = 'cd "{}"'.format(self.output_dir)
cmd += ' && '
cmd += 'zip'
cmd += ' -r'
cmd += ' "{}.xml.zip" .'.format(self.zip)
cmd += ' -x "pyflow.data*" "*tmp*"'
cmd += ' -i "*.xml"'
cmd += ' && '
cmd += 'cd -'
deps = hocr_to_tei_tasks
lbl = 'zip_creation_-_xml'
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
zip_creation_tasks.append(task)
# zip PoCo bundles
cmd = 'cd "{}"'.format(self.output_dir)
cmd += ' && '
cmd += 'zip'
cmd += ' -r'
cmd += ' "{}.poco.zip" .'.format(self.zip)
cmd += ' -x "pyflow.data*" "*tmp*"'
cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif') # noqa
cmd += ' && '
cmd += 'cd -'
deps = post_ocr_tasks
lbl = 'zip_creation_-_poco'
task = self.addTask(command=cmd, dependencies=deps, label=lbl)
zip_creation_tasks.append(task)
create_tei_tasks = []
for i, job in enumerate(self.jobs):
task = self.addWorkflowTask(
'create_tei_-_{}'.format(i),
CreateTEIWorkflow(job),
dependencies='create_hocr_-_{}'.format(i)
)
create_tei_tasks.append(task)
'''
' ##################################################
' # create-txt #
' ##################################################
'''
create_txt_tasks = []
for i, job in enumerate(self.jobs):
task = self.addWorkflowTask(
'create_txt_-_{}'.format(i),
CreateTxtWorkflow(job),
dependencies='ocr_-_{}'.format(i)
)
create_txt_tasks.append(task)
def collect_jobs(input_dir, output_dir):
jobs = []
for file in os.listdir(input_dir):
if os.path.isdir(os.path.join(input_dir, file)):
continue
if file.lower().endswith('.pdf'):
job = OCRPipelineJob(os.path.join(input_dir, file),
os.path.join(output_dir, file))
jobs.append(job)
return jobs
# Remove temporary directories when all tasks are completed
self.waitForTasks()
for job in self.jobs:
os.rmdir(job.tmp_dir)
def parse_args():
parser = ArgumentParser(description='OCR pipeline for PDF file processing',
prog='OCR pipeline')
parser.add_argument('-i', '--input-dir',
help='Input directory',
required=True)
parser.add_argument('-o', '--output-dir',
help='Output directory',
required=True)
parser.add_argument('-l', '--language',
choices=list(map(lambda x: x[:-12], filter(lambda x: x.endswith('.traineddata'), os.listdir('/usr/local/share/tessdata')))), # noqa
help='Language of the input '
'(3-character ISO 639-2 language codes)',
required=True)
parser.add_argument('--binarize',
action='store_true',
help='Add binarization as a preprocessing step')
parser.add_argument('--log-dir',
help='Logging directory')
parser.add_argument('--mem-mb',
help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))', # noqa
type=int)
parser.add_argument('--n-cores',
default=min(4, multiprocessing.cpu_count()),
help='Number of CPU threads to be used (Default: min(4, number of CPUs))', # noqa
type=int)
parser.add_argument('--zip',
help='Create one zip file per filetype')
parser.add_argument('-v', '--version',
action='version',
help='Returns the current version of the OCR pipeline',
version='%(prog)s {}'.format(__version__))
parser = ArgumentParser(description='OCR pipeline for PDF file processing')
parser.add_argument(
'-i', '--input-dir', help='Input directory', required=True)
parser.add_argument(
'-o', '--output-dir', help='Output directory', required=True)
parser.add_argument(
'-l', '--language',
choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata')
if x.endswith('.traineddata') and len(x) > 12],
help='Language of the input (3-character ISO 639-2 language codes)',
required=True
)
parser.add_argument(
'--binarize',
action='store_true',
help='Add binarization as a preprocessing step'
)
parser.add_argument(
'--log-dir', help='Logging directory (Default: --output-dir)')
parser.add_argument(
'--mem-mb',
help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))', # noqa
type=int
)
parser.add_argument(
'--n-cores',
default=min(4, multiprocessing.cpu_count()),
help='Number of CPU threads to be used (Default: min(4, CPU count))',
type=int
)
parser.add_argument(
'-v', '--version',
action='version',
help='Returns the current version of the OCR pipeline',
version='%(prog)s {}'.format(__version__)
)
args = parser.parse_args()
# Set some tricky default values and check for insufficient input
@@ -338,20 +535,18 @@ def parse_args():
raise Exception('--n-cores must be greater or equal 1')
if args.mem_mb is None:
max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
if args.mem_mb < 2048:
raise Exception('--mem-mb must be greater or equal 2048')
if args.zip is not None and args.zip.lower().endswith('.zip'):
# Remove .zip file extension if provided
args.zip = args.zip[:-4]
args.zip = args.zip if args.zip else 'output'
args.mem_mb = min(args.n_cores * 512, max_mem_mb)
if args.mem_mb < 512:
raise Exception('--mem-mb must be greater or equal 512')
return args
def main():
args = parse_args()
ocr_pipeline = OCRPipeline(args.input_dir, args.language, args.output_dir, args.binarize, args.zip) # noqa
retval = ocr_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) # noqa
ocr_pipeline = MainWorkflow(
args.input_dir, args.language, args.output_dir, args.binarize)
retval = ocr_pipeline.run(
dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
sys.exit(retval)


@@ -6,9 +6,10 @@ import os
import subprocess
import sys
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0'
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0'
CONTAINER_INPUT_DIR = '/input'
CONTAINER_OUTPUT_DIR = '/output'
CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
CONTAINER_LOG_DIR = '/logs'
UID = str(os.getuid())
GID = str(os.getgid())
@@ -16,20 +17,25 @@ GID = str(os.getgid())
parser = ArgumentParser(add_help=False)
parser.add_argument('-i', '--input-dir')
parser.add_argument('-o', '--output-dir')
parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+')
parser.add_argument('--log-dir')
args, remaining_args = parser.parse_known_args()
cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
if args.input_dir is not None:
mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR
mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
cmd += ['-v', mapping]
remaining_args += ['-i', CONTAINER_INPUT_DIR]
if args.output_dir is not None:
mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR
mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
cmd += ['-v', mapping]
remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
if args.models is not None:
for model in args.models:
mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}' # noqa
cmd += ['-v', mapping]
if args.log_dir is not None:
mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR
mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
cmd += ['-v', mapping]
remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
cmd.append(CONTAINER_IMAGE)
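For illustration, a wrapper call such as `ocr -i input -o output -m models/eng.traineddata -l eng` (hypothetical paths) expands to roughly the following Docker invocation:
```bash
docker run --rm -it -u $(id -u):$(id -g) \
    -v /abs/path/to/input:/input \
    -v /abs/path/to/output:/output \
    -v /abs/path/to/models/eng.traineddata:/usr/local/share/tessdata/eng.traineddata \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0 \
    -i /input -o /output -l eng
```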