Update to Tesseract 5.0.0, Set version 0.1.0

Patrick Jentsch 2022-01-04 11:42:55 +01:00
parent a0760487ae
commit e1b78b6ba4
7 changed files with 574 additions and 338 deletions

Dockerfile

@@ -9,8 +9,14 @@ ENV LANG=C.UTF-8
 RUN apt-get update \
  && apt-get install --no-install-recommends --yes \
-    wget \
+    ghostscript \
+    procps \
+    python3.7 \
+    python3-pip \
+    rename \
+    wget \
+    zip \
+ && python3 -m pip install lxml

 # Install the OCR pipeline and it's dependencies #
 ## Install pyFlow ##
@ -43,7 +49,7 @@ RUN wget --no-check-certificate --quiet \
## Install Tesseract OCR ## ## Install Tesseract OCR ##
ENV TESSERACT_VERSION=4.1.1 ENV TESSERACT_VERSION=5.0.0
RUN wget --no-check-certificate --quiet \ RUN wget --no-check-certificate --quiet \
"https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \ "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
&& tar -xzf "${TESSERACT_VERSION}.tar.gz" \ && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
@@ -61,37 +67,20 @@ RUN wget --no-check-certificate --quiet \
     pkg-config \
     zlib1g-dev \
  && ./autogen.sh \
- && ./configure \
+ && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
  && make \
  && make install \
  && ldconfig \
  && cd - > /dev/null \
  && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
-ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
-ENV TESSDATA_BEST_VERSION=4.1.0
-RUN wget --no-check-certificate --quiet \
- "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
- && tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
- && for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \
- && rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
-
-## Further dependencies ##
-RUN apt-get install --no-install-recommends --yes \
-    procps \
-    ghostscript \
-    python3.7 \
-    rename \
-    zip
-
-## Install Pipeline ##
-COPY hocrtotei ocr /usr/local/bin/
 RUN rm -r /var/lib/apt/lists/*
+
+## Install Pipeline ##
+COPY hocr2tei hocr-combine ocr /usr/local/bin/

 ENTRYPOINT ["ocr"]
 CMD ["--help"]

LICENSE (new file)

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

@@ -1,6 +1,6 @@
 # OCR - Optical Character Recognition
-This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided.
+This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
 ## Software used in this pipeline implementation
@@ -8,37 +8,26 @@ This software implements a heavily parallelized pipeline to recognize text in PD
 - Software from Debian Buster's free repositories
 - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
 - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
-- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
-- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
+- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0

-## Use this image
+## Installation

-1. Create input and output directories for the pipeline.
-``` bash
-mkdir -p /<my_data_location>/input /<my_data_location>/output
-```
+1. Install Docker and Python 3.
+2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
+3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0 ocr`
+4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
+5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
+6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.

-2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
+## Use the Pipeline

+1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
+2. Clear your `/<my_data_location>/output` directory.
 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
-```
-# Option one: Use the wrapper script
-## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/development/wrapper/ocr, make it executeable and add it to your ${PATH}
-cd /<my_data_location>
-ocr -i input -l <language_code> -o output <optional_pipeline_arguments>
-# or
-# Option two: Classic Docker style
-docker run \
-    --rm \
-    -it \
-    -u $(id -u $USER):$(id -g $USER) \
-    -v /<my_data_location>/input:/input \
-    -v /<my_data_location>/output:/output \
-    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:development \
-    -i /ocr_pipeline/input \
-    -l <language_code> \
-    -o /ocr_pipeline/output \
-    <optional_pipeline_arguments>
-```
+```bash
+cd /<my_data_location>
+ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments>
+# or
+ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments>
+```
 4. Check your results in the `/<my_data_location>/output` directory.
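
The new README drops the "classic Docker style" example, but the equivalent still works. A sketch assuming a single, hypothetically named model file `eng.traineddata`; the mounts mirror exactly what `wrapper/ocr` sets up:

```bash
# Run the pipeline container directly, mounting input, output and one model
# into the container's tessdata directory.
docker run --rm -it -u $(id -u $USER):$(id -g $USER) \
    -v /<my_data_location>/input:/input \
    -v /<my_data_location>/output:/output \
    -v /<my_data_location>/models/eng.traineddata:/usr/local/share/tessdata/eng.traineddata \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0 \
    -i /input -o /output -l eng <optional_pipeline_arguments>
```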

hocr-combine (new file)

@@ -0,0 +1,35 @@
#!/usr/bin/env python3.7
# coding=utf-8

"""Combine multiple hOCR files."""

from argparse import ArgumentParser
from lxml import html

parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument('file', help='Input file(s)', nargs='+')
parser.add_argument('-o', '--output-file', help='Output file', required=True)
args = parser.parse_args()

# Collect input files; arguments starting with "@" are list files whose
# lines are themselves input file paths.
files = []
for file in args.file:
    if file.startswith('@'):
        with open(file[1:], 'r') as f:
            files += [x for x in f.read().split("\n") if x != '']
    else:
        files.append(file)

if len(files) == 0:
    exit(1)

# Use the first file as the base document and append all further ocr_page
# divs to its body.
hocr = html.parse(files[0])
hocr_body = hocr.find('body')
for file in files[1:]:
    for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
        hocr_body.append(ocr_page)

with open(args.output_file, 'wb') as f:
    hocr.write(f, encoding='UTF-8', method='html')
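
Two ways to invoke it, mirroring how the pipeline's `hocr_combine` task builds an `@`-prefixed list file with `ls -dv` (file names are illustrative):

```bash
# Pass page-level hOCR files directly...
hocr-combine page-1.hocr page-2.hocr --output-file combined.hocr
# ...or collect them into a list file and reference it with "@":
ls -dv hocr/* > inputs.txt
hocr-combine @inputs.txt --output-file combined.hocr
```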

hocrtotei → hocr2tei (Executable file → Normal file)

@@ -3,16 +3,18 @@
 """"Convert hOCR to TEI XML."""
-from xml.sax.saxutils import escape
 from argparse import ArgumentParser
+from lxml import html
+from xml.sax.saxutils import escape
 import re
-import xml.etree.ElementTree as ET

 parser = ArgumentParser(description='Convert hOCR to TEI XML.')
-parser.add_argument('input', metavar='Path to hOCR input file')
-parser.add_argument('output', metavar='Path to TEI output file')
+parser.add_argument('file', help='Input file')
+parser.add_argument('-o', '--output-file', help='Output file', required=True)
 args = parser.parse_args()

 tei = ''
 tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
 tei += '  <teiHeader>\n'
@@ -30,28 +32,27 @@ tei += '  </fileDesc>\n'
 tei += '  </teiHeader>\n'
 tei += '  <text>\n'
 tei += '    <body>\n'
-# Conversion start
-hocr = ET.parse(args.input)
-for page in hocr.findall('.//*[@class="ocr_page"]'):
-    page_properties = page.attrib.get('title')
-    facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1)
-    page_number = re.search(r'ppageno (\d+)', page_properties).group(1)
-    tei += '      <pb facs="{}" n="{}"/>\n'.format(facsimile, page_number)
-    for para in page.findall('.//*[@class="ocr_par"]'):
+hocr = html.parse(args.file)
+for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
+    ocr_page_title_attrib = ocr_page.attrib.get('title')
+    facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
+    page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
+    tei += f'      <pb facs="{facsimile}" n="{page_number}"/>\n'
+    for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
         tei += '      <p>\n'
-        for line in para.findall('.//*[@class="ocr_line"]'):
+        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
             tei += '        <lb/>'
             indent = ''
-            for word in line.findall('.//*[@class="ocrx_word"]'):
-                if word.text is not None:
-                    tei += indent + escape(word.text.strip())
+            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
+                if ocrx_word.text is not None:
+                    tei += indent + escape(ocrx_word.text)
                     indent = ' '
             tei += '\n'
         tei += '      </p>\n'
-# Conversion end
 tei += '    </body>\n'
 tei += '  </text>\n'
 tei += '</TEI>\n'
-with open(args.output, 'w') as tei_file:
-    tei_file.write(tei)
+
+with open(args.output_file, 'w') as f:
+    f.write(tei)
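
Standalone usage, matching the invocation the pipeline's `CreateTEIWorkflow` issues (file names are illustrative):

```bash
# Convert a combined hOCR file to TEI XML.
hocr2tei document.hocr --output-file document.xml
```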

ocr

@@ -1,11 +1,9 @@
 #!/usr/bin/env python2.7
 # coding=utf-8

-"""OCR pipeline for PDF file processing."""
+''' OCR pipeline for PDF file processing. '''
+__version__ = '0.1.0'

-__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
-             'Stephan Porada <porada@posteo.de>'
-__version__ = '1.0.0'

 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
@@ -14,145 +12,402 @@ import os
 import sys


-class OCRPipelineJob:
-    """An OCR pipeline job class
+class PipelineJob:
+    '''
+    OCR pipeline job class.

     Each input file of the pipeline is represented as an OCR pipeline job,
     which holds all necessary information for the pipeline to process it.

     Arguments:
     file -- Path to the file
-    output_dir -- Path to a directory, where job results a stored
-    """
+    output_dir -- Path to a directory, where job results are stored
+    '''

     def __init__(self, file, output_dir):
         self.file = file
-        self.name = os.path.basename(file).rsplit('.', 1)[0]
+        self.name = os.path.basename(file)[:-4]
         self.output_dir = output_dir
-        self.page_dir = os.path.join(output_dir, 'pages')
+        self.tmp_dir = os.path.join(output_dir, 'tmp')

+class SplitInputWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # gs                                             #
+        ' ##################################################
+        '''
+        n_cores = min(2, self.getNCores())
+        mem_mb = min(n_cores * 512, self.getMemMb())
+        cmd = 'gs'
+        cmd += ' -dBATCH'
+        cmd += ' -dNOPAUSE'
+        cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000)
+        cmd += ' -dNumRenderingThreads={}'.format(n_cores)
+        cmd += ' -dQUIET'
+        cmd += ' -r300'
+        cmd += ' -sDEVICE=png16m'
+        cmd += ' -sOutputFile="{}/page-%d.png"'.format(
+            os.path.join(self.job.tmp_dir, 'images')
+        )
+        cmd += ' "{}"'.format(self.job.file)
+        self.addTask(
+            'gs',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class BinarizationWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # ocropus-nlbin                                  #
+        ' ##################################################
+        '''
+        # TODO: Update to newer ocropus-nlbin and start one task per page
+        n_cores = self.getNCores()
+        mem_mb = min(512 * n_cores, self.getMemMb())
+        cmd = 'ls -dv "{}/"* > "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images'),
+            os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
+        )
+        cmd += ' && '
+        cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa
+        cmd += ' --nocheck'
+        cmd += ' --output "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images'))
+        cmd += ' --parallel "{}"'.format(n_cores)
+        cmd += ' && '
+        cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa
+        ocropus_nlbin_task = self.addTask(
+            'ocropus_nlbin',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+        '''
+        ' ##################################################
+        ' # cleanup                                        #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(128, self.getMemMb())
+        cmd = 'cd "{}"'.format(os.path.join(self.job.tmp_dir, 'images'))
+        cmd += ' && '
+        cmd += 'mkdir tmp'
+        cmd += ' && '
+        cmd += 'mv *.bin.png tmp'
+        cmd += ' && '
+        cmd += 'rm *.png'
+        cmd += ' && '
+        cmd += 'mv tmp/* .'
+        cmd += ' && '
+        cmd += 'rmdir tmp'
+        cmd += ' && '
+        cmd += 'rename \'s/^0*/page-/\' *'
+        cmd += ' && '
+        cmd += 'rename \'s/.bin.png$/.png/\' *'
+        cmd += ' && '
+        cmd += 'cd -'
+        self.addTask(
+            'cleanup',
+            command=cmd,
+            dependencies=ocropus_nlbin_task,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class OCRWorkflow(WorkflowRunner):
+    def __init__(self, job, lang):
+        self.job = job
+        self.lang = lang
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # tesseract                                      #
+        ' ##################################################
+        '''
+        tesseract_tasks = []
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'images'))):  # noqa
+            cmd = 'tesseract "{}" "{}"'.format(
+                os.path.join(self.job.tmp_dir, 'images', file),
+                os.path.join(self.job.tmp_dir, file[:-4])
+            )
+            cmd += ' -l "{}"'.format(self.lang)
+            cmd += ' hocr pdf txt'
+            cmd += ' || '
+            cmd += 'echo "${?}"'
+            task = self.addTask(
+                'tesseract_-_{}'.format(i),
+                command=cmd,
+                env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
+                memMb=mem_mb,
+                nCores=n_cores
+            )
+            tesseract_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # move_files                                     #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(128, self.getMemMb())
+        for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
+            cmd = 'mv "{}/"*.{} "{}"'.format(
+                self.job.tmp_dir,
+                file_extension,
+                os.path.join(self.job.tmp_dir, file_extension)
+            )
+            self.addTask(
+                'move_{}_files'.format(file_extension),
+                command=cmd,
+                dependencies=tesseract_tasks,
+                memMb=mem_mb,
+                nCores=n_cores
+            )
+        cmd = 'mv "{}" "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images'),
+            os.path.join(self.job.output_dir)
+        )
+        self.addTask(
+            'move_image_files',
+            command=cmd,
+            dependencies=tesseract_tasks,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class CreateHOCRWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # fix-hocr                                       #
+        ' ##################################################
+        '''
+        fix_hocr_tasks = []
+        n_cores = 1
+        mem_mb = min(256, self.getMemMb())
+        for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'hocr'))):  # noqa
+            cmd = 'sed -i \'s>{}>images>g\' "{}"'.format(
+                os.path.join(self.job.tmp_dir, 'images'),
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>ppageno [0-9]\\+>ppageno {}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>page_[0-9]\\+>page_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>block_[0-9]\\+>block_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>par_[0-9]\\+>par_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>line_[0-9]\\+>line_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>word_[0-9]\\+>word_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            task = self.addTask(
+                'fix-hocr_-_{}'.format(i),
+                command=cmd,
+                memMb=mem_mb,
+                nCores=n_cores
+            )
+            fix_hocr_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # hocr-combine                                   #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        cmd = 'ls -dv "{}/"* > "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'hocr'),
+            os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
+        )
+        cmd += ' && '
+        cmd += 'hocr-combine "@{}"'.format(
+            os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
+        )
+        cmd += ' --output-file "{}.hocr"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        cmd += ' && '
+        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'hocr'))
+        self.addTask(
+            'hocr_combine',
+            command=cmd,
+            dependencies=fix_hocr_tasks,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class CreatePDFWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # pdf_combine                                    #
+        ' ##################################################
+        '''
+        n_cores = min(2, self.getNCores())
+        mem_mb = min(n_cores * 256, self.getMemMb())
+        cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'pdf'))
+        cmd += ' | '
+        cmd += 'xargs gs'
+        cmd += ' -dBATCH'
+        cmd += ' -dNOPAUSE'
+        cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000)
+        cmd += ' -dNumRenderingThreads={}'.format(n_cores)
+        cmd += ' -dPDFSETTINGS=/ebook'
+        cmd += ' -dQUIET'
+        cmd += ' -sDEVICE=pdfwrite'
+        cmd += ' -sOutputFile="{}.pdf"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        cmd += ' && '
+        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
+        self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
+
+
+class CreateTEIWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # hocr2tei                                       #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        cmd = 'hocr2tei "{}.hocr"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        cmd += ' --output-file "{}.xml"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
+
+
+class CreateTxtWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # txt_combine                                    #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'txt'))
+        cmd += ' | '
+        cmd += 'xargs cat'
+        cmd += ' > '
+        cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name))  # noqa
+        cmd += ' && '
+        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
+        self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores)


-class OCRPipeline(WorkflowRunner):
-    def __init__(self, input_dir, lang, output_dir, binarize, zip):
+class MainWorkflow(WorkflowRunner):
+    def __init__(self, input_dir, lang, output_dir, binarize):
         self.input_dir = input_dir
         self.lang = lang
         self.output_dir = output_dir
         self.binarize = binarize
-        self.zip = zip
-        self.jobs = collect_jobs(self.input_dir, self.output_dir)
+        self.jobs = self.collect_jobs()
+
+    def collect_jobs(self):
+        jobs = []
+        for file in os.listdir(self.input_dir):
+            if os.path.isdir(os.path.join(self.input_dir, file)):
+                continue
+            if file.lower().endswith('.pdf'):
+                job = PipelineJob(
+                    os.path.join(self.input_dir, file),
+                    os.path.join(self.output_dir, file)
+                )
+                jobs.append(job)
+        return jobs
     def workflow(self):
         if not self.jobs:
             return

-        '''
-        ' ##################################################
-        ' # setup output directory                         #
-        ' ##################################################
-        '''
-        setup_output_directory_tasks = []
-        for i, job in enumerate(self.jobs):
-            cmd = 'mkdir -p "{}"'.format(job.page_dir)
-            lbl = 'setup_output_directory_-_{}'.format(i)
-            task = self.addTask(command=cmd, label=lbl)
-            setup_output_directory_tasks.append(task)
+        # Create output and temporary directories
+        for job in self.jobs:
+            os.mkdir(job.output_dir)
+            os.mkdir(job.tmp_dir)
+            os.mkdir(os.path.join(job.tmp_dir, 'hocr'))
+            os.mkdir(os.path.join(job.tmp_dir, 'pdf'))
+            os.mkdir(os.path.join(job.tmp_dir, 'images'))
+            os.mkdir(os.path.join(job.tmp_dir, 'txt'))

         '''
         ' ##################################################
-        ' # split input                                    #
+        ' # split-input                                    #
         ' ##################################################
         '''
-        split_input_tasks = []
-        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
         for i, job in enumerate(self.jobs):
-            input_file = job.file
-            output_file = '{}/page-%d.tif'.format(job.page_dir)
-            cmd = 'gs'
-            cmd += ' -dBATCH'
-            cmd += ' -dNOPAUSE'
-            cmd += ' -dNumRenderingThreads={}'.format(n_cores)
-            cmd += ' -dQUIET'
-            cmd += ' -r300'
-            cmd += ' -sDEVICE=tiff24nc'
-            cmd += ' -sCompression=lzw'
-            cmd += ' "-sOutputFile={}"'.format(output_file)
-            cmd += ' "{}"'.format(input_file)
-            deps = 'setup_output_directory_-_{}'.format(i)
-            lbl = 'split_input_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl,
-                                nCores=n_cores)
-            split_input_tasks.append(task)
+            self.addWorkflowTask(
+                'split_input_-_{}'.format(i),
+                SplitInputWorkflow(job)
+            )

         if self.binarize:
-            '''
-            ' ##################################################
-            ' # pre binarization                               #
-            ' ##################################################
-            '''
-            pre_binarization_tasks = []
-            for i, job in enumerate(self.jobs):
-                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
-                cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
-                deps = 'split_input_-_{}'.format(i)
-                lbl = 'pre_binarization_-_{}'.format(i)
-                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-                pre_binarization_tasks.append(task)

             '''
             ' ##################################################
             ' # binarization                                   #
             ' ##################################################
             '''
-            binarization_tasks = []
-            n_cores = self.getNCores()
-            mem_mb = self.getMemMb()
             for i, job in enumerate(self.jobs):
-                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
-                cmd = 'ocropus-nlbin "@{}"'.format(input_file)
-                cmd += ' --nocheck'
-                cmd += ' --output "{}"'.format(job.page_dir)
-                cmd += ' --parallel "{}"'.format(n_cores)
-                deps = 'pre_binarization_-_{}'.format(i)
-                lbl = 'binarization_-_{}'.format(i)
-                task = self.addTask(command=cmd, dependencies=deps, label=lbl,
-                                    memMb=mem_mb, nCores=n_cores)
-                binarization_tasks.append(task)
+                self.addWorkflowTask(
+                    'binarization_-_{}'.format(i),
+                    BinarizationWorkflow(job),
+                    dependencies='split_input_-_{}'.format(i)
+                )

-            '''
-            ' ##################################################
-            ' # post binarization                              #
-            ' ##################################################
-            '''
-            post_binarization_tasks = []
-            for i, job in enumerate(self.jobs):
-                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
-                cmd = 'rm "{}"'.format(input_file)
-                cmd += ' && '
-                cmd += 'cd "{}"'.format(job.page_dir)
-                cmd += ' && '
-                cmd += 'rm *.{nrm.png,tif}'
-                cmd += ' && '
-                cmd += 'rename \'s/^0*/page-/\' *'
-                cmd += ' && '
-                cmd += 'cd -'
-                deps = 'binarization_-_{}'.format(i)
-                lbl = 'post_binarization_-_{}'.format(i)
-                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-                post_binarization_tasks.append(task)
-
-        '''
-        ' ##################################################
-        ' # pre ocr                                        #
-        ' ##################################################
-        '''
-        pre_ocr_tasks = []
-        for i, job in enumerate(self.jobs):
-            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
-            cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
-            deps = 'post_binarization_-_{}'.format(i) if self.binarize else 'split_input_-_{}'.format(i)  # noqa
-            lbl = 'pre_ocr_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            pre_ocr_tasks.append(task)

         '''
         ' ##################################################
@@ -160,175 +415,117 @@ class OCRPipeline(WorkflowRunner):
         ' ##################################################
         '''
         ocr_tasks = []
-        n_cores = min(4, self.getNCores())
-        mem_mb = min(n_cores * 2048, self.getMemMb())
         for i, job in enumerate(self.jobs):
-            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
-            output_file_base = os.path.join(job.output_dir, job.name)
-            cmd = 'tesseract "{}" "{}"'.format(input_file, output_file_base)
-            cmd += ' -l "{}"'.format(self.lang)
-            cmd += ' hocr pdf txt'
-            deps = 'pre_ocr_-_{}'.format(i)
-            lbl = 'ocr_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps,
-                                env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
-                                label=lbl, memMb=mem_mb, nCores=n_cores)
+            if self.binarize:
+                deps = 'binarization_-_{}'.format(i)
+            else:
+                deps = 'split_input_-_{}'.format(i)
+            task = self.addWorkflowTask(
+                'ocr_-_{}'.format(i),
+                OCRWorkflow(job, self.lang),
+                dependencies=deps
+            )
             ocr_tasks.append(task)

         '''
         ' ##################################################
-        ' # post ocr                                       #
+        ' # create-hocr                                    #
         ' ##################################################
         '''
-        post_ocr_tasks = []
+        create_hocr_tasks = []
         for i, job in enumerate(self.jobs):
-            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
-            output_file_base = os.path.join(job.output_dir, job.name)
-            cmd = 'rm "{}"'.format(input_file)
-            cmd += ' && '
-            cmd += 'sed -i \'s+{}+pages+g\' "{}.hocr"'.format(job.page_dir, output_file_base)  # noqa
-            deps = 'ocr_-_{}'.format(i)
-            lbl = 'post_ocr_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            post_ocr_tasks.append(task)
+            task = self.addWorkflowTask(
+                'create_hocr_-_{}'.format(i),
+                CreateHOCRWorkflow(job),
+                dependencies='ocr_-_{}'.format(i)
+            )
+            create_hocr_tasks.append(task)

         '''
         ' ##################################################
-        ' # hocr to tei                                    #
+        ' # create-pdf                                     #
         ' ##################################################
         '''
-        hocr_to_tei_tasks = []
+        create_pdf_tasks = []
         for i, job in enumerate(self.jobs):
-            output_file_base = os.path.join(job.output_dir, job.name)
-            cmd = 'hocrtotei "{}.hocr" "{}.xml"'.format(output_file_base, output_file_base)  # noqa
-            deps = 'post_ocr_-_{}'.format(i)
-            lbl = 'hocr_to_tei_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            hocr_to_tei_tasks.append(task)
+            task = self.addWorkflowTask(
+                'create_pdf_-_{}'.format(i),
+                CreatePDFWorkflow(job),
+                dependencies='ocr_-_{}'.format(i)
+            )
+            create_pdf_tasks.append(task)

         '''
         ' ##################################################
-        ' # zip creation                                   #
+        ' # create-tei                                     #
         ' ##################################################
         '''
-        zip_creation_tasks = []
-        if self.zip is not None:
-            # zip all files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.all.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = hocr_to_tei_tasks
-            lbl = 'zip_creation_-_all'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip PDF files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.pdf.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.pdf"'
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = ocr_tasks
-            lbl = 'zip_creation_-_pdf'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip TXT files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.txt.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.txt"'
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = ocr_tasks
-            lbl = 'zip_creation_-_txt'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip XML files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.xml.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.xml"'
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = hocr_to_tei_tasks
-            lbl = 'zip_creation_-_xml'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip PoCo bundles
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.poco.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = post_ocr_tasks
-            lbl = 'zip_creation_-_poco'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
+        create_tei_tasks = []
+        for i, job in enumerate(self.jobs):
+            task = self.addWorkflowTask(
+                'create_tei_-_{}'.format(i),
+                CreateTEIWorkflow(job),
+                dependencies='create_hocr_-_{}'.format(i)
+            )
+            create_tei_tasks.append(task)

+        '''
+        ' ##################################################
+        ' # create-txt                                     #
+        ' ##################################################
+        '''
+        create_txt_tasks = []
+        for i, job in enumerate(self.jobs):
+            task = self.addWorkflowTask(
+                'create_txt_-_{}'.format(i),
+                CreateTxtWorkflow(job),
+                dependencies='ocr_-_{}'.format(i)
+            )
+            create_txt_tasks.append(task)
+        # Remove temporary directories when all tasks are completed
+        self.waitForTasks()
+        for job in self.jobs:
+            os.rmdir(job.tmp_dir)

-def collect_jobs(input_dir, output_dir):
-    jobs = []
-    for file in os.listdir(input_dir):
-        if os.path.isdir(os.path.join(input_dir, file)):
-            continue
-        if file.lower().endswith('.pdf'):
-            job = OCRPipelineJob(os.path.join(input_dir, file),
-                                 os.path.join(output_dir, file))
-            jobs.append(job)
-    return jobs

 def parse_args():
-    parser = ArgumentParser(description='OCR pipeline for PDF file processing',
-                            prog='OCR pipeline')
-    parser.add_argument('-i', '--input-dir',
-                        help='Input directory',
-                        required=True)
-    parser.add_argument('-o', '--output-dir',
-                        help='Output directory',
-                        required=True)
-    parser.add_argument('-l', '--language',
-                        choices=list(map(lambda x: x[:-12], filter(lambda x: x.endswith('.traineddata'), os.listdir('/usr/local/share/tessdata')))),  # noqa
-                        help='Language of the input '
-                             '(3-character ISO 639-2 language codes)',
-                        required=True)
-    parser.add_argument('--binarize',
-                        action='store_true',
-                        help='Add binarization as a preprocessing step')
-    parser.add_argument('--log-dir',
-                        help='Logging directory')
-    parser.add_argument('--mem-mb',
-                        help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa
-                        type=int)
-    parser.add_argument('--n-cores',
-                        default=min(4, multiprocessing.cpu_count()),
-                        help='Number of CPU threads to be used (Default: min(4, number of CPUs))',  # noqa
-                        type=int)
-    parser.add_argument('--zip',
-                        help='Create one zip file per filetype')
-    parser.add_argument('-v', '--version',
-                        action='version',
-                        help='Returns the current version of the OCR pipeline',
-                        version='%(prog)s {}'.format(__version__))
+    parser = ArgumentParser(description='OCR pipeline for PDF file processing')
+    parser.add_argument(
+        '-i', '--input-dir', help='Input directory', required=True)
+    parser.add_argument(
+        '-o', '--output-dir', help='Output directory', required=True)
+    parser.add_argument(
+        '-l', '--language',
+        choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata')
+                 if x.endswith('.traineddata') and len(x) > 12],
+        help='Language of the input (3-character ISO 639-2 language codes)',
+        required=True
+    )
+    parser.add_argument(
+        '--binarize',
+        action='store_true',
+        help='Add binarization as a preprocessing step'
+    )
+    parser.add_argument(
+        '--log-dir', help='Logging directory (Default: --output-dir)')
+    parser.add_argument(
+        '--mem-mb',
+        help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))',  # noqa
+        type=int
+    )
+    parser.add_argument(
+        '--n-cores',
+        default=min(4, multiprocessing.cpu_count()),
+        help='Number of CPU threads to be used (Default: min(4, CPU count))',
+        type=int
+    )
+    parser.add_argument(
+        '-v', '--version',
+        action='version',
+        help='Returns the current version of the OCR pipeline',
+        version='%(prog)s {}'.format(__version__)
+    )
     args = parser.parse_args()

     # Set some tricky default values and check for insufficient input
@@ -338,20 +535,18 @@ def parse_args():
         raise Exception('--n-cores must be greater or equal 1')
     if args.mem_mb is None:
         max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
-        args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
-    if args.mem_mb < 2048:
-        raise Exception('--mem-mb must be greater or equal 2048')
-    if args.zip is not None and args.zip.lower().endswith('.zip'):
-        # Remove .zip file extension if provided
-        args.zip = args.zip[:-4]
-        args.zip = args.zip if args.zip else 'output'
+        args.mem_mb = min(args.n_cores * 512, max_mem_mb)
+    if args.mem_mb < 512:
+        raise Exception('--mem-mb must be greater or equal 512')
     return args

 def main():
     args = parse_args()
-    ocr_pipeline = OCRPipeline(args.input_dir, args.language, args.output_dir, args.binarize, args.zip)  # noqa
-    retval = ocr_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa
+    ocr_pipeline = MainWorkflow(
+        args.input_dir, args.language, args.output_dir, args.binarize)
+    retval = ocr_pipeline.run(
+        dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
     sys.exit(retval)
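
An illustrative in-container invocation under the new defaults (the language/model `eng` is hypothetical): note that `--mem-mb` now defaults to min(`--n-cores` * 512, available memory) instead of the old 2048 MB per core, because each tesseract task is single-page and single-threaded, and that the `--zip` option is gone:

```bash
# Process all PDFs in /input with optional binarization, capped at
# 4 workers and 2048 MB of memory overall.
ocr -i /input -o /output -l eng --binarize --n-cores 4 --mem-mb 2048
```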

wrapper/ocr

@@ -6,9 +6,10 @@ import os
 import subprocess
 import sys

-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0'
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_OUTPUT_DIR = '/output'
+CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
 CONTAINER_LOG_DIR = '/logs'
 UID = str(os.getuid())
 GID = str(os.getgid())
@@ -16,20 +17,25 @@ GID = str(os.getgid())
 parser = ArgumentParser(add_help=False)
 parser.add_argument('-i', '--input-dir')
 parser.add_argument('-o', '--output-dir')
+parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+')
 parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()

-cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
+cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
 if args.input_dir is not None:
-    mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR
+    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['-i', CONTAINER_INPUT_DIR]
 if args.output_dir is not None:
-    mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR
+    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
+if args.models is not None:
+    for model in args.models:
+        mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}'  # noqa
+        cmd += ['-v', mapping]
 if args.log_dir is not None:
-    mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR
+    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
 cmd.append(CONTAINER_IMAGE)
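
With the new `-m`/`--model` option, the wrapper bind-mounts each given `.traineddata` file into the container's tessdata directory, so the pipeline's `-l` choices pick it up automatically. A sketch of a full wrapper call (the model file name is hypothetical):

```bash
# Mount one model and keep pyFlow's logs in a separate directory.
cd /<my_data_location>
ocr -i input -o output -m models/eng.traineddata -l eng --log-dir logs
```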