Compare commits


No commits in common. "master" and "1.0.0b" have entirely different histories.

8 changed files with 378 additions and 797 deletions

Dockerfile

@@ -9,14 +9,8 @@ ENV LANG=C.UTF-8
 RUN apt-get update \
  && apt-get install --no-install-recommends --yes \
-      ghostscript \
-      procps \
-      python3.7 \
-      python3-pip \
-      rename \
-      wget \
-      zip \
- && python3 -m pip install lxml
+      wget
 
 # Install the OCR pipeline and its dependencies #
 ## Install pyFlow ##
@@ -49,7 +43,7 @@ RUN wget --no-check-certificate --quiet \
 ## Install Tesseract OCR ##
-ENV TESSERACT_VERSION=5.0.0
+ENV TESSERACT_VERSION=4.1.1
 RUN wget --no-check-certificate --quiet \
       "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
  && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
@@ -67,20 +61,37 @@ RUN wget --no-check-certificate --quiet \
       pkg-config \
       zlib1g-dev \
  && ./autogen.sh \
- && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
+ && ./configure \
 && make \
 && make install \
 && ldconfig \
 && cd - > /dev/null \
 && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
+ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
+ENV TESSDATA_BEST_VERSION=4.1.0
+RUN wget --no-check-certificate --quiet \
+      "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
+ && tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
+ && for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \
+ && rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
+
+## Further dependencies ##
+RUN apt-get install --no-install-recommends --yes \
+      procps \
+      ghostscript \
+      python3.7 \
+      rename \
+      zip
+
+## Install Pipeline ##
+COPY hocrtotei ocr /usr/local/bin/
+
 RUN rm -r /var/lib/apt/lists/*
-
-## Install Pipeline ##
-COPY hocr2tei hocr-combine ocr /usr/local/bin/
 
 ENTRYPOINT ["ocr"]
 CMD ["--help"]

LICENSE

@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.

README.md

@@ -1,6 +1,6 @@
 # OCR - Optical Character Recognition
 
-This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; for that purpose, a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
+This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; for that purpose, a convenient wrapper script is provided.
 
 ## Software used in this pipeline implementation
@@ -8,42 +8,37 @@ This software implements a heavily parallelized pipeline to recognize text in PDF
 - Software from Debian Buster's free repositories
 - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
 - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
-- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0
+- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
+- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
 
-## Installation
-
-1. Install Docker and Python 3.
-2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
-3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0 ocr`
-4. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
-5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
-6. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
-
-## Use the Pipeline
-
-1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
-2. Clear your `/<my_data_location>/output` directory.
-3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
+## Use this image
+
+1. Create input and output directories for the pipeline.
 ``` bash
-cd /<my_data_location>
-# <model_code> is the model filename without the ".traineddata" suffix
-ocr \
-  --input-dir input \
-  --output-dir output \
-  --model-file models/<model> \
-  -m <model_code> <optional_pipeline_arguments>
-
-# More than one model
-ocr \
-  --input-dir input \
-  --output-dir output \
-  --model-file models/<model1> \
-  --model-file models/<model2> \
-  -m <model1_code>+<model2_code> <optional_pipeline_arguments>
-
-# Instead of multiple --model-file statements, you can also use
-ocr \
-  --input-dir input \
-  --output-dir output \
-  --model-file models/* \
-  -m <model1_code>+<model2_code> <optional_pipeline_arguments>
+mkdir -p /<my_data_location>/input /<my_data_location>/output
 ```
+2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
+3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
+```
+# Option one: Use the wrapper script
+## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/development/wrapper/ocr, make it executable and add it to your ${PATH}
+cd /<my_data_location>
+ocr -i input -l <language_code> -o output <optional_pipeline_arguments>
+
+# Option two: Classic Docker style
+docker run \
+    --rm \
+    -it \
+    -u $(id -u $USER):$(id -g $USER) \
+    -v /<my_data_location>/input:/input \
+    -v /<my_data_location>/output:/output \
+    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:development \
+        -i /input \
+        -l <language_code> \
+        -o /output \
+        <optional_pipeline_arguments>
+```
 4. Check your results in the `/<my_data_location>/output` directory.
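
As a worked example of step 3, a run over German-language PDFs with binarization enabled and per-filetype zip bundles might look like this (a sketch; `deu` must be one of the models baked into the image, and `--zip` takes the bundles' base name):

``` bash
cd /<my_data_location>
ocr -i input -l deu -o output --binarize --zip my_corpus
```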

hocr-combine

@@ -1,44 +0,0 @@
-#!/usr/bin/env python3.7
-# coding=utf-8
-
-''' Combine multiple hOCR files. '''
-
-from argparse import ArgumentParser
-from lxml import html
-
-parser = ArgumentParser(description='Combine multiple hOCR files.')
-parser.add_argument(
-    '-i', '--input-file',
-    help='Input file',
-    nargs='+',
-    required=True
-)
-parser.add_argument(
-    '-o', '--output-file',
-    help='Output file',
-    required=True
-)
-args = parser.parse_args()
-
-input_files = []
-for input_file in args.input_file:
-    if input_file.startswith('@'):
-        with open(input_file[1:], 'r') as f:
-            input_files += [x for x in f.read().split("\n") if x != '']
-    else:
-        input_files.append(input_file)
-
-if len(input_files) == 0:
-    exit(1)
-
-hocr = html.parse(input_files[0])
-hocr_body = hocr.find('body')
-for input_file in input_files[1:]:
-    for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):  # noqa
-        hocr_body.append(ocr_page)
-
-with open(args.output_file, 'wb') as f:
-    hocr.write(f, encoding='UTF-8', method='html')
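
For reference, the removed script accepted plain file arguments as well as an `@`-prefixed list file, which is how the pipeline's hocr-combine step fed it a page list. A usage sketch with hypothetical file names:

``` bash
# Explicit input files
hocr-combine -i page-1.hocr page-2.hocr -o combined.hocr

# An argument starting with "@" is read as a newline-separated list of inputs
ls -dv pages/* > inputs.txt
hocr-combine -i @inputs.txt -o combined.hocr
```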

hocr2tei

@@ -1,68 +0,0 @@
-#!/usr/bin/env python3.7
-# coding=utf-8
-
-''' Convert hOCR to TEI XML. '''
-
-from argparse import ArgumentParser
-from lxml import html
-from xml.sax.saxutils import escape
-import re
-
-parser = ArgumentParser(description='Convert hOCR to TEI XML.')
-parser.add_argument(
-    '-i', '--input-file',
-    help='Input file',
-    required=True
-)
-parser.add_argument(
-    '-o', '--output-file',
-    help='Output file',
-    required=True
-)
-args = parser.parse_args()
-
-tei = ''
-tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
-tei += '  <teiHeader>\n'
-tei += '    <fileDesc>\n'
-tei += '      <titleStmt>\n'
-tei += '        <title></title>\n'
-tei += '      </titleStmt>\n'
-tei += '      <publicationStmt>\n'
-tei += '        <p></p>\n'
-tei += '      </publicationStmt>\n'
-tei += '      <sourceDesc>\n'
-tei += '        <p></p>\n'
-tei += '      </sourceDesc>\n'
-tei += '    </fileDesc>\n'
-tei += '  </teiHeader>\n'
-tei += '  <text>\n'
-tei += '    <body>\n'
-
-hocr = html.parse(args.input_file)
-for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
-    ocr_page_title_attrib = ocr_page.attrib.get('title')
-    facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
-    page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
-    tei += f'      <pb facs="{facsimile}" n="{page_number}"/>\n'
-    for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
-        tei += '      <p>\n'
-        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
-            tei += '        <lb/>'
-            is_first_word_in_line = True
-            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
-                if ocrx_word.text is not None:
-                    if not is_first_word_in_line:
-                        tei += ' '
-                    tei += escape(ocrx_word.text)
-                    is_first_word_in_line = False
-            tei += '\n'
-        tei += '      </p>\n'
-tei += '    </body>\n'
-tei += '  </text>\n'
-tei += '</TEI>\n'
-
-with open(args.output_file, 'w') as f:
-    f.write(tei)

hocrtotei (new executable file)

@@ -0,0 +1,57 @@
+#!/usr/bin/env python3.7
+# coding=utf-8
+
+"""Convert hOCR to TEI XML."""
+
+from xml.sax.saxutils import escape
+from argparse import ArgumentParser
+import re
+import xml.etree.ElementTree as ET
+
+parser = ArgumentParser(description='Convert hOCR to TEI XML.')
+parser.add_argument('input', metavar='Path to hOCR input file')
+parser.add_argument('output', metavar='Path to TEI output file')
+args = parser.parse_args()
+
+tei = ''
+tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
+tei += '  <teiHeader>\n'
+tei += '    <fileDesc>\n'
+tei += '      <titleStmt>\n'
+tei += '        <title></title>\n'
+tei += '      </titleStmt>\n'
+tei += '      <publicationStmt>\n'
+tei += '        <p></p>\n'
+tei += '      </publicationStmt>\n'
+tei += '      <sourceDesc>\n'
+tei += '        <p></p>\n'
+tei += '      </sourceDesc>\n'
+tei += '    </fileDesc>\n'
+tei += '  </teiHeader>\n'
+tei += '  <text>\n'
+tei += '    <body>\n'
+
+# Conversion start
+hocr = ET.parse(args.input)
+for page in hocr.findall('.//*[@class="ocr_page"]'):
+    page_properties = page.attrib.get('title')
+    facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1)
+    page_number = re.search(r'ppageno (\d+)', page_properties).group(1)
+    tei += '      <pb facs="{}" n="{}"/>\n'.format(facsimile, page_number)
+    for para in page.findall('.//*[@class="ocr_par"]'):
+        tei += '      <p>\n'
+        for line in para.findall('.//*[@class="ocr_line"]'):
+            tei += '        <lb/>'
+            indent = ''
+            for word in line.findall('.//*[@class="ocrx_word"]'):
+                if word.text is not None:
+                    tei += indent + escape(word.text.strip())
+                    indent = ' '
+            tei += '\n'
+        tei += '      </p>\n'
+# Conversion end
+tei += '    </body>\n'
+tei += '  </text>\n'
+tei += '</TEI>\n'
+
+with open(args.output, 'w') as tei_file:
+    tei_file.write(tei)
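
Unlike the removed hocr2tei, the new script takes positional input and output paths instead of `-i`/`-o` flags. A usage sketch with hypothetical file names:

``` bash
hocrtotei sample.hocr sample.xml
```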

ocr

@@ -1,671 +1,334 @@
 #!/usr/bin/env python2.7
 # coding=utf-8
 
-''' OCR pipeline for PDF file processing. '''
+"""OCR pipeline for PDF file processing."""
 
-__version__ = '0.1.0'
+__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
+             'Stephan Porada <porada@posteo.de>'
+__version__ = '1.0.0'
 
 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
-import json
+import multiprocessing
 import os
 import sys
 
 
-class PipelineJob:
-    '''
-    OCR pipeline job class.
+class OCRPipelineJob:
+    """An OCR pipeline job class
 
     Each input file of the pipeline is represented as an OCR pipeline job,
     which holds all necessary information for the pipeline to process it.
 
     Arguments:
     file -- Path to the file
     output_dir -- Path to a directory, where job results are stored
-    '''
+    """
 
     def __init__(self, file, output_dir):
         self.file = file
-        self.name = os.path.basename(file)[:-4]
+        self.name = os.path.basename(file).rsplit('.', 1)[0]
         self.output_dir = output_dir
-        self.tmp_dir = os.path.join(output_dir, 'tmp')
+        self.page_dir = os.path.join(output_dir, 'pages')
 
 
-class SplitInputWorkflow(WorkflowRunner):
-    def __init__(self, job):
-        self.job = job
-
-    def workflow(self):
-        '''
-        ' ##################################################
-        ' # gs                                             #
-        ' ##################################################
-        '''
-        n_cores = min(2, self.getNCores())
-        mem_mb = min(n_cores * 512, self.getMemMb())
-        cmd = 'gs'
-        cmd += ' -dBATCH'
-        cmd += ' -dNOPAUSE'
-        cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000)
-        cmd += ' -dNumRenderingThreads={}'.format(n_cores)
-        cmd += ' -dQUIET'
-        cmd += ' -r300'
-        cmd += ' -sDEVICE=png16m'
-        cmd += ' -sOutputFile="{}"'.format(
-            os.path.join(self.job.tmp_dir, 'images', 'page-%d.png')
-        )
-        cmd += ' "{}"'.format(self.job.file)
-        self.addTask(
-            'gs',
-            command=cmd,
-            memMb=mem_mb,
-            nCores=n_cores
-        )
-
-
-class BinarizationWorkflow(WorkflowRunner):
-    def __init__(self, job):
-        self.job = job
-
-    def workflow(self):
-        '''
-        ' ##################################################
-        ' # ocropus-nlbin                                  #
-        ' ##################################################
-        '''
-        # TODO: Update to newer ocropus-nlbin and start one task per page
-        n_cores = self.getNCores()
-        mem_mb = min(512 * n_cores, self.getMemMb())
-        cmd = 'ls -dv "{}/"* > "{}"'.format(
-            os.path.join(self.job.tmp_dir, 'images'),
-            os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
-        )
-        cmd += ' && '
-        cmd += 'ocropus-nlbin "@{}"'.format(
-            os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
-        )
-        cmd += ' --nocheck'
-        cmd += ' --output "{}"'.format(
-            os.path.join(self.job.tmp_dir, 'images')
-        )
-        cmd += ' --parallel "{}"'.format(n_cores)
-        cmd += ' && '
-        cmd += 'rm "{}"'.format(
-            os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
-        )
-        ocropus_nlbin_task = self.addTask(
-            'ocropus_nlbin',
-            command=cmd,
-            memMb=mem_mb,
-            nCores=n_cores
-        )
-
-        '''
-        ' ##################################################
-        ' # cleanup                                        #
-        ' ##################################################
-        '''
-        n_cores = 1
-        mem_mb = min(128, self.getMemMb())
-        cmd = 'cd "{}"'.format(os.path.join(self.job.tmp_dir, 'images'))
-        cmd += ' && '
-        cmd += 'mkdir tmp'
-        cmd += ' && '
-        cmd += 'mv *.bin.png tmp'
-        cmd += ' && '
-        cmd += 'rm *.png'
-        cmd += ' && '
-        cmd += 'mv tmp/* .'
-        cmd += ' && '
-        cmd += 'rmdir tmp'
-        cmd += ' && '
-        cmd += 'rename \'s/^0*/page-/\' *'
-        cmd += ' && '
-        cmd += 'rename \'s/.bin.png$/.png/\' *'
-        cmd += ' && '
-        cmd += 'cd -'
-        self.addTask(
-            'cleanup',
-            command=cmd,
-            dependencies=ocropus_nlbin_task,
-            memMb=mem_mb,
-            nCores=n_cores
-        )
-
-
-class OCRWorkflow(WorkflowRunner):
-    def __init__(self, job, model):
-        self.job = job
-        self.model = model
-
-    def workflow(self):
-        '''
-        ' ##################################################
-        ' # tesseract                                      #
-        ' ##################################################
-        '''
-        tesseract_tasks = []
-        n_cores = 1
-        mem_mb = min(512, self.getMemMb())
-        for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'images'))):  # noqa
-            cmd = 'tesseract "{}" "{}"'.format(
-                os.path.join(self.job.tmp_dir, 'images', file),
-                os.path.join(self.job.tmp_dir, file[:-4])
-            )
-            cmd += ' -l "{}"'.format(self.model)
-            cmd += ' hocr pdf txt'
-            cmd += ' || '
-            cmd += 'echo "${?}"'
-            task = self.addTask(
-                'tesseract_-_{}'.format(i),
-                command=cmd,
-                env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
-                memMb=mem_mb,
-                nCores=n_cores
-            )
-            tesseract_tasks.append(task)
-
-        '''
-        ' ##################################################
-        ' # move_files                                     #
-        ' ##################################################
-        '''
-        move_files_tasks = []
-        n_cores = 1
-        mem_mb = min(128, self.getMemMb())
-        for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
-            cmd = 'mv "{}/"*.{} "{}"'.format(
-                self.job.tmp_dir,
-                file_extension,
-                os.path.join(self.job.tmp_dir, file_extension)
-            )
-            task = self.addTask(
-                'move_{}_files'.format(file_extension),
-                command=cmd,
-                dependencies=tesseract_tasks,
-                memMb=mem_mb,
-                nCores=n_cores
-            )
-            move_files_tasks.append(task)
-        cmd = 'mv "{}" "{}"'.format(
-            os.path.join(self.job.tmp_dir, 'images'),
-            os.path.join(self.job.output_dir)
-        )
-        task = self.addTask(
-            'move_image_files',
-            command=cmd,
-            dependencies=tesseract_tasks,
-            memMb=mem_mb,
-            nCores=n_cores
-        )
-        move_files_tasks.append(task)
-
-
-class CreateHOCRWorkflow(WorkflowRunner):
-    def __init__(self, job):
-        self.job = job
-
-    def workflow(self):
-        '''
-        ' ##################################################
-        ' # fix-hocr                                       #
-        ' ##################################################
-        '''
-        fix_hocr_tasks = []
-        n_cores = 1
-        mem_mb = min(256, self.getMemMb())
-        for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'hocr'))):  # noqa
-            cmd = 'sed -i \'s>{}>images>g\' "{}"'.format(
-                os.path.join(self.job.tmp_dir, 'images'),
-                os.path.join(self.job.tmp_dir, 'hocr', file)
-            )
-            cmd += ' && '
-            cmd += 'sed -i \'s>ppageno [0-9]\\+>ppageno {}>g\' "{}"'.format(
-                file[5:-5],
-                os.path.join(self.job.tmp_dir, 'hocr', file)
-            )
-            cmd += ' && '
-            cmd += 'sed -i \'s>page_[0-9]\\+>page_{}>g\' "{}"'.format(
-                file[5:-5],
-                os.path.join(self.job.tmp_dir, 'hocr', file)
-            )
-            cmd += ' && '
-            cmd += 'sed -i \'s>block_[0-9]\\+>block_{}>g\' "{}"'.format(
-                file[5:-5],
-                os.path.join(self.job.tmp_dir, 'hocr', file)
-            )
-            cmd += ' && '
-            cmd += 'sed -i \'s>par_[0-9]\\+>par_{}>g\' "{}"'.format(
-                file[5:-5],
-                os.path.join(self.job.tmp_dir, 'hocr', file)
-            )
-            cmd += ' && '
-            cmd += 'sed -i \'s>line_[0-9]\\+>line_{}>g\' "{}"'.format(
-                file[5:-5],
-                os.path.join(self.job.tmp_dir, 'hocr', file)
-            )
-            cmd += ' && '
-            cmd += 'sed -i \'s>word_[0-9]\\+>word_{}>g\' "{}"'.format(
-                file[5:-5],
-                os.path.join(self.job.tmp_dir, 'hocr', file)
-            )
-            task = self.addTask(
-                'fix-hocr_-_{}'.format(i),
-                command=cmd,
-                memMb=mem_mb,
-                nCores=n_cores
-            )
-            fix_hocr_tasks.append(task)
-
-        '''
-        ' ##################################################
-        ' # hocr-combine                                   #
-        ' ##################################################
-        '''
-        n_cores = 1
-        mem_mb = min(256, self.getMemMb())
-        cmd = 'ls -dv "{}/"* > "{}"'.format(
-            os.path.join(self.job.tmp_dir, 'hocr'),
-            os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
-        )
-        cmd += ' && '
-        cmd += 'hocr-combine'
-        cmd += ' --input-file "@{}"'.format(
-            os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
-        )
-        cmd += ' --output-file "{}.hocr"'.format(
-            os.path.join(self.job.output_dir, self.job.name)
-        )
-        cmd += ' && '
-        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'hocr'))
-        self.addTask(
-            'hocr_combine',
-            command=cmd,
-            dependencies=fix_hocr_tasks,
-            memMb=mem_mb,
-            nCores=n_cores
-        )
-
-
-class CreatePDFWorkflow(WorkflowRunner):
-    def __init__(self, job):
-        self.job = job
-
-    def workflow(self):
-        '''
-        ' ##################################################
-        ' # pdf_combine                                    #
-        ' ##################################################
-        '''
-        n_cores = min(2, self.getNCores())
-        mem_mb = min(n_cores * 256, self.getMemMb())
-        cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'pdf'))
-        cmd += ' | '
-        cmd += 'xargs gs'
-        cmd += ' -dBATCH'
-        cmd += ' -dNOPAUSE'
-        cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000)
-        cmd += ' -dNumRenderingThreads={}'.format(n_cores)
-        cmd += ' -dPDFSETTINGS=/ebook'
-        cmd += ' -dQUIET'
-        cmd += ' -sDEVICE=pdfwrite'
-        cmd += ' -sOutputFile="{}"'.format(
-            os.path.join(self.job.output_dir, '{}.pdf'.format(self.job.name))
-        )
-        cmd += ' && '
-        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
-        self.addTask(
-            'pdf_combine',
-            command=cmd,
-            memMb=mem_mb,
-            nCores=n_cores
-        )
-
-
-class CreateTEIWorkflow(WorkflowRunner):
-    def __init__(self, job):
-        self.job = job
-
-    def workflow(self):
-        '''
-        ' ##################################################
-        ' # hocr2tei                                       #
-        ' ##################################################
-        '''
-        n_cores = 1
-        mem_mb = min(256, self.getMemMb())
-        cmd = 'hocr2tei'
-        cmd += ' --input-file "{}"'.format(
-            os.path.join(self.job.output_dir, '{}.hocr'.format(self.job.name))
-        )
-        cmd += ' --output-file "{}"'.format(
-            os.path.join(
-                self.job.output_dir,
-                '{}.tei.xml'.format(self.job.name)
-            )
-        )
-        self.addTask(
-            'hocr2tei',
-            command=cmd,
-            memMb=mem_mb,
-            nCores=n_cores
-        )
-
-
-class CreatePoCoZipWorkflow(WorkflowRunner):
-    def __init__(self, job):
-        self.job = job
-
-    def workflow(self):
-        '''
-        ' ##################################################
-        ' # zip                                            #
-        ' ##################################################
-        '''
-        n_cores = 1
-        mem_mb = min(512, self.getMemMb())
-        zip_tasks = []
-        cmd = 'cd "{}"'.format(self.job.output_dir)
-        cmd += ' && '
-        cmd += 'zip'
-        cmd += ' -r'
-        cmd += ' -m'
-        cmd += ' "{}.poco.zip" .'.format(self.job.name)
-        cmd += ' -i "images/*.png" "{}.hocr"'.format(self.job.name)
-        cmd += ' && '
-        cmd += 'rm -r images'
-        cmd += ' && '
-        cmd += 'cd -'
-        task = self.addTask(
-            'zip',
-            command=cmd,
-            memMb=mem_mb,
-            nCores=n_cores
-        )
-        zip_tasks.append(task)
-
-
-class CreateTxtWorkflow(WorkflowRunner):
-    def __init__(self, job):
-        self.job = job
-
-    def workflow(self):
-        '''
-        ' ##################################################
-        ' # txt_combine                                    #
-        ' ##################################################
-        '''
-        n_cores = 1
-        mem_mb = min(512, self.getMemMb())
-        cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'txt'))
-        cmd += ' | '
-        cmd += 'xargs cat'
-        cmd += ' > '
-        cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name))  # noqa
-        cmd += ' && '
-        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
-        self.addTask(
-            'txt_combine',
-            command=cmd,
-            memMb=mem_mb,
-            nCores=n_cores
-        )
 
 
-class MainWorkflow(WorkflowRunner):
-    def __init__(self, input_dir, model, output_dir, binarize):
+class OCRPipeline(WorkflowRunner):
+    def __init__(self, input_dir, lang, output_dir, binarize, zip):
         self.input_dir = input_dir
-        self.model = model
+        self.lang = lang
         self.output_dir = output_dir
         self.binarize = binarize
-        self.jobs = []
+        self.zip = zip
+        self.jobs = collect_jobs(self.input_dir, self.output_dir)
 
-    def collect_jobs(self):
-        self.jobs = []
-        for file in os.listdir(self.input_dir):
-            if os.path.isdir(os.path.join(self.input_dir, file)):
-                continue
-            if not file.lower().endswith('.pdf'):
-                continue
-            job = PipelineJob(
-                os.path.join(self.input_dir, file),
-                os.path.join(self.output_dir, file)
-            )
-            self.jobs.append(job)
-
     def workflow(self):
         if not self.jobs:
             return
 
-        # Create output and temporary directories
-        for job in self.jobs:
-            os.mkdir(job.output_dir)
-            os.mkdir(job.tmp_dir)
-            os.mkdir(os.path.join(job.tmp_dir, 'hocr'))
-            os.mkdir(os.path.join(job.tmp_dir, 'pdf'))
-            os.mkdir(os.path.join(job.tmp_dir, 'images'))
-            os.mkdir(os.path.join(job.tmp_dir, 'txt'))
+        '''
+        ' ##################################################
+        ' # setup output directory                         #
+        ' ##################################################
+        '''
+        setup_output_directory_tasks = []
+        for i, job in enumerate(self.jobs):
+            cmd = 'mkdir -p "{}"'.format(job.page_dir)
+            lbl = 'setup_output_directory_-_{}'.format(i)
+            task = self.addTask(command=cmd, label=lbl)
+            setup_output_directory_tasks.append(task)
 
         '''
         ' ##################################################
-        ' # split-input                                    #
+        ' # split input                                    #
         ' ##################################################
         '''
         split_input_tasks = []
+        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
         for i, job in enumerate(self.jobs):
-            task = self.addWorkflowTask(
-                'split_input_-_{}'.format(i),
-                SplitInputWorkflow(job)
-            )
+            input_file = job.file
+            output_file = '{}/page-%d.tif'.format(job.page_dir)
+            cmd = 'gs'
+            cmd += ' -dBATCH'
+            cmd += ' -dNOPAUSE'
+            cmd += ' -dNumRenderingThreads={}'.format(n_cores)
+            cmd += ' -dQUIET'
+            cmd += ' -r300'
+            cmd += ' -sDEVICE=tiff24nc'
+            cmd += ' -sCompression=lzw'
+            cmd += ' "-sOutputFile={}"'.format(output_file)
+            cmd += ' "{}"'.format(input_file)
+            deps = 'setup_output_directory_-_{}'.format(i)
+            lbl = 'split_input_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl,
+                                nCores=n_cores)
             split_input_tasks.append(task)
 
         if self.binarize:
+            '''
+            ' ##################################################
+            ' # pre binarization                               #
+            ' ##################################################
+            '''
+            pre_binarization_tasks = []
+            for i, job in enumerate(self.jobs):
+                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
+                cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
+                deps = 'split_input_-_{}'.format(i)
+                lbl = 'pre_binarization_-_{}'.format(i)
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+                pre_binarization_tasks.append(task)
+
             '''
             ' ##################################################
             ' # binarization                                   #
             ' ##################################################
             '''
             binarization_tasks = []
+            n_cores = self.getNCores()
+            mem_mb = self.getMemMb()
             for i, job in enumerate(self.jobs):
-                task = self.addWorkflowTask(
-                    'binarization_-_{}'.format(i),
-                    BinarizationWorkflow(job),
-                    dependencies='split_input_-_{}'.format(i)
-                )
+                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
+                cmd = 'ocropus-nlbin "@{}"'.format(input_file)
+                cmd += ' --nocheck'
+                cmd += ' --output "{}"'.format(job.page_dir)
+                cmd += ' --parallel "{}"'.format(n_cores)
+                deps = 'pre_binarization_-_{}'.format(i)
+                lbl = 'binarization_-_{}'.format(i)
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl,
+                                    memMb=mem_mb, nCores=n_cores)
                 binarization_tasks.append(task)
+
+            '''
+            ' ##################################################
+            ' # post binarization                              #
+            ' ##################################################
+            '''
+            post_binarization_tasks = []
+            for i, job in enumerate(self.jobs):
+                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
+                cmd = 'rm "{}"'.format(input_file)
+                cmd += ' && '
+                cmd += 'cd "{}"'.format(job.page_dir)
+                cmd += ' && '
+                cmd += 'rm *.{nrm.png,tif}'
+                cmd += ' && '
+                cmd += 'rename \'s/^0*/page-/\' *'
+                cmd += ' && '
+                cmd += 'cd -'
+                deps = 'binarization_-_{}'.format(i)
+                lbl = 'post_binarization_-_{}'.format(i)
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+                post_binarization_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # pre ocr                                        #
+        ' ##################################################
+        '''
+        pre_ocr_tasks = []
+        for i, job in enumerate(self.jobs):
+            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
+            cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
+            deps = 'post_binarization_-_{}'.format(i) if self.binarize else 'split_input_-_{}'.format(i)  # noqa
+            lbl = 'pre_ocr_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            pre_ocr_tasks.append(task)
 
         '''
         ' ##################################################
         ' # ocr                                            #
         ' ##################################################
         '''
         ocr_tasks = []
+        n_cores = min(4, self.getNCores())
+        mem_mb = min(n_cores * 2048, self.getMemMb())
         for i, job in enumerate(self.jobs):
-            if self.binarize:
-                deps = 'binarization_-_{}'.format(i)
-            else:
-                deps = 'split_input_-_{}'.format(i)
-            task = self.addWorkflowTask(
-                'ocr_-_{}'.format(i),
-                OCRWorkflow(job, self.model),
-                dependencies=deps
-            )
+            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
+            output_file_base = os.path.join(job.output_dir, job.name)
+            cmd = 'tesseract "{}" "{}"'.format(input_file, output_file_base)
+            cmd += ' -l "{}"'.format(self.lang)
+            cmd += ' hocr pdf txt'
+            deps = 'pre_ocr_-_{}'.format(i)
+            lbl = 'ocr_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps,
+                                env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
+                                label=lbl, memMb=mem_mb, nCores=n_cores)
             ocr_tasks.append(task)
 
         '''
         ' ##################################################
-        ' # create-hocr                                    #
+        ' # post ocr                                       #
         ' ##################################################
         '''
-        create_hocr_tasks = []
+        post_ocr_tasks = []
         for i, job in enumerate(self.jobs):
-            task = self.addWorkflowTask(
-                'create_hocr_-_{}'.format(i),
-                CreateHOCRWorkflow(job),
-                dependencies='ocr_-_{}'.format(i)
-            )
-            create_hocr_tasks.append(task)
+            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
+            output_file_base = os.path.join(job.output_dir, job.name)
+            cmd = 'rm "{}"'.format(input_file)
+            cmd += ' && '
+            cmd += 'sed -i \'s+{}+pages+g\' "{}.hocr"'.format(job.page_dir, output_file_base)  # noqa
+            deps = 'ocr_-_{}'.format(i)
+            lbl = 'post_ocr_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            post_ocr_tasks.append(task)
 
         '''
         ' ##################################################
-        ' # create-pdf                                     #
+        ' # hocr to tei                                    #
         ' ##################################################
         '''
-        create_pdf_tasks = []
+        hocr_to_tei_tasks = []
         for i, job in enumerate(self.jobs):
-            task = self.addWorkflowTask(
-                'create_pdf_-_{}'.format(i),
-                CreatePDFWorkflow(job),
-                dependencies='ocr_-_{}'.format(i)
-            )
-            create_pdf_tasks.append(task)
+            output_file_base = os.path.join(job.output_dir, job.name)
+            cmd = 'hocrtotei "{}.hocr" "{}.xml"'.format(output_file_base, output_file_base)  # noqa
+            deps = 'post_ocr_-_{}'.format(i)
+            lbl = 'hocr_to_tei_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            hocr_to_tei_tasks.append(task)
 
         '''
         ' ##################################################
-        ' # create-tei                                     #
+        ' # zip creation                                   #
         ' ##################################################
         '''
-        create_tei_tasks = []
-        for i, job in enumerate(self.jobs):
-            task = self.addWorkflowTask(
-                'create_tei_-_{}'.format(i),
-                CreateTEIWorkflow(job),
-                dependencies='create_hocr_-_{}'.format(i)
-            )
-            create_tei_tasks.append(task)
+        zip_creation_tasks = []
+        if self.zip is not None:
+            # zip all files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.all.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = hocr_to_tei_tasks
+            lbl = 'zip_creation_-_all'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip PDF files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.pdf.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.pdf"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = ocr_tasks
+            lbl = 'zip_creation_-_pdf'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip TXT files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.txt.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.txt"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = ocr_tasks
+            lbl = 'zip_creation_-_txt'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip XML files
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.xml.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.xml"'
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = hocr_to_tei_tasks
+            lbl = 'zip_creation_-_xml'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
+            # zip PoCo bundles
+            cmd = 'cd "{}"'.format(self.output_dir)
+            cmd += ' && '
+            cmd += 'zip'
+            cmd += ' -r'
+            cmd += ' "{}.poco.zip" .'.format(self.zip)
+            cmd += ' -x "pyflow.data*" "*tmp*"'
+            cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
+            cmd += ' && '
+            cmd += 'cd -'
+            deps = post_ocr_tasks
+            lbl = 'zip_creation_-_poco'
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            zip_creation_tasks.append(task)
-
-        '''
-        ' ##################################################
-        ' # create-poco-zip                                #
-        ' ##################################################
-        '''
-        create_poco_zip_tasks = []
-        for i, job in enumerate(self.jobs):
-            task = self.addWorkflowTask(
-                'create_poco_zip_-_{}'.format(i),
-                CreatePoCoZipWorkflow(job),
-                dependencies='create_tei_-_{}'.format(i)
-            )
-            create_poco_zip_tasks.append(task)
-
-        '''
-        ' ##################################################
-        ' # create-txt                                     #
-        ' ##################################################
-        '''
-        create_txt_tasks = []
-        for i, job in enumerate(self.jobs):
-            task = self.addWorkflowTask(
-                'create_txt_-_{}'.format(i),
-                CreateTxtWorkflow(job),
-                dependencies='ocr_-_{}'.format(i)
-            )
-            create_txt_tasks.append(task)
-
-        self.waitForTasks()
-
-        outputs = []
-        for job in self.jobs:
-            # Remove temporary directory
-            os.rmdir(job.tmp_dir)
-            # Track output files
-            relative_output_dir = os.path.relpath(
-                job.output_dir,
-                start=self.output_dir
-            )
-            outputs.append(
-                {
-                    'description': 'Post correction package (.png and .hocr).',
-                    'file': os.path.join(
-                        relative_output_dir,
-                        '{}.poco.zip'.format(job.name)
-                    ),
-                    'mimetype': 'application/zip'
-                }
-            )
-            outputs.append(
-                {
-                    'description': 'PDF file with text layer.',
-                    'file': os.path.join(
-                        relative_output_dir,
-                        '{}.pdf'.format(job.name)
-                    ),
-                    'mimetype': 'application/pdf'
-                }
-            )
-            outputs.append(
-                {
-                    'description': 'Plain text file.',
-                    'file': os.path.join(
-                        relative_output_dir,
-                        '{}.txt'.format(job.name)
-                    ),
-                    'mimetype': 'text/plain'
-                }
-            )
-            outputs.append(
-                {
-                    'description': 'TEI compliant XML file.',
-                    'file': os.path.join(
-                        relative_output_dir,
-                        '{}.tei.xml'.format(job.name)
-                    ),
-                    'mimetype': 'application/tei+xml'
-                }
-            )
-        with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
-            json.dump(outputs, f, indent=4)
+
+
+def collect_jobs(input_dir, output_dir):
+    jobs = []
+    for file in os.listdir(input_dir):
+        if os.path.isdir(os.path.join(input_dir, file)):
+            continue
+        if file.lower().endswith('.pdf'):
+            job = OCRPipelineJob(os.path.join(input_dir, file),
+                                 os.path.join(output_dir, file))
+            jobs.append(job)
+    return jobs
 
 
 def parse_args():
-    parser = ArgumentParser(
-        description='Pipeline for PDF file OCR processing'
-    )
-    parser.add_argument(
-        '-i', '--input-dir',
-        help='Input directory',
-        required=True
-    )
-    parser.add_argument(
-        '-o', '--output-dir',
-        help='Output directory',
-        required=True
-    )
-    parser.add_argument(
-        '-m', '--model',
-        choices=[
-            x[:-12] for x in os.listdir('/usr/local/share/tessdata')
-            if x.endswith('.traineddata') and len(x) > 12
-        ],
-        help='Name of the model to be used',
-        required=True
-    )
-    parser.add_argument(
-        '--binarize',
-        action='store_true',
-        help='Add binarization as a preprocessing step'
-    )
-    parser.add_argument(
-        '--log-dir',
-        help='Logging directory (Default: --output-dir)'
-    )
-    parser.add_argument(
-        '--mem-mb',
-        help='Amount of system memory to be used '
-             '(Default: min(--n-cores * 512, available system memory))',
-        type=int
-    )
-    parser.add_argument(
-        '--n-cores',
-        default=1,
-        help='Number of CPU threads to be used',
-        type=int
-    )
-    parser.add_argument(
-        '-v', '--version',
-        action='version',
-        help='Returns the current version of the OCR pipeline',
-        version='%(prog)s {}'.format(__version__)
-    )
+    parser = ArgumentParser(description='OCR pipeline for PDF file processing',
+                            prog='OCR pipeline')
+    parser.add_argument('-i', '--input-dir',
+                        help='Input directory',
+                        required=True)
+    parser.add_argument('-o', '--output-dir',
+                        help='Output directory',
+                        required=True)
+    parser.add_argument('-l', '--language',
+                        choices=list(map(lambda x: x[:-12], filter(lambda x: x.endswith('.traineddata'), os.listdir('/usr/local/share/tessdata')))),  # noqa
+                        help='Language of the input '
+                             '(3-character ISO 639-2 language codes)',
+                        required=True)
+    parser.add_argument('--binarize',
+                        action='store_true',
+                        help='Add binarization as a preprocessing step')
+    parser.add_argument('--log-dir',
+                        help='Logging directory')
+    parser.add_argument('--mem-mb',
+                        help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa
+                        type=int)
+    parser.add_argument('--n-cores',
+                        default=min(4, multiprocessing.cpu_count()),
+                        help='Number of CPU threads to be used (Default: min(4, number of CPUs))',  # noqa
+                        type=int)
+    parser.add_argument('--zip',
+                        help='Create one zip file per filetype')
+    parser.add_argument('-v', '--version',
+                        action='version',
+                        help='Returns the current version of the OCR pipeline',
+                        version='%(prog)s {}'.format(__version__))
     args = parser.parse_args()
 
     # Set some tricky default values and check for insufficient input
@@ -675,26 +338,20 @@ def parse_args():
         raise Exception('--n-cores must be greater or equal 1')
     if args.mem_mb is None:
         max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
-        args.mem_mb = min(args.n_cores * 512, max_mem_mb)
-    if args.mem_mb < 512:
-        raise Exception('--mem-mb must be greater or equal 512')
+        args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
+    if args.mem_mb < 2048:
+        raise Exception('--mem-mb must be greater or equal 2048')
+    if args.zip is not None and args.zip.lower().endswith('.zip'):
+        # Remove .zip file extension if provided
+        args.zip = args.zip[:-4]
+        args.zip = args.zip if args.zip else 'output'
     return args
 
 
 def main():
     args = parse_args()
-    main_workflow = MainWorkflow(
-        args.input_dir,
-        args.model,
-        args.output_dir,
-        args.binarize
-    )
-    main_workflow.collect_jobs()
-    retval = main_workflow.run(
-        dataDirRoot=args.log_dir,
-        memMb=args.mem_mb,
-        nCores=args.n_cores
-    )
+    ocr_pipeline = OCRPipeline(args.input_dir, args.language, args.output_dir, args.binarize, args.zip)  # noqa
+    retval = ocr_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa
     sys.exit(retval)
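
Putting the new argument set together, an in-container invocation might look like the sketch below; `eng` stands in for any model shipped under `/usr/local/share/tessdata`, and the resource flags simply override the defaults described in the help texts above:

``` bash
ocr \
    -i /input \
    -o /output \
    -l eng \
    --binarize \
    --n-cores 4 \
    --mem-mb 8192 \
    --zip results
```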

wrapper/ocr

@@ -6,10 +6,9 @@ import os
 import subprocess
 import sys
 
-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:v0.1.0'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0'
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_OUTPUT_DIR = '/output'
-CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
 CONTAINER_LOG_DIR = '/logs'
 UID = str(os.getuid())
 GID = str(os.getgid())
@@ -17,25 +16,20 @@ GID = str(os.getgid())
 parser = ArgumentParser(add_help=False)
 parser.add_argument('-i', '--input-dir')
 parser.add_argument('-o', '--output-dir')
-parser.add_argument('-t', '--model-file', action='extend', nargs='+')
 parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()
 
-cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
+cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
 if args.input_dir is not None:
-    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
+    mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR
     cmd += ['-v', mapping]
     remaining_args += ['-i', CONTAINER_INPUT_DIR]
 if args.output_dir is not None:
-    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
+    mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR
     cmd += ['-v', mapping]
     remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
-if args.model_file is not None:
-    for model_file in args.model_file:
-        mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}'  # noqa
-        cmd += ['-v', mapping]
 if args.log_dir is not None:
-    mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
+    mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR
     cmd += ['-v', mapping]
     remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
 cmd.append(CONTAINER_IMAGE)
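
With input, output, and log directories given, the command the wrapper assembles comes out roughly like this (UID/GID and host paths are illustrative; the remaining pipeline arguments are passed through after the image name):

``` bash
docker run --rm -it -u 1000:1000 \
    -v /home/me/data/input:/input \
    -v /home/me/data/output:/output \
    -v /home/me/data/logs:/logs \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
    -i /input -o /output --log-dir /logs -l <language_code>
```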