Clean up and use globbing for the binarization and OCR input files

Patrick Jentsch 2021-03-15 12:45:05 +01:00
parent 104598039e
commit acbf61be05
5 changed files with 273 additions and 374 deletions

Dockerfile

@@ -7,41 +7,47 @@ LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <por
 ENV LANG=C.UTF-8
 
-RUN apt-get update
+RUN apt-get update \
+ && apt-get install --no-install-recommends --yes \
+    wget
 
-# Install pipeline dependencies #
+# Install the OCR pipeline and its dependencies #
 ## Install pyFlow ##
-ENV PYFLOW_RELEASE=1.1.20
-ADD "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_RELEASE}/pyflow-${PYFLOW_RELEASE}.tar.gz" .
-RUN tar -xzf "pyflow-${PYFLOW_RELEASE}.tar.gz" \
- && cd "pyflow-${PYFLOW_RELEASE}" \
+ENV PYFLOW_VERSION=1.1.20
+RUN wget --no-check-certificate --quiet \
+    "https://github.com/Illumina/pyflow/releases/download/v${PYFLOW_VERSION}/pyflow-${PYFLOW_VERSION}.tar.gz" \
+ && tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
+ && cd "pyflow-${PYFLOW_VERSION}" \
  && apt-get install --no-install-recommends --yes \
     python2.7 \
  && python2.7 setup.py build install \
- && cd .. \
- && rm -r "pyflow-${PYFLOW_RELEASE}" "pyflow-${PYFLOW_RELEASE}.tar.gz"
+ && cd - > /dev/null \
+ && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz"
 
 ## Install ocropy ##
-ENV OCROPY_RELEASE=1.3.3
-ADD "https://github.com/tmbdev/ocropy/archive/v${OCROPY_RELEASE}.tar.gz" .
-RUN tar -xzf "v${OCROPY_RELEASE}.tar.gz" \
- && cd "ocropy-${OCROPY_RELEASE}" \
+ENV OCROPY_VERSION=1.3.3
+RUN wget --no-check-certificate --quiet \
+    "https://github.com/tmbdev/ocropy/archive/v${OCROPY_VERSION}.tar.gz" \
+ && tar -xzf "v${OCROPY_VERSION}.tar.gz" \
+ && cd "ocropy-${OCROPY_VERSION}" \
  && apt-get install --no-install-recommends --yes \
+    python2.7 \
     python-pil \
     python-tk \
     $(cat PACKAGES) \
  && python2.7 setup.py install \
- && cd .. \
- && rm -r "ocropy-${OCROPY_RELEASE}" "v${OCROPY_RELEASE}.tar.gz"
+ && cd - > /dev/null \
+ && rm -r "ocropy-${OCROPY_VERSION}" "v${OCROPY_VERSION}.tar.gz"
 
 ## Install Tesseract OCR ##
-ENV TESSERACT_RELEASE=4.1.1
-ADD "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_RELEASE}.tar.gz" .
-RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
- && cd "tesseract-${TESSERACT_RELEASE}" \
+ENV TESSERACT_VERSION=4.1.1
+RUN wget --no-check-certificate --quiet \
+    "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
+ && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
+ && cd "tesseract-${TESSERACT_VERSION}" \
  && apt-get install --no-install-recommends --yes \
     autoconf \
     automake \
@@ -60,35 +66,24 @@ RUN tar -xzf "${TESSERACT_RELEASE}.tar.gz" \
  && make install \
  && ldconfig \
  && cd - > /dev/null \
- && rm -r "tesseract-${TESSERACT_RELEASE}" "${TESSERACT_RELEASE}.tar.gz"
+ && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
 
-ENV TESSDATA_BEST_RELEASE=4.1.0
-ADD "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_RELEASE}.tar.gz" .
-RUN tar -xzf "${TESSDATA_BEST_RELEASE}.tar.gz" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ara.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/chi_tra.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/dan.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/deu.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ell.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/eng.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/enm.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/fra.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frk.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/frm.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/ita.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/por.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/rus.traineddata" "/usr/local/share/tessdata/" \
- && mv "tessdata_best-${TESSDATA_BEST_RELEASE}/spa.traineddata" "/usr/local/share/tessdata/" \
- && rm -r "tessdata_best-${TESSDATA_BEST_RELEASE}" "${TESSDATA_BEST_RELEASE}.tar.gz"
+ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
+ENV TESSDATA_BEST_VERSION=4.1.0
+RUN wget --no-check-certificate --quiet \
+    "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
+ && tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
+ && for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \
+ && rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
 
 ## Further dependencies ##
 RUN apt-get install --no-install-recommends --yes \
+    procps \
     ghostscript \
-    python-pip \
     python3.7 \
-    zip \
- && pip install natsort
+    rename \
+    zip
 
 ## Install Pipeline ##
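The model-installation step above replaces fourteen hard-coded `mv` lines with a single loop over the comma-separated `TESSERACT_MODELS` list. A minimal sketch of that expansion pattern, runnable outside the image (the shortened model list and the `echo` action are illustrative stand-ins):

``` bash
#!/bin/sh
# tr turns the comma-separated list into one item per line for the loop.
TESSERACT_MODELS="ara,dan,deu,eng"
for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do
    echo "would move ${tesseract_model}.traineddata to /usr/local/share/tessdata/"
done
```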

README.md

@@ -3,13 +3,14 @@
 This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service, but you can also use it standalone; for that purpose, a convenient wrapper script is provided.
 
 ## Software used in this pipeline implementation
-- Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian
+- Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian
+- Software from Debian Buster's free repositories
 - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
 - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
 - Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
 - tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
 
 ## Use this image
 1. Create input and output directories for the pipeline.
@@ -22,7 +23,7 @@ mkdir -p /<my_data_location>/input /<my_data_location>/output
 3. Start the pipeline process. Check the [Pipeline arguments](#pipeline-arguments) section for more details.
 ```
 # Option one: Use the wrapper script
-## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/1.0.0/wrapper/ocr, make it executable and add it to your ${PATH}
+## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/development/wrapper/ocr, make it executable and add it to your ${PATH}
 cd /<my_data_location>
 ocr -i input -l <language_code> -o output <optional_pipeline_arguments>
@@ -33,37 +34,44 @@ docker run \
     -u $(id -u $USER):$(id -g $USER) \
-    -v /<my_data_location>/input:/input \
-    -v /<my_data_location>/output:/output \
-    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
-    -i /input \
-    -l <language_code>
-    -o /output \
+    -v /<my_data_location>/input:/ocr_pipeline/input \
+    -v /<my_data_location>/output:/ocr_pipeline/output \
+    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:development \
+    -i /ocr_pipeline/input \
+    -l <language_code> \
+    -o /ocr_pipeline/output \
     <optional_pipeline_arguments>
 ```
 4. Check your results in the `/<my_data_location>/output` directory.
-```
 
 ### Pipeline arguments
-`-l languagecode`
-* Tells tesseract which language will be used.
-* options = ara (Arabic), chi_tra (Chinese - Traditional), dan (Danish), deu (German), ell (Greek, Modern (1453-)), eng (English), enm (Middle English), fra (French), frk (German Fraktur), frm (Middle French), ita (Italian), por (Portuguese), rus (Russian), spa (Spanish)
-* required = True
-`--keep-intermediates`
-* If set, all intermediate files created during the OCR process will be
-kept.
-* default = False
-* required = False
-`--nCores corenumber`
-* Sets the number of CPU cores being used during the OCR process.
-* default = min(4, multiprocessing.cpu_count())
-* required = False
-`--skip-binarisation`
-* Used to skip binarization with ocropus. If skipped, only the tesseract binarization is used.
-* default = False
+#### Mandatory arguments
+`-i, --input-dir INPUT_DIR`
+* Input directory
+`-o, --output-dir OUTPUT_DIR`
+* Output directory
+`-l, --language {spa,fra,dan,deu,eng,frm,chi_tra,ara,enm,ita,ell,frk,rus,por}`
+* Language of the input (3-character ISO 639-2 language codes)
+
+#### Optional arguments
+`--binarize`
+* Add binarization as a preprocessing step
+`--log-dir`
+* Logging directory
+`--mem-mb`
+* Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))
+`--n-cores`
+* Number of CPU threads to be used (Default: min(4, available CPU cores))
+`-v, --version`
+* Returns the current version of the OCR pipeline
 
 ``` bash
 # Example with all arguments used
@@ -71,13 +79,14 @@ docker run \
     --rm \
     -it \
     -u $(id -u $USER):$(id -g $USER) \
-    -v "$HOME"/ocr/input:/input \
-    -v "$HOME"/ocr/output:/output \
-    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0 \
-    -i /input \
+    -v /<my_data_location>/input:/ocr_pipeline/input \
+    -v /<my_data_location>/output:/ocr_pipeline/output \
+    -v /<my_data_location>/logs:/ocr_pipeline/logs \
+    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:development \
+    -i /ocr_pipeline/input \
     -l eng \
-    -o /output \
-    --keep_intermediates \
-    --nCores 8 \
-    --skip-binarisation
+    -o /ocr_pipeline/output \
+    --binarize \
+    --log-dir /ocr_pipeline/logs \
+    --n-cores 8
 ```
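Since `--log-dir` defaults to the output directory and `--mem-mb` defaults to min(--n-cores * 2048, available system memory), a minimal wrapper call can omit the optional arguments entirely; a hypothetical example with placeholder paths:

``` bash
cd /<my_data_location>
ocr -i input -o output -l eng
```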

hocrtotei

@@ -5,45 +5,50 @@
 from xml.sax.saxutils import escape
 from argparse import ArgumentParser
+import re
 import xml.etree.ElementTree as ET
 
 parser = ArgumentParser(description='Merges hOCR files into a TEI file.')
-parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+')
-parser.add_argument('o', metavar='TEI-destfile',)
+parser.add_argument('i', metavar='hOCR-sourcefile')
+parser.add_argument('o', metavar='TEI-destfile')
 args = parser.parse_args()
 
 output_file = open(args.o, 'w')
 output_file.write(
     '<?xml version="1.0" encoding="UTF-8"?>\n'
     + '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
     + '  <teiHeader>\n'
     + '    <fileDesc>\n'
     + '      <titleStmt/>\n'
     + '      <publicationStmt/>\n'
     + '      <sourceDesc/>\n'
     + '    </fileDesc>\n'
     + '    <encodingDesc/>\n'
     + '    <profileDesc/>\n'
     + '  </teiHeader>\n'
     + '  <text>\n'
     + '    <body>\n'
 )
-for index, input_file in enumerate(args.i):
-    tree = ET.parse(input_file)
-    output_file.write('      <pb n="%i"/>\n' % (index + 1))
-    for para in tree.findall('.//*[@class="ocr_par"]'):
+tree = ET.parse(args.i)
+for page in tree.findall('.//*[@class="ocr_page"]'):
+    page_properties = page.attrib.get('title')
+    facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1)
+    page_number = re.search(r'ppageno (\d+)', page_properties).group(1)
+    output_file.write('      <pb facs="%s" n="%s"/>\n' % (facsimile, page_number))  # noqa
+    for para in page.findall('.//*[@class="ocr_par"]'):
         output_file.write('      <p>\n')
         for line in para.findall('.//*[@class="ocr_line"]'):
-            first_word_in_line = True
+            output_file.write('        <lb/>')
+            indent = ''
             for word in line.findall('.//*[@class="ocrx_word"]'):
                 if word.text is not None:
-                    output_file.write(('        ' if first_word_in_line else ' ') + escape(word.text.strip()))
-                    first_word_in_line = False
-            if not first_word_in_line:
-                output_file.write('<lb/>\n')
+                    output_file.write(indent + escape(word.text.strip()))
+                    indent = ' '
+            output_file.write('\n')
         output_file.write('      </p>\n')
 output_file.write(
     '    </body>\n'
     + '  </text>\n'
-    + '</TEI>')
+    + '</TEI>'
+)
 output_file.close()
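After this change `hocrtotei` consumes exactly one hOCR file, deriving page breaks from its `ocr_page` elements (via the `image` and `ppageno` title properties) instead of taking one file per page. A standalone invocation sketch with placeholder file names, mirroring how the pipeline calls it:

``` bash
# Merge the page data of one hOCR file into a TEI P5 document.
hocrtotei document.hocr document.xml
```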

ocr

@@ -8,48 +8,10 @@ __author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
 __version__ = '1.0.0'
 
 from argparse import ArgumentParser
-from natsort import natsorted
 from pyflow import WorkflowRunner
 import multiprocessing
 import os
 import sys
-import tempfile
-
-TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa']  # noqa
-
-
-def parse_args():
-    parser = ArgumentParser(
-        description='An OCR pipeline for PDF file processing.',
-        prog='OCR pipeline'
-    )
-    parser.add_argument('-i', '--input-directory',
-                        help='Input directory (only PDF files get processed)',
-                        required=True)
-    parser.add_argument('-o', '--output-directory',
-                        help='Output directory',
-                        required=True)
-    parser.add_argument('-l', '--language',
-                        choices=TESSERACT_MODELS,
-                        required=True)
-    parser.add_argument('--binarize',
-                        action='store_true',
-                        help='Use ocropy binarisation as preprocessing step.')
-    parser.add_argument('--log-dir')
-    parser.add_argument('--n-cores',
-                        default=min(4, multiprocessing.cpu_count()),
-                        help='Total number of cores available.',
-                        type=int)
-    parser.add_argument('--intermediate-directory')
-    parser.add_argument('--zip',
-                        help='Zips all results in different archives depending'
-                             ' on result types. Also zips everything into one '
-                             'archive.')
-    parser.add_argument('-v', '--version',
-                        action='version',
-                        version='%(prog)s {}'.format(__version__))
-    return parser.parse_args()
-
-
 class OCRPipelineJob:
@@ -61,41 +23,23 @@ class OCRPipelineJob:
     Arguments:
     file -- Path to the file
     output_dir -- Path to a directory, where job results are stored
-    intermediate_dir -- Path to a directory, where intermediate files are
-        stored.
     """
 
-    def __init__(self, file, output_dir, intermediate_dir):
+    def __init__(self, file, output_dir):
         self.file = file
-        self.intermediate_dir = intermediate_dir
         self.name = os.path.basename(file).rsplit('.', 1)[0]
         self.output_dir = output_dir
+        self.page_dir = os.path.join(output_dir, 'pages')
 
 
 class OCRPipeline(WorkflowRunner):
-    def __init__(self, input_dir, lang, output_dir, binarize, intermediate_dir,
-                 n_cores, zip):
+    def __init__(self, input_dir, lang, output_dir, binarize, zip):
         self.input_dir = input_dir
         self.lang = lang
         self.output_dir = output_dir
         self.binarize = binarize
-        if intermediate_dir is None:
-            self.intermediate_dir = os.path.join(output_dir, 'tmp')
-        else:
-            self.intermediate_dir = tempfile.mkdtemp(dir=intermediate_dir)
-        self.n_cores = n_cores
-        if zip is None:
-            self.zip = zip
-        else:
-            if zip.lower().endswith('.zip'):
-                # Remove .zip file extension if provided
-                self.zip = zip[:-4]
-                self.zip = self.zip if self.zip else 'output'
-            else:
-                self.zip = zip
-        self.jobs = collect_jobs(self.input_dir,
-                                 self.output_dir,
-                                 self.intermediate_dir)
+        self.zip = zip
+        self.jobs = collect_jobs(self.input_dir, self.output_dir)
 
     def workflow(self):
         if not self.jobs:
@@ -108,10 +52,7 @@ class OCRPipeline(WorkflowRunner):
         '''
         setup_output_directory_tasks = []
         for i, job in enumerate(self.jobs):
-            cmd = 'mkdir'
-            cmd += ' -p'
-            cmd += ' "{}"'.format(job.intermediate_dir)
-            cmd += ' "{}"'.format(os.path.join(job.output_dir, 'poco'))
+            cmd = 'mkdir -p "{}"'.format(job.page_dir)
             lbl = 'setup_output_directory_-_{}'.format(i)
             task = self.addTask(command=cmd, label=lbl)
             setup_output_directory_tasks.append(task)
@@ -122,10 +63,10 @@ class OCRPipeline(WorkflowRunner):
         ' ##################################################
         '''
        split_input_tasks = []
-        n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
+        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
         for i, job in enumerate(self.jobs):
             input_file = job.file
-            output_file = '{}/page-%d.tif'.format(job.intermediate_dir)
+            output_file = '{}/page-%d.tif'.format(job.page_dir)
             cmd = 'gs'
             cmd += ' -dBATCH'
             cmd += ' -dNOPAUSE'
@@ -138,15 +79,24 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' "{}"'.format(input_file)
             deps = 'setup_output_directory_-_{}'.format(i)
             lbl = 'split_input_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl,
+                                nCores=n_cores)
             split_input_tasks.append(task)
 
         if self.binarize:
             '''
-            ' The binarization_tasks list is created based on the output files
-            ' of the split_tasks. So wait until they are finished.
+            ' ##################################################
+            ' # pre binarization                               #
+            ' ##################################################
             '''
-            self.waitForTasks()
+            pre_binarization_tasks = []
+            for i, job in enumerate(self.jobs):
+                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
+                cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
+                deps = 'split_input_-_{}'.format(i)
+                lbl = 'pre_binarization_-_{}'.format(i)
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+                pre_binarization_tasks.append(task)
 
             '''
             ' ##################################################
@@ -154,52 +104,55 @@ class OCRPipeline(WorkflowRunner):
             ' ##################################################
             '''
             binarization_tasks = []
-            '''
-            ' We run ocropus-nlbin with either four or, if there are less than
-            ' four cores available for this workflow, the available core
-            ' number.
-            '''
-            n_cores = min(4, self.n_cores)
+            n_cores = self.getNCores()
+            mem_mb = self.getMemMb()
             for i, job in enumerate(self.jobs):
-                input_dir = job.intermediate_dir
-                output_dir = job.intermediate_dir
-                files = filter(lambda x: x.endswith('.tif'), os.listdir(input_dir))  # noqa
-                files = natsorted(files)
-                files = map(lambda x: os.path.join(input_dir, x), files)
-                cmd = 'ocropus-nlbin "{}"'.format('" "'.join(files))
+                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
+                cmd = 'ocropus-nlbin "@{}"'.format(input_file)
                 cmd += ' --nocheck'
-                cmd += ' --output "{}"'.format(output_dir)
+                cmd += ' --output "{}"'.format(job.page_dir)
                 cmd += ' --parallel "{}"'.format(n_cores)
-                print(cmd)
-                deps = 'split_input_-_{}'.format(i)
+                deps = 'pre_binarization_-_{}'.format(i)
                 lbl = 'binarization_-_{}'.format(i)
-                task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl,
+                                    memMb=mem_mb, nCores=n_cores)
                 binarization_tasks.append(task)
-            self.waitForTasks()
 
             '''
             ' ##################################################
-            ' # Renaming of binarization output files          #
+            ' # post binarization                              #
             ' ##################################################
             '''
+            post_binarization_tasks = []
             for i, job in enumerate(self.jobs):
-                input_dir = job.intermediate_dir
-                output_dir = job.intermediate_dir
-                files = filter(lambda x: x.endswith('.bin.png'), os.listdir(input_dir))  # noqa
-                for file in files:
-                    # int conversion is done in order to trim leading zeros
-                    page_number = int(file.split('.', 1)[0])
-                    output_file = 'page-{}.bin.png'.format(page_number)
-                    os.rename(os.path.join(output_dir, file),
-                              os.path.join(output_dir, output_file))
+                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
+                cmd = 'rm "{}"'.format(input_file)
+                cmd += ' && '
+                cmd += 'cd "{}"'.format(job.page_dir)
+                cmd += ' && '
+                cmd += 'rm *.{nrm.png,tif}'
+                cmd += ' && '
+                cmd += 'rename \'s/^0*/page-/\' *'
+                cmd += ' && '
+                cmd += 'cd -'
+                deps = 'binarization_-_{}'.format(i)
+                lbl = 'post_binarization_-_{}'.format(i)
+                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+                post_binarization_tasks.append(task)
 
         '''
-        ' The ocr_tasks are created based on the output files of either the
-        ' split_tasks or binarization_tasks. So wait until they are
-        ' finished.
+        ' ##################################################
+        ' # pre ocr                                        #
+        ' ##################################################
         '''
-        self.waitForTasks()
+        pre_ocr_tasks = []
+        for i, job in enumerate(self.jobs):
+            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
+            cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
+            deps = 'post_binarization_-_{}'.format(i) if self.binarize else 'split_input_-_{}'.format(i)  # noqa
+            lbl = 'pre_ocr_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
+            pre_ocr_tasks.append(task)
 
         '''
         ' ##################################################
@@ -207,157 +160,51 @@ class OCRPipeline(WorkflowRunner):
        ' ##################################################
         '''
         ocr_tasks = []
+        n_cores = min(4, self.getNCores())
+        mem_mb = min(n_cores * 2048, self.getMemMb())
         for i, job in enumerate(self.jobs):
-            input_dir = job.intermediate_dir
-            output_dir = job.intermediate_dir
-            files = os.listdir(input_dir)
-            if self.binarize:
-                deps = 'binarization_-_{}'.format(i)
-                files = filter(lambda x: x.endswith('.bin.png'), files)
-            else:
-                deps = 'split_input_-_{}'.format(i)
-                files = filter(lambda x: x.endswith('.tif'), files)
-            files = natsorted(files)
-            files = map(lambda x: os.path.join(input_dir, x), files)
-            for j, file in enumerate(files):
-                if self.binarize:
-                    output_file_base = os.path.join(output_dir, file.rsplit('.', 2)[0])  # noqa
-                else:
-                    output_file_base = os.path.join(output_dir, file.rsplit('.', 1)[0])  # noqa
-                cmd = 'tesseract "{}" "{}"'.format(file, output_file_base)
-                cmd += ' -l "{}"'.format(self.lang)
-                cmd += ' hocr pdf txt'
-                cmd += ' && '
-                cmd += 'sed -i \'s+{}/++g\' "{}".hocr'.format(input_dir, output_file_base)  # noqa
-                lbl = 'ocr_-_{}-{}'.format(i, j)
-                task = self.addTask(command=cmd, dependencies=deps, label=lbl, env={"OMP_THREAD_LIMIT": "1"})  # noqa
-                ocr_tasks.append(task)
-        '''
-        ' The following jobs are created based on the output files of the
-        ' ocr_tasks. So wait until they are finished.
-        '''
-        self.waitForTasks()
+            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
+            output_file_base = os.path.join(job.output_dir, job.name)
+            cmd = 'tesseract "{}" "{}"'.format(input_file, output_file_base)
+            cmd += ' -l "{}"'.format(self.lang)
+            cmd += ' hocr pdf txt'
+            deps = 'pre_ocr_-_{}'.format(i)
+            lbl = 'ocr_-_{}'.format(i)
+            task = self.addTask(command=cmd, dependencies=deps,
+                                env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
+                                label=lbl, memMb=mem_mb, nCores=n_cores)
+            ocr_tasks.append(task)
 
         '''
         ' ##################################################
-        ' # combined pdf creation                          #
+        ' # post ocr                                       #
         ' ##################################################
         '''
-        combined_pdf_creation_tasks = []
-        n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
+        post_ocr_tasks = []
         for i, job in enumerate(self.jobs):
-            input_dir = job.intermediate_dir
-            output_file = os.path.join(job.output_dir, '{}.pdf'.format(job.name))  # noqa
-            files = filter(lambda x: x.endswith('.pdf'), os.listdir(input_dir))
-            files = natsorted(files)
-            files = map(lambda x: os.path.join(input_dir, x), files)
-            cmd = 'gs'
-            cmd += ' -dBATCH'
-            cmd += ' -dNOPAUSE'
-            cmd += ' -dNumRenderingThreads={}'.format(n_cores)
-            cmd += ' -dPDFSETTINGS=/ebook'
-            cmd += ' -dQUIET'
-            cmd += ' -sDEVICE=pdfwrite'
-            cmd += ' "-sOutputFile={}"'.format(output_file)
-            cmd += ' "{}"'.format('" "'.join(files))
-            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
-            lbl = 'combined_pdf_creation_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl, nCores=n_cores)  # noqa
-            combined_pdf_creation_tasks.append(task)
-        '''
-        ' ##################################################
-        ' # combined txt creation                          #
-        ' ##################################################
-        '''
-        combined_txt_creation_tasks = []
-        for i, job in enumerate(self.jobs):
-            input_dir = job.intermediate_dir
-            output_file = os.path.join(job.output_dir, '{}.txt'.format(job.name))  # noqa
-            files = filter(lambda x: x.endswith('.txt'), os.listdir(input_dir))
-            files = natsorted(files)
-            files = map(lambda x: os.path.join(input_dir, x), files)
-            cmd = 'cat "{}" > "{}"'.format('" "'.join(files), output_file)
-            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
-            lbl = 'combined_txt_creation_-_{}'.format(i)
+            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
+            output_file_base = os.path.join(job.output_dir, job.name)
+            cmd = 'rm "{}"'.format(input_file)
+            cmd += ' && '
+            cmd += 'sed -i \'s+{}+pages+g\' "{}.hocr"'.format(job.page_dir, output_file_base)  # noqa
+            deps = 'ocr_-_{}'.format(i)
+            lbl = 'post_ocr_-_{}'.format(i)
             task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            combined_txt_creation_tasks.append(task)
+            post_ocr_tasks.append(task)
 
         '''
         ' ##################################################
-        ' # tei p5 creation                                #
+        ' # hocr to tei                                    #
         ' ##################################################
         '''
-        tei_p5_creation_tasks = []
+        hocr_to_tei_tasks = []
         for i, job in enumerate(self.jobs):
-            input_dir = job.intermediate_dir
-            output_file = os.path.join(job.output_dir, '{}.xml'.format(job.name))  # noqa
-            files = filter(lambda x: x.endswith('.hocr'),
-                           os.listdir(input_dir))
-            files = natsorted(files)
-            files = map(lambda x: os.path.join(input_dir, x), files)
-            cmd = 'hocrtotei "{}" "{}"'.format('" "'.join(files),
-                                               output_file)
-            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
-            lbl = 'tei_p5_creation_-_{}'.format(i)
+            output_file_base = os.path.join(job.output_dir, job.name)
+            cmd = 'hocrtotei "{}.hocr" "{}.xml"'.format(output_file_base, output_file_base)  # noqa
+            deps = 'post_ocr_-_{}'.format(i)
+            lbl = 'hocr_to_tei_-_{}'.format(i)
             task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            tei_p5_creation_tasks.append(task)
-        '''
-        ' ##################################################
-        ' # poco bundle creation                           #
-        ' ##################################################
-        '''
-        poco_bundle_creation_tasks = []
-        for i, job in enumerate(self.jobs):
-            input_dir = job.intermediate_dir
-            output_dir = os.path.join(job.output_dir, 'poco')
-            files = os.listdir(input_dir)
-            if self.binarize:
-                files = filter(lambda x: x.endswith(('.bin.png', '.hocr')), files)  # noqa
-            else:
-                files = filter(lambda x: x.endswith(('.tif', '.hocr')), files)
-            files = natsorted(files)
-            files = map(lambda x: os.path.join(input_dir, x), files)
-            cmd = 'mv "{}" "{}"'.format('" "'.join(files), output_dir)
-            deps = filter(lambda x: x.startswith('ocr_-_{}'.format(i)), ocr_tasks)  # noqa
-            deps.append('tei_p5_creation_-_{}'.format(i))
-            lbl = 'poco_bundle_creation_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            poco_bundle_creation_tasks.append(task)
-        '''
-        ' The following jobs are created based on the output files of the
-        ' combined_pdf_creation_tasks. So wait until they are finished.
-        '''
-        self.waitForTasks()
-        '''
-        ' ##################################################
-        ' # cleanup                                        #
-        ' ##################################################
-        '''
-        cleanup_tasks = []
-        for i, job in enumerate(self.jobs):
-            input_dir = job.intermediate_dir
-            cmd = 'rm -r "{}"'.format(input_dir)
-            deps = ['combined_pdf_creation_-_{}'.format(i),
-                    'combined_txt_creation_-_{}'.format(i),
-                    'poco_bundle_creation_-_{}'.format(i),
-                    'tei_p5_creation_-_{}'.format(i)]
-            lbl = 'job_cleanup_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            cleanup_tasks.append(task)
-        input_dir = self.intermediate_dir
-        cmd = 'rm -r "{}"'.format(input_dir)
-        deps = cleanup_tasks
-        lbl = 'pipeline_cleanup'
-        task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-        cleanup_tasks.append(task)
-        self.waitForTasks()
+            hocr_to_tei_tasks.append(task)
 
         '''
         ' ##################################################
@@ -376,7 +223,7 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
             cmd += ' && '
             cmd += 'cd -'
-            deps = combined_pdf_creation_tasks + combined_txt_creation_tasks + poco_bundle_creation_tasks  # noqa
+            deps = hocr_to_tei_tasks
             lbl = 'zip_creation_-_all'
             task = self.addTask(command=cmd, dependencies=deps, label=lbl)
             zip_creation_tasks.append(task)
@@ -390,7 +237,7 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' -i "*.pdf"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = combined_pdf_creation_tasks
+            deps = ocr_tasks
             lbl = 'zip_creation_-_pdf'
             task = self.addTask(command=cmd, dependencies=deps, label=lbl)
             zip_creation_tasks.append(task)
@@ -404,7 +251,7 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' -i "*.txt"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = combined_txt_creation_tasks
+            deps = ocr_tasks
             lbl = 'zip_creation_-_txt'
             task = self.addTask(command=cmd, dependencies=deps, label=lbl)
             zip_creation_tasks.append(task)
@@ -418,7 +265,7 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' -i "*.xml"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = tei_p5_creation_tasks
+            deps = hocr_to_tei_tasks
             lbl = 'zip_creation_-_xml'
             task = self.addTask(command=cmd, dependencies=deps, label=lbl)
             zip_creation_tasks.append(task)
@@ -432,37 +279,80 @@ class OCRPipeline(WorkflowRunner):
             cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
             cmd += ' && '
             cmd += 'cd -'
-            deps = poco_bundle_creation_tasks
+            deps = post_ocr_tasks
             lbl = 'zip_creation_-_poco'
             task = self.addTask(command=cmd, dependencies=deps, label=lbl)
             zip_creation_tasks.append(task)
 
 
-def collect_jobs(input_dir, output_dir, intermediate_dir):
+def collect_jobs(input_dir, output_dir):
     jobs = []
     for file in os.listdir(input_dir):
         if os.path.isdir(os.path.join(input_dir, file)):
             jobs += collect_jobs(os.path.join(input_dir, file),
-                                 os.path.join(output_dir, file),
-                                 os.path.join(intermediate_dir, file))
+                                 os.path.join(output_dir, file))
         elif file.lower().endswith('.pdf'):
             job = OCRPipelineJob(os.path.join(input_dir, file),
-                                 os.path.join(output_dir, file),
-                                 os.path.join(intermediate_dir, file))
+                                 os.path.join(output_dir, file))
             jobs.append(job)
     return jobs
 
 
+def parse_args():
+    parser = ArgumentParser(description='OCR pipeline for PDF file processing',
+                            prog='OCR pipeline')
+    parser.add_argument('-i', '--input-dir',
+                        help='Input directory',
+                        required=True)
+    parser.add_argument('-o', '--output-dir',
+                        help='Output directory',
+                        required=True)
+    parser.add_argument('-l', '--language',
+                        choices=list(map(lambda x: x[:-12], filter(lambda x: x.endswith('.traineddata'), os.listdir('/usr/local/share/tessdata')))),  # noqa
+                        help='Language of the input '
+                             '(3-character ISO 639-2 language codes)',
+                        required=True)
+    parser.add_argument('--binarize',
+                        action='store_true',
+                        help='Add binarization as a preprocessing step')
+    parser.add_argument('--log-dir',
+                        help='Logging directory')
+    parser.add_argument('--mem-mb',
+                        help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa
+                        type=int)
+    parser.add_argument('--n-cores',
+                        default=min(4, multiprocessing.cpu_count()),
+                        help='Number of CPU threads to be used (Default: min(4, available CPU cores))',  # noqa
+                        type=int)
+    parser.add_argument('--zip',
+                        help='Create one zip file per filetype')
+    parser.add_argument('-v', '--version',
+                        action='version',
+                        help='Returns the current version of the OCR pipeline',
+                        version='%(prog)s {}'.format(__version__))
+    args = parser.parse_args()
+
+    # Set some tricky default values and check for insufficient input
+    if args.log_dir is None:
+        args.log_dir = args.output_dir
+    if args.n_cores < 1:
+        raise Exception('--n-cores must be greater or equal 1')
+    if args.mem_mb is None:
+        max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
+        args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
+    if args.mem_mb < 2048:
+        raise Exception('--mem-mb must be greater or equal 2048')
+    if args.zip is not None and args.zip.lower().endswith('.zip'):
+        # Remove .zip file extension if provided
+        args.zip = args.zip[:-4]
+        args.zip = args.zip if args.zip else 'output'
+    return args
+
+
 def main():
     args = parse_args()
-    ocr_pipeline = OCRPipeline(args.input_directory, args.language,
-                               args.output_directory, args.binarize,
-                               args.intermediate_directory, args.n_cores,
-                               args.zip)
-    retval = ocr_pipeline.run(
-        dataDirRoot=(args.log_dir or args.output_directory),
-        nCores=args.n_cores
-    )
+    ocr_pipeline = OCRPipeline(args.input_dir, args.language, args.output_dir, args.binarize, args.zip)  # noqa
+    retval = ocr_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa
     sys.exit(retval)
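The globbing mentioned in the commit title boils down to the new pre-binarization and pre-ocr tasks: each job writes a version-sorted listing of its page directory with `ls -dv`, and the tools then consume that list file directly, Tesseract as a plain text file of image paths and ocropus-nlbin through the `@file` argument syntax shown in the new command, so the Python-side `natsorted`/`filter`/`map` plumbing disappears. A shell sketch of the same sequence, with illustrative paths in place of `job.page_dir` and `job.output_dir`:

``` bash
pages="/output/doc/pages"               # illustrative stand-in for job.page_dir
list="/output/doc/ocr_input_files.txt"

ls -dv "${pages}/"* >> "${list}"        # -v version-sorts, so page-2 comes before page-10
tesseract "${list}" "/output/doc/doc" -l eng hocr pdf txt
rm "${list}"
```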

wrapper/ocr

@@ -1,43 +1,43 @@
 #!/usr/bin/env python3
 # coding=utf-8
 
-"""A wrapper to execute the OCR pipeline in a Docker container"""
+"""A wrapper to execute the OCR pipeline in a Docker container."""
 
 from argparse import ArgumentParser
 import os
 import subprocess
+import sys
 
-CONTAINER_IMAGE_TAG = '1.0.0'
-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:{}'.format(CONTAINER_IMAGE_TAG)  # noqa
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0'
 CONTAINER_INPUT_DIR = '/input'
-CONTAINER_INTERMEDIATE_DIR = '/intermediate'
+CONTAINER_LOG_DIR = '/logs'
 CONTAINER_OUTPUT_DIR = '/output'
 UID = str(os.getuid())
 GID = str(os.getgid())
 
 parser = ArgumentParser(add_help=False)
-parser.add_argument('-i', '--input-directory')
-parser.add_argument('-o', '--output-directory')
-parser.add_argument('--intermediate-directory')
+parser.add_argument('-i', '--input-dir')
+parser.add_argument('-o', '--output-dir')
+parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()
 
 cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
-if args.intermediate_directory is not None:
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.intermediate_directory),
-                                 CONTAINER_INTERMEDIATE_DIR)]
-    remaining_args.insert(0, CONTAINER_INTERMEDIATE_DIR)
-    remaining_args.insert(0, '--intermediate-directory')
-if args.output_directory is not None:
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
-                                 CONTAINER_OUTPUT_DIR)]
-    remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
-    remaining_args.insert(0, '-o')
-if args.input_directory is not None:
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory),
+if args.log_dir is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.log_dir),
+                                 CONTAINER_LOG_DIR)]
+    remaining_args.insert(0, CONTAINER_LOG_DIR)
+    remaining_args.insert(0, '--log-dir')
+if args.input_dir is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_dir),
                                  CONTAINER_INPUT_DIR)]
     remaining_args.insert(0, CONTAINER_INPUT_DIR)
     remaining_args.insert(0, '-i')
+if args.output_dir is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_dir),
+                                 CONTAINER_OUTPUT_DIR)]
+    remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
+    remaining_args.insert(0, '-o')
 cmd.append(CONTAINER_IMAGE)
 cmd += remaining_args
-subprocess.run(cmd)
+sys.exit(subprocess.run(cmd).returncode)
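With the final line changed to `sys.exit(subprocess.run(cmd).returncode)`, the wrapper now propagates the container's exit status to the calling shell, so failures become scriptable; a hypothetical check with placeholder paths:

``` bash
ocr -i input -o output -l eng || echo "OCR pipeline failed with exit status $?"
```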