Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
Update to Tesseract 5.0.0, Set version 0.1.0
This commit is contained in: parent a0760487ae, commit e1b78b6ba4
Dockerfile (39 changed lines)

@@ -9,8 +9,14 @@ ENV LANG=C.UTF-8
 RUN apt-get update \
  && apt-get install --no-install-recommends --yes \
-     wget
+     ghostscript \
+     procps \
+     python3.7 \
+     python3-pip \
+     rename \
+     wget \
+     zip \
+ && python3 -m pip install lxml
 
 
 # Install the OCR pipeline and it's dependencies #
 ## Install pyFlow ##
@@ -43,7 +49,7 @@ RUN wget --no-check-certificate --quiet \
 
 
 ## Install Tesseract OCR ##
-ENV TESSERACT_VERSION=4.1.1
+ENV TESSERACT_VERSION=5.0.0
 RUN wget --no-check-certificate --quiet \
      "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \
  && tar -xzf "${TESSERACT_VERSION}.tar.gz" \
@@ -61,37 +67,20 @@ RUN wget --no-check-certificate --quiet \
      pkg-config \
      zlib1g-dev \
  && ./autogen.sh \
- && ./configure \
+ && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \
  && make \
  && make install \
  && ldconfig \
  && cd - > /dev/null \
  && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz"
 
-ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa"
-ENV TESSDATA_BEST_VERSION=4.1.0
-RUN wget --no-check-certificate --quiet \
-     "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \
- && tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \
- && for tesseract_model in $(echo ${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \
- && rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz"
-
-
-## Further dependencies ##
-RUN apt-get install --no-install-recommends --yes \
-     procps \
-     ghostscript \
-     python3.7 \
-     rename \
-     zip
-
-
-## Install Pipeline ##
-COPY hocrtotei ocr /usr/local/bin/
-
-
 RUN rm -r /var/lib/apt/lists/*
 
 
+## Install Pipeline ##
+COPY hocr2tei hocr-combine ocr /usr/local/bin/
+
+
 ENTRYPOINT ["ocr"]
 CMD ["--help"]
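Not part of the commit, but a quick sanity check after building: since the image now compiles Tesseract 5.0.0 from source and no longer bakes in tessdata_best models, the compiled binary can be queried directly (the tag below follows the README's build step):

```bash
# Build the image (tag taken from the updated README)
docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0 .

# Override the "ocr" entrypoint to ask the compiled Tesseract for its
# version; it should report 5.0.0
docker run --rm --entrypoint tesseract \
    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0 --version
```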
LICENSE (new file, 21 lines)

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Bielefeld University - CRC 1288 - INF
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md (43 changed lines)

@@ -1,6 +1,6 @@
 # OCR - Optical Character Recognition
 
-This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided.
+This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed.
 
 ## Software used in this pipeline implementation
 
@@ -8,37 +8,26 @@ This software implements a heavily parallelized pipeline to recognize text in PDF
 - Software from Debian Buster's free repositories
 - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3
 - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20
-- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1
-- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0
+- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0
 
-## Use this image
+## Installation
 
-1. Create input and output directories for the pipeline.
-``` bash
-mkdir -p /<my_data_location>/input /<my_data_location>/output
-```
+1. Install Docker and Python 3.
+2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git`
+2. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0 ocr`
+2. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`.
+3. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,models,output}`.
+4. Place your Tesseract OCR model(s) inside `/<my_data_location>/models`.
 
-2. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
+## Use the Pipeline
 
+1. Place your PDF files inside `/<my_data_location>/input`. Files should all contain text of the same language.
+2. Clear your `/<my_data_location>/output` directory.
 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
-```
-# Option one: Use the wrapper script
-## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/development/wrapper/ocr, make it executeable and add it to your ${PATH}
+```bash
 cd /<my_data_location>
-ocr -i input -l <language_code> -o output <optional_pipeline_arguments>
+ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments>
 
-# Option two: Classic Docker style
-docker run \
-    --rm \
-    -it \
-    -u $(id -u $USER):$(id -g $USER) \
-    -v /<my_data_location>/input:/input \
-    -v /<my_data_location>/output:/output \
-    gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:development \
-        -i /ocr_pipeline/input \
-        -l <language_code> \
-        -o /ocr_pipeline/output \
-        <optional_pipeline_arguments>
+# or
+ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments>
 ```
 
 4. Check your results in the `/<my_data_location>/output` directory.
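Because the image no longer ships any .traineddata files, a model has to be placed in the new `models` directory before the first run. A sketch of the full setup, assuming the tessdata_best download URL and the `eng` model (both illustrative, not part of the commit):

```bash
mkdir -p /<my_data_location>/{input,models,output}

# Fetch a recognition model, e.g. English from tessdata_best 4.1.0
wget -P /<my_data_location>/models \
    https://github.com/tesseract-ocr/tessdata_best/raw/4.1.0/eng.traineddata

cd /<my_data_location>
ocr -i input -o output -m models/eng.traineddata -l eng
```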
hocr-combine (new file, 35 lines)

@@ -0,0 +1,35 @@
+#!/usr/bin/env python3.7
+# coding=utf-8
+
+""""Combine multiple hOCR files."""
+
+from argparse import ArgumentParser
+from lxml import html
+
+
+parser = ArgumentParser(description='Combine multiple hOCR files.')
+parser.add_argument('file', help='Input file(s)', nargs='+')
+parser.add_argument('-o', '--output-file', help='Output file', required=True)
+args = parser.parse_args()
+
+
+for file in args.file:
+    files = []
+    if file.startswith('@'):
+        with open(file[1:], 'r') as f:
+            files += [x for x in f.read().split("\n") if x != '']
+    else:
+        files.append(file)
+if len(files) == 0:
+    exit(1)
+
+
+hocr = html.parse(files[0])
+hocr_body = hocr.find('body')
+for file in files[1:]:
+    for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
+        hocr_body.append(ocr_page)
+
+
+with open(args.output_file, 'wb') as f:
+    hocr.write(f, encoding='UTF-8', method='html')
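The new script's argparse interface takes one or more positional input files, where an argument prefixed with `@` names a list file with one hOCR path per line, plus a required `--output-file`. A usage sketch (file names are illustrative):

```bash
# Combine explicitly named hOCR files
hocr-combine page-1.hocr page-2.hocr page-3.hocr -o combined.hocr

# Or combine via a list file, as the pipeline's CreateHOCRWorkflow does
ls -dv hocr/* > inputs.txt
hocr-combine @inputs.txt -o combined.hocr
```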
hocrtotei → hocr2tei (39 changed lines, Executable file → Normal file)

@@ -3,16 +3,18 @@
 
 """"Convert hOCR to TEI XML."""
 
-from xml.sax.saxutils import escape
 from argparse import ArgumentParser
+from lxml import html
+from xml.sax.saxutils import escape
 import re
-import xml.etree.ElementTree as ET
 
 parser = ArgumentParser(description='Convert hOCR to TEI XML.')
-parser.add_argument('input', metavar='Path to hOCR input file')
-parser.add_argument('output', metavar='Path to TEI output file')
+parser.add_argument('file', help='Input file')
+parser.add_argument('-o', '--output-file', help='Output file', required=True)
 args = parser.parse_args()
 
 
 tei = ''
 tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
 tei += ' <teiHeader>\n'
@@ -30,28 +32,27 @@ tei += ' </fileDesc>\n'
 tei += ' </teiHeader>\n'
 tei += ' <text>\n'
 tei += ' <body>\n'
-# Conversion start
-hocr = ET.parse(args.input)
-for page in hocr.findall('.//*[@class="ocr_page"]'):
-    page_properties = page.attrib.get('title')
-    facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1)
-    page_number = re.search(r'ppageno (\d+)', page_properties).group(1)
-    tei += ' <pb facs="{}" n="{}"/>\n'.format(facsimile, page_number)
-    for para in page.findall('.//*[@class="ocr_par"]'):
+hocr = html.parse(args.file)
+for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
+    ocr_page_title_attrib = ocr_page.attrib.get('title')
+    facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
+    page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)
+    tei += f' <pb facs="{facsimile}" n="{page_number}"/>\n'
+    for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
         tei += ' <p>\n'
-        for line in para.findall('.//*[@class="ocr_line"]'):
+        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
             tei += ' <lb/>'
             indent = ''
-            for word in line.findall('.//*[@class="ocrx_word"]'):
-                if word.text is not None:
-                    tei += indent + escape(word.text.strip())
+            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
+                if ocrx_word.text is not None:
+                    tei += indent + escape(ocrx_word.text)
                     indent = ' '
             tei += '\n'
         tei += ' </p>\n'
-# Conversion end
 tei += ' </body>\n'
 tei += ' </text>\n'
 tei += '</TEI>\n'
 
-with open(args.output, 'w') as tei_file:
-    tei_file.write(tei)
+with open(args.output_file, 'w') as f:
+    f.write(tei)
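The renamed script drops the positional output argument in favour of a required `--output-file` option; a minimal invocation (file names are illustrative):

```bash
# Convert a combined hOCR file into TEI XML
hocr2tei document.hocr -o document.xml
```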
ocr (719 changed lines)

@@ -1,11 +1,9 @@
 #!/usr/bin/env python2.7
 # coding=utf-8
 
-"""OCR pipeline for PDF file processing."""
-
-__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
-             'Stephan Porada <porada@posteo.de>'
-__version__ = '1.0.0'
+''' OCR pipeline for PDF file processing. '''
+__version__ = '0.1.0'
 
 from argparse import ArgumentParser
 from pyflow import WorkflowRunner
@@ -14,145 +12,402 @@ import os
 import sys
 
 
-class OCRPipelineJob:
-    """An OCR pipeline job class
+class PipelineJob:
+    '''
+    OCR pipeline job class.
 
     Each input file of the pipeline is represented as an OCR pipeline job,
     which holds all necessary information for the pipeline to process it.
 
     Arguments:
     file -- Path to the file
-    output_dir -- Path to a directory, where job results a stored
-    """
+    output_dir -- Path to a directory, where job results are stored
+    '''
 
     def __init__(self, file, output_dir):
         self.file = file
-        self.name = os.path.basename(file).rsplit('.', 1)[0]
+        self.name = os.path.basename(file)[:-4]
         self.output_dir = output_dir
-        self.page_dir = os.path.join(output_dir, 'pages')
+        self.tmp_dir = os.path.join(output_dir, 'tmp')
 
 
-class OCRPipeline(WorkflowRunner):
-    def __init__(self, input_dir, lang, output_dir, binarize, zip):
+class SplitInputWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # gs #
+        ' ##################################################
+        '''
+        n_cores = min(2, self.getNCores())
+        mem_mb = min(n_cores * 512, self.getMemMb())
+        cmd = 'gs'
+        cmd += ' -dBATCH'
+        cmd += ' -dNOPAUSE'
+        cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000)
+        cmd += ' -dNumRenderingThreads={}'.format(n_cores)
+        cmd += ' -dQUIET'
+        cmd += ' -r300'
+        cmd += ' -sDEVICE=png16m'
+        cmd += ' -sOutputFile="{}/page-%d.png"'.format(
+            os.path.join(self.job.tmp_dir, 'images')
+        )
+        cmd += ' "{}"'.format(self.job.file)
+        self.addTask(
+            'gs',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class BinarizationWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # ocropus-nlbin #
+        ' ##################################################
+        '''
+        # TODO: Update to newer ocropus-nlbin and start one task per page
+        n_cores = self.getNCores()
+        mem_mb = min(512 * n_cores, self.getMemMb())
+        cmd = 'ls -dv "{}/"* > "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images'),
+            os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
+        )
+        cmd += ' && '
+        cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa
+        cmd += ' --nocheck'
+        cmd += ' --output "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images'))
+        cmd += ' --parallel "{}"'.format(n_cores)
+        cmd += ' && '
+        cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt'))  # noqa
+        ocropus_nlbin_task = self.addTask(
+            'ocropus_nlbin',
+            command=cmd,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+        '''
+        ' ##################################################
+        ' # cleanup #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(128, self.getMemMb())
+        cmd = 'cd "{}"'.format(os.path.join(self.job.tmp_dir, 'images'))
+        cmd += ' && '
+        cmd += 'mkdir tmp'
+        cmd += ' && '
+        cmd += 'mv *.bin.png tmp'
+        cmd += ' && '
+        cmd += 'rm *.png'
+        cmd += ' && '
+        cmd += 'mv tmp/* .'
+        cmd += ' && '
+        cmd += 'rmdir tmp'
+        cmd += ' && '
+        cmd += 'rename \'s/^0*/page-/\' *'
+        cmd += ' && '
+        cmd += 'rename \'s/.bin.png$/.png/\' *'
+        cmd += ' && '
+        cmd += 'cd -'
+        self.addTask(
+            'cleanup',
+            command=cmd,
+            dependencies=ocropus_nlbin_task,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class OCRWorkflow(WorkflowRunner):
+    def __init__(self, job, lang):
+        self.job = job
+        self.lang = lang
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # tesseract #
+        ' ##################################################
+        '''
+        tesseract_tasks = []
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'images'))):  # noqa
+            cmd = 'tesseract "{}" "{}"'.format(
+                os.path.join(self.job.tmp_dir, 'images', file),
+                os.path.join(self.job.tmp_dir, file[:-4])
+            )
+            cmd += ' -l "{}"'.format(self.lang)
+            cmd += ' hocr pdf txt'
+            cmd += ' || '
+            cmd += 'echo "${?}"'
+            task = self.addTask(
+                'tesseract_-_{}'.format(i),
+                command=cmd,
+                env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
+                memMb=mem_mb,
+                nCores=n_cores
+            )
+            tesseract_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # move_files #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(128, self.getMemMb())
+        for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
+            cmd = 'mv "{}/"*.{} "{}"'.format(
+                self.job.tmp_dir,
+                file_extension,
+                os.path.join(self.job.tmp_dir, file_extension)
+            )
+            self.addTask(
+                'move_{}_files'.format(file_extension),
+                command=cmd,
+                dependencies=tesseract_tasks,
+                memMb=mem_mb,
+                nCores=n_cores
+            )
+        cmd = 'mv "{}" "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'images'),
+            os.path.join(self.job.output_dir)
+        )
+        self.addTask(
+            'move_image_files',
+            command=cmd,
+            dependencies=tesseract_tasks,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class CreateHOCRWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # fix-hocr #
+        ' ##################################################
+        '''
+        fix_hocr_tasks = []
+        n_cores = 1
+        mem_mb = min(256, self.getMemMb())
+        for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'hocr'))):  # noqa
+            cmd = 'sed -i \'s>{}>images>g\' "{}"'.format(
+                os.path.join(self.job.tmp_dir, 'images'),
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>ppageno [0-9]\\+>ppageno {}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>page_[0-9]\\+>page_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>block_[0-9]\\+>block_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>par_[0-9]\\+>par_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>line_[0-9]\\+>line_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            cmd += ' && '
+            cmd += 'sed -i \'s>word_[0-9]\\+>word_{}>g\' "{}"'.format(
+                file[5:-5],
+                os.path.join(self.job.tmp_dir, 'hocr', file)
+            )
+            task = self.addTask(
+                'fix-hocr_-_{}'.format(i),
+                command=cmd,
+                memMb=mem_mb,
+                nCores=n_cores
+            )
+            fix_hocr_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # hocr-combine #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        cmd = 'ls -dv "{}/"* > "{}"'.format(
+            os.path.join(self.job.tmp_dir, 'hocr'),
+            os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
+        )
+        cmd += ' && '
+        cmd += 'hocr-combine "@{}"'.format(
+            os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
+        )
+        cmd += ' --output-file "{}.hocr"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        cmd += ' && '
+        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'hocr'))
+        self.addTask(
+            'hocr_combine',
+            command=cmd,
+            dependencies=fix_hocr_tasks,
+            memMb=mem_mb,
+            nCores=n_cores
+        )
+
+
+class CreatePDFWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # pdf_combine #
+        ' ##################################################
+        '''
+        n_cores = min(2, self.getNCores())
+        mem_mb = min(n_cores * 256, self.getMemMb())
+        cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'pdf'))
+        cmd += ' | '
+        cmd += 'xargs gs'
+        cmd += ' -dBATCH'
+        cmd += ' -dNOPAUSE'
+        cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000)
+        cmd += ' -dNumRenderingThreads={}'.format(n_cores)
+        cmd += ' -dPDFSETTINGS=/ebook'
+        cmd += ' -dQUIET'
+        cmd += ' -sDEVICE=pdfwrite'
+        cmd += ' -sOutputFile="{}.pdf"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        cmd += ' && '
+        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
+        self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
+
+
+class CreateTEIWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # hocr2tei #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        cmd = 'hocr2tei "{}.hocr"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        cmd += ' --output-file "{}.xml"'.format(
+            os.path.join(self.job.output_dir, self.job.name)
+        )
+        self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
+
+
+class CreateTxtWorkflow(WorkflowRunner):
+    def __init__(self, job):
+        self.job = job
+
+    def workflow(self):
+        '''
+        ' ##################################################
+        ' # txt_combine #
+        ' ##################################################
+        '''
+        n_cores = 1
+        mem_mb = min(512, self.getMemMb())
+        cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'txt'))
+        cmd += ' | '
+        cmd += 'xargs cat'
+        cmd += ' > '
+        cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name))  # noqa
+        cmd += ' && '
+        cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
+        self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
+
+
+class MainWorkflow(WorkflowRunner):
+    def __init__(self, input_dir, lang, output_dir, binarize):
         self.input_dir = input_dir
         self.lang = lang
         self.output_dir = output_dir
         self.binarize = binarize
-        self.zip = zip
-        self.jobs = collect_jobs(self.input_dir, self.output_dir)
+        self.jobs = self.collect_jobs()
+
+    def collect_jobs(self):
+        jobs = []
+        for file in os.listdir(self.input_dir):
+            if os.path.isdir(os.path.join(self.input_dir, file)):
+                continue
+            if file.lower().endswith('.pdf'):
+                job = PipelineJob(
+                    os.path.join(self.input_dir, file),
+                    os.path.join(self.output_dir, file)
+                )
+                jobs.append(job)
+        return jobs
 
     def workflow(self):
         if not self.jobs:
             return
 
-        '''
-        ' ##################################################
-        ' # setup output directory #
-        ' ##################################################
-        '''
-        setup_output_directory_tasks = []
-        for i, job in enumerate(self.jobs):
-            cmd = 'mkdir -p "{}"'.format(job.page_dir)
-            lbl = 'setup_output_directory_-_{}'.format(i)
-            task = self.addTask(command=cmd, label=lbl)
-            setup_output_directory_tasks.append(task)
+        # Create output and temporary directories
+        for job in self.jobs:
+            os.mkdir(job.output_dir)
+            os.mkdir(job.tmp_dir)
+            os.mkdir(os.path.join(job.tmp_dir, 'hocr'))
+            os.mkdir(os.path.join(job.tmp_dir, 'pdf'))
+            os.mkdir(os.path.join(job.tmp_dir, 'images'))
+            os.mkdir(os.path.join(job.tmp_dir, 'txt'))
 
         '''
         ' ##################################################
-        ' # split input #
+        ' # split-input #
         ' ##################################################
        '''
-        split_input_tasks = []
-        n_cores = max(1, int(self.getNCores() / len(self.jobs)))
        for i, job in enumerate(self.jobs):
-            input_file = job.file
-            output_file = '{}/page-%d.tif'.format(job.page_dir)
-            cmd = 'gs'
-            cmd += ' -dBATCH'
-            cmd += ' -dNOPAUSE'
-            cmd += ' -dNumRenderingThreads={}'.format(n_cores)
-            cmd += ' -dQUIET'
-            cmd += ' -r300'
-            cmd += ' -sDEVICE=tiff24nc'
-            cmd += ' -sCompression=lzw'
-            cmd += ' "-sOutputFile={}"'.format(output_file)
-            cmd += ' "{}"'.format(input_file)
-            deps = 'setup_output_directory_-_{}'.format(i)
-            lbl = 'split_input_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl,
-                                nCores=n_cores)
-            split_input_tasks.append(task)
+            self.addWorkflowTask(
+                'split_input_-_{}'.format(i),
+                SplitInputWorkflow(job)
+            )
 
         if self.binarize:
-            '''
-            ' ##################################################
-            ' # pre binarization #
-            ' ##################################################
-            '''
-            pre_binarization_tasks = []
-            for i, job in enumerate(self.jobs):
-                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
-                cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
-                deps = 'split_input_-_{}'.format(i)
-                lbl = 'pre_binarization_-_{}'.format(i)
-                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-                pre_binarization_tasks.append(task)
-
             '''
             ' ##################################################
             ' # binarization #
            ' ##################################################
            '''
-            binarization_tasks = []
-            n_cores = self.getNCores()
-            mem_mb = self.getMemMb()
            for i, job in enumerate(self.jobs):
-                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
-                cmd = 'ocropus-nlbin "@{}"'.format(input_file)
-                cmd += ' --nocheck'
-                cmd += ' --output "{}"'.format(job.page_dir)
-                cmd += ' --parallel "{}"'.format(n_cores)
-                deps = 'pre_binarization_-_{}'.format(i)
-                lbl = 'binarization_-_{}'.format(i)
-                task = self.addTask(command=cmd, dependencies=deps, label=lbl,
-                                    memMb=mem_mb, nCores=n_cores)
-                binarization_tasks.append(task)
+                self.addWorkflowTask(
+                    'binarization_-_{}'.format(i),
+                    BinarizationWorkflow(job),
+                    dependencies='split_input_-_{}'.format(i)
+                )
 
-            '''
-            ' ##################################################
-            ' # post binarization #
-            ' ##################################################
-            '''
-            post_binarization_tasks = []
-            for i, job in enumerate(self.jobs):
-                input_file = os.path.join(job.output_dir, 'binarization_input_files.txt')  # noqa
-                cmd = 'rm "{}"'.format(input_file)
-                cmd += ' && '
-                cmd += 'cd "{}"'.format(job.page_dir)
-                cmd += ' && '
-                cmd += 'rm *.{nrm.png,tif}'
-                cmd += ' && '
-                cmd += 'rename \'s/^0*/page-/\' *'
-                cmd += ' && '
-                cmd += 'cd -'
-                deps = 'binarization_-_{}'.format(i)
-                lbl = 'post_binarization_-_{}'.format(i)
-                task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-                post_binarization_tasks.append(task)
-
-        '''
-        ' ##################################################
-        ' # pre ocr #
-        ' ##################################################
-        '''
-        pre_ocr_tasks = []
-        for i, job in enumerate(self.jobs):
-            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
-            cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file)
-            deps = 'post_binarization_-_{}'.format(i) if self.binarize else 'split_input_-_{}'.format(i)  # noqa
-            lbl = 'pre_ocr_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            pre_ocr_tasks.append(task)
 
         '''
         ' ##################################################
@@ -160,175 +415,117 @@ class OCRPipeline(WorkflowRunner):
         ' ##################################################
         '''
         ocr_tasks = []
-        n_cores = min(4, self.getNCores())
-        mem_mb = min(n_cores * 2048, self.getMemMb())
         for i, job in enumerate(self.jobs):
-            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
-            output_file_base = os.path.join(job.output_dir, job.name)
-            cmd = 'tesseract "{}" "{}"'.format(input_file, output_file_base)
-            cmd += ' -l "{}"'.format(self.lang)
-            cmd += ' hocr pdf txt'
-            deps = 'pre_ocr_-_{}'.format(i)
-            lbl = 'ocr_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps,
-                                env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)},
-                                label=lbl, memMb=mem_mb, nCores=n_cores)
+            if self.binarize:
+                deps = 'binarization_-_{}'.format(i)
+            else:
+                deps = 'split_input_-_{}'.format(i)
+            task = self.addWorkflowTask(
+                'ocr_-_{}'.format(i),
+                OCRWorkflow(job, self.lang),
+                dependencies=deps
+            )
             ocr_tasks.append(task)
 
         '''
         ' ##################################################
-        ' # post ocr #
+        ' # create-hocr #
         ' ##################################################
         '''
-        post_ocr_tasks = []
+        create_hocr_tasks = []
         for i, job in enumerate(self.jobs):
-            input_file = os.path.join(job.output_dir, 'ocr_input_files.txt')
-            output_file_base = os.path.join(job.output_dir, job.name)
-            cmd = 'rm "{}"'.format(input_file)
-            cmd += ' && '
-            cmd += 'sed -i \'s+{}+pages+g\' "{}.hocr"'.format(job.page_dir, output_file_base)  # noqa
-            deps = 'ocr_-_{}'.format(i)
-            lbl = 'post_ocr_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            post_ocr_tasks.append(task)
+            task = self.addWorkflowTask(
+                'create_hocr_-_{}'.format(i),
+                CreateHOCRWorkflow(job),
+                dependencies='ocr_-_{}'.format(i)
+            )
+            create_hocr_tasks.append(task)
 
         '''
         ' ##################################################
-        ' # hocr to tei #
+        ' # create-pdf #
         ' ##################################################
         '''
-        hocr_to_tei_tasks = []
+        create_pdf_tasks = []
         for i, job in enumerate(self.jobs):
-            output_file_base = os.path.join(job.output_dir, job.name)
-            cmd = 'hocrtotei "{}.hocr" "{}.xml"'.format(output_file_base, output_file_base)  # noqa
-            deps = 'post_ocr_-_{}'.format(i)
-            lbl = 'hocr_to_tei_-_{}'.format(i)
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            hocr_to_tei_tasks.append(task)
+            task = self.addWorkflowTask(
+                'create_pdf_-_{}'.format(i),
+                CreatePDFWorkflow(job),
+                dependencies='ocr_-_{}'.format(i)
+            )
+            create_pdf_tasks.append(task)
 
         '''
         ' ##################################################
-        ' # zip creation #
+        ' # create-tei #
         ' ##################################################
         '''
-        zip_creation_tasks = []
-        if self.zip is not None:
-            # zip all files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.all.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = hocr_to_tei_tasks
-            lbl = 'zip_creation_-_all'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip PDF files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.pdf.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.pdf"'
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = ocr_tasks
-            lbl = 'zip_creation_-_pdf'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip TXT files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.txt.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.txt"'
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = ocr_tasks
-            lbl = 'zip_creation_-_txt'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip XML files
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.xml.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.xml"'
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = hocr_to_tei_tasks
-            lbl = 'zip_creation_-_xml'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
-            # zip PoCo bundles
-            cmd = 'cd "{}"'.format(self.output_dir)
-            cmd += ' && '
-            cmd += 'zip'
-            cmd += ' -r'
-            cmd += ' "{}.poco.zip" .'.format(self.zip)
-            cmd += ' -x "pyflow.data*" "*tmp*"'
-            cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif')  # noqa
-            cmd += ' && '
-            cmd += 'cd -'
-            deps = post_ocr_tasks
-            lbl = 'zip_creation_-_poco'
-            task = self.addTask(command=cmd, dependencies=deps, label=lbl)
-            zip_creation_tasks.append(task)
+        create_tei_tasks = []
+        for i, job in enumerate(self.jobs):
+            task = self.addWorkflowTask(
+                'create_tei_-_{}'.format(i),
+                CreateTEIWorkflow(job),
+                dependencies='create_hocr_-_{}'.format(i)
+            )
+            create_tei_tasks.append(task)
+
+        '''
+        ' ##################################################
+        ' # create-txt #
+        ' ##################################################
+        '''
+        create_txt_tasks = []
+        for i, job in enumerate(self.jobs):
+            task = self.addWorkflowTask(
+                'create_txt_-_{}'.format(i),
+                CreateTxtWorkflow(job),
+                dependencies='ocr_-_{}'.format(i)
+            )
+            create_txt_tasks.append(task)
 
-
-def collect_jobs(input_dir, output_dir):
-    jobs = []
-    for file in os.listdir(input_dir):
-        if os.path.isdir(os.path.join(input_dir, file)):
-            continue
-        if file.lower().endswith('.pdf'):
-            job = OCRPipelineJob(os.path.join(input_dir, file),
-                                 os.path.join(output_dir, file))
-            jobs.append(job)
-    return jobs
+        # Remove temporary directories when all tasks are completed
+        self.waitForTasks()
+        for job in self.jobs:
+            os.rmdir(job.tmp_dir)
 
 
 def parse_args():
-    parser = ArgumentParser(description='OCR pipeline for PDF file processing',
-                            prog='OCR pipeline')
-    parser.add_argument('-i', '--input-dir',
-                        help='Input directory',
-                        required=True)
-    parser.add_argument('-o', '--output-dir',
-                        help='Output directory',
-                        required=True)
-    parser.add_argument('-l', '--language',
-                        choices=list(map(lambda x: x[:-12], filter(lambda x: x.endswith('.traineddata'), os.listdir('/usr/local/share/tessdata')))),  # noqa
-                        help='Language of the input '
-                             '(3-character ISO 639-2 language codes)',
-                        required=True)
-    parser.add_argument('--binarize',
-                        action='store_true',
-                        help='Add binarization as a preprocessing step')
-    parser.add_argument('--log-dir',
-                        help='Logging directory')
-    parser.add_argument('--mem-mb',
-                        help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa
-                        type=int)
-    parser.add_argument('--n-cores',
-                        default=min(4, multiprocessing.cpu_count()),
-                        help='Number of CPU threads to be used (Default: min(4, number of CPUs))',  # noqa
-                        type=int)
-    parser.add_argument('--zip',
-                        help='Create one zip file per filetype')
-    parser.add_argument('-v', '--version',
-                        action='version',
-                        help='Returns the current version of the OCR pipeline',
-                        version='%(prog)s {}'.format(__version__))
+    parser = ArgumentParser(description='OCR pipeline for PDF file processing')
+    parser.add_argument(
+        '-i', '--input-dir', help='Input directory', required=True)
+    parser.add_argument(
+        '-o', '--output-dir', help='Output directory', required=True)
+    parser.add_argument(
+        '-l', '--language',
+        choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata')
+                 if x.endswith('.traineddata') and len(x) > 12],
+        help='Language of the input (3-character ISO 639-2 language codes)',
+        required=True
+    )
+    parser.add_argument(
+        '--binarize',
+        action='store_true',
+        help='Add binarization as a preprocessing step'
+    )
+    parser.add_argument(
+        '--log-dir', help='Logging directory (Default: --output-dir)')
+    parser.add_argument(
+        '--mem-mb',
+        help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))',  # noqa
+        type=int
+    )
+    parser.add_argument(
+        '--n-cores',
+        default=min(4, multiprocessing.cpu_count()),
+        help='Number of CPU threads to be used (Default: min(4, CPU count))',
+        type=int
+    )
+    parser.add_argument(
+        '-v', '--version',
+        action='version',
+        help='Returns the current version of the OCR pipeline',
+        version='%(prog)s {}'.format(__version__)
+    )
     args = parser.parse_args()
 
     # Set some tricky default values and check for insufficient input
@@ -338,20 +535,18 @@ def parse_args():
         raise Exception('--n-cores must be greater or equal 1')
     if args.mem_mb is None:
         max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0])
-        args.mem_mb = min(args.n_cores * 2048, max_mem_mb)
-    if args.mem_mb < 2048:
-        raise Exception('--mem-mb must be greater or equal 2048')
-    if args.zip is not None and args.zip.lower().endswith('.zip'):
-        # Remove .zip file extension if provided
-        args.zip = args.zip[:-4]
-    args.zip = args.zip if args.zip else 'output'
+        args.mem_mb = min(args.n_cores * 512, max_mem_mb)
+    if args.mem_mb < 512:
+        raise Exception('--mem-mb must be greater or equal 512')
     return args
 
 
 def main():
     args = parse_args()
-    ocr_pipeline = OCRPipeline(args.input_dir, args.language, args.output_dir, args.binarize, args.zip)  # noqa
-    retval = ocr_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa
+    ocr_pipeline = MainWorkflow(
+        args.input_dir, args.language, args.output_dir, args.binarize)
+    retval = ocr_pipeline.run(
+        dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
     sys.exit(retval)
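Inside the container the rewritten pipeline keeps the same CLI minus the removed `--zip` option; a hedged example run (the `-l` choices depend on which .traineddata files are mounted into /usr/local/share/tessdata, so `eng` here is an assumption):

```bash
# Run with binarization, 4 CPU threads, and an explicit memory budget
ocr -i /input -o /output -l eng --binarize --n-cores 4 --mem-mb 2048
```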
wrapper/ocr (16 changed lines)

@@ -6,9 +6,10 @@ import os
 import subprocess
 import sys
 
-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0'
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_OUTPUT_DIR = '/output'
+CONTAINER_MODELS_DIR = '/usr/local/share/tessdata'
 CONTAINER_LOG_DIR = '/logs'
 UID = str(os.getuid())
 GID = str(os.getgid())
@@ -16,20 +17,25 @@ GID = str(os.getgid())
 parser = ArgumentParser(add_help=False)
 parser.add_argument('-i', '--input-dir')
 parser.add_argument('-o', '--output-dir')
+parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+')
 parser.add_argument('--log-dir')
 args, remaining_args = parser.parse_known_args()
 
-cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
+cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}']
 if args.input_dir is not None:
-    mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR
+    mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['-i', CONTAINER_INPUT_DIR]
 if args.output_dir is not None:
-    mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR
+    mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
+if args.models is not None:
+    for model in args.models:
+        mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}'  # noqa
+        cmd += ['-v', mapping]
 if args.log_dir is not None:
-    mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR
+    mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
     cmd += ['-v', mapping]
     remaining_args += ['--log-dir', CONTAINER_LOG_DIR]
 cmd.append(CONTAINER_IMAGE)
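With the new `-m`/`--model` option the wrapper bind-mounts each given traineddata file into the container's tessdata directory before handing the remaining arguments to the pipeline; a sketch of the intended call (paths are illustrative):

```bash
cd /<my_data_location>
ocr -i input -o output -m models/eng.traineddata -l eng --log-dir logs
```

Note that `action='extend'` requires Python 3.8 or newer, which narrows the README's "Install Docker and Python 3" prerequisite.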