From e1b78b6ba44f5910f9e9bd8f504084757c14a592 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Tue, 4 Jan 2022 11:42:55 +0100 Subject: [PATCH] Update to Tesseract 5.0.0, Set version 0.1.0 --- Dockerfile | 39 +-- LICENSE | 21 ++ README.md | 43 +-- hocr-combine | 35 ++ hocrtotei => hocr2tei | 39 +-- ocr | 719 +++++++++++++++++++++++++++--------------- wrapper/ocr | 16 +- 7 files changed, 574 insertions(+), 338 deletions(-) create mode 100644 LICENSE create mode 100644 hocr-combine rename hocrtotei => hocr2tei (52%) mode change 100755 => 100644 diff --git a/Dockerfile b/Dockerfile index fc02719..0b01459 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,8 +9,14 @@ ENV LANG=C.UTF-8 RUN apt-get update \ && apt-get install --no-install-recommends --yes \ - wget - + ghostscript \ + procps \ + python3.7 \ + python3-pip \ + rename \ + wget \ + zip \ + && python3 -m pip install lxml # Install the OCR pipeline and it's dependencies # ## Install pyFlow ## @@ -43,7 +49,7 @@ RUN wget --no-check-certificate --quiet \ ## Install Tesseract OCR ## -ENV TESSERACT_VERSION=4.1.1 +ENV TESSERACT_VERSION=5.0.0 RUN wget --no-check-certificate --quiet \ "https://github.com/tesseract-ocr/tesseract/archive/${TESSERACT_VERSION}.tar.gz" \ && tar -xzf "${TESSERACT_VERSION}.tar.gz" \ @@ -61,37 +67,20 @@ RUN wget --no-check-certificate --quiet \ pkg-config \ zlib1g-dev \ && ./autogen.sh \ - && ./configure \ + && ./configure --disable-openmp --disable-shared 'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic' \ && make \ && make install \ && ldconfig \ && cd - > /dev/null \ && rm -r "tesseract-${TESSERACT_VERSION}" "${TESSERACT_VERSION}.tar.gz" -ENV TESSERACT_MODELS="ara,chi_tra,dan,deu,ell,eng,enm,fra,frk,frm,ita,por,rus,spa" -ENV TESSDATA_BEST_VERSION=4.1.0 -RUN wget --no-check-certificate --quiet \ - "https://github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}.tar.gz" \ - && tar -xzf "${TESSDATA_BEST_VERSION}.tar.gz" \ - && for tesseract_model in $(echo 
${TESSERACT_MODELS} | tr "," "\n"); do mv "tessdata_best-${TESSDATA_BEST_VERSION}/${tesseract_model}.traineddata" "/usr/local/share/tessdata/"; done \ - && rm -r "tessdata_best-${TESSDATA_BEST_VERSION}" "${TESSDATA_BEST_VERSION}.tar.gz" - - -## Further dependencies ## -RUN apt-get install --no-install-recommends --yes \ - procps \ - ghostscript \ - python3.7 \ - rename \ - zip - - -## Install Pipeline ## -COPY hocrtotei ocr /usr/local/bin/ - RUN rm -r /var/lib/apt/lists/* +## Install Pipeline ## +COPY hocr2tei hocr-combine ocr /usr/local/bin/ + + ENTRYPOINT ["ocr"] CMD ["--help"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a374dbc --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Bielefeld University - CRC 1288 - INF + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index 265cbd8..412e9c5 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # OCR - Optical Character Recognition -This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided. +This software implements a heavily parallelized pipeline to recognize text in PDF files. It is used for nopaque's OCR service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed. ## Software used in this pipeline implementation @@ -8,37 +8,26 @@ This software implements a heavily parallelized pipeline to recognize text in PD - Software from Debian Buster's free repositories - ocropy (1.3.3): https://github.com/ocropus/ocropy/releases/tag/v1.3.3 - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20 -- Tesseract OCR (4.1.1): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1 -- tessdata_best (4.1.0): https://github.com/tesseract-ocr/tessdata_best/releases/tag/4.1.0 +- Tesseract OCR (5.0.0): https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0 -## Use this image +## Installation -1. Create input and output directories for the pipeline. -``` bash -mkdir -p //input //output -``` +1. Install Docker and Python 3. +2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git` +2. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0 ocr` +2. Add the wrapper script (`wrapper/ocr` relative to this README file) to your `${PATH}`. +3. Create working directories for the pipeline: `mkdir -p //{input,models,output}`. +4. Place your Tesseract OCR model(s) inside `//models`. -2. Place your PDF files inside `//input`. 
Files should all contain text of the same language. +## Use the Pipeline +1. Place your PDF files inside `//input`. Files should all contain text of the same language. +2. Clear your `//output` directory. 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details. -``` -# Option one: Use the wrapper script -## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/raw/development/wrapper/ocr, make it executeable and add it to your ${PATH} +```bash cd / -ocr -i input -l -o output - -# Option two: Classic Docker style -docker run \ - --rm \ - -it \ - -u $(id -u $USER):$(id -g $USER) \ - -v //input:/input \ - -v //output:/output \ - gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:development \ - -i /ocr_pipeline/input \ - -l \ - -o /ocr_pipeline/output \ - +ocr -i input -o output -m models/ -l +# or +ocr -i input -o output -m models/* -l ``` - 4. Check your results in the `//output` directory. diff --git a/hocr-combine b/hocr-combine new file mode 100644 index 0000000..4008890 --- /dev/null +++ b/hocr-combine @@ -0,0 +1,35 @@ +#!/usr/bin/env python3.7 +# coding=utf-8 + +""""Combine multiple hOCR files.""" + +from argparse import ArgumentParser +from lxml import html + + +parser = ArgumentParser(description='Combine multiple hOCR files.') +parser.add_argument('file', help='Input file(s)', nargs='+') +parser.add_argument('-o', '--output-file', help='Output file', required=True) +args = parser.parse_args() + + +for file in args.file: + files = [] + if file.startswith('@'): + with open(file[1:], 'r') as f: + files += [x for x in f.read().split("\n") if x != ''] + else: + files.append(file) +if len(files) == 0: + exit(1) + + +hocr = html.parse(files[0]) +hocr_body = hocr.find('body') +for file in files[1:]: + for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'): + hocr_body.append(ocr_page) + + +with open(args.output_file, 'wb') as f: + hocr.write(f, encoding='UTF-8', 
method='html') diff --git a/hocrtotei b/hocr2tei old mode 100755 new mode 100644 similarity index 52% rename from hocrtotei rename to hocr2tei index ae637d3..04a3db7 --- a/hocrtotei +++ b/hocr2tei @@ -3,16 +3,18 @@ """"Convert hOCR to TEI XML.""" -from xml.sax.saxutils import escape from argparse import ArgumentParser +from lxml import html +from xml.sax.saxutils import escape import re -import xml.etree.ElementTree as ET + parser = ArgumentParser(description='Convert hOCR to TEI XML.') -parser.add_argument('input', metavar='Path to hOCR input file') -parser.add_argument('output', metavar='Path to TEI output file') +parser.add_argument('file', help='Input file') +parser.add_argument('-o', '--output-file', help='Output file', required=True) args = parser.parse_args() + tei = '' tei += '\n' tei += ' \n' @@ -30,28 +32,27 @@ tei += ' \n' tei += ' \n' tei += ' \n' tei += ' \n' -# Conversion start -hocr = ET.parse(args.input) -for page in hocr.findall('.//*[@class="ocr_page"]'): - page_properties = page.attrib.get('title') - facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1) - page_number = re.search(r'ppageno (\d+)', page_properties).group(1) - tei += ' \n'.format(facsimile, page_number) - for para in page.findall('.//*[@class="ocr_par"]'): +hocr = html.parse(args.file) +for ocr_page in hocr.findall('.//div[@class="ocr_page"]'): + ocr_page_title_attrib = ocr_page.attrib.get('title') + facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1) + page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1) + tei += f' \n' + for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'): tei += '

\n' - for line in para.findall('.//*[@class="ocr_line"]'): + for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'): tei += ' ' indent = '' - for word in line.findall('.//*[@class="ocrx_word"]'): - if word.text is not None: - tei += indent + escape(word.text.strip()) + for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'): + if ocrx_word.text is not None: + tei += indent + escape(ocrx_word.text) indent = ' ' tei += '\n' tei += '

\n' -# Conversion end tei += ' \n' tei += '
\n' tei += '
\n' -with open(args.output, 'w') as tei_file: - tei_file.write(tei) + +with open(args.output_file, 'w') as f: + f.write(tei) diff --git a/ocr b/ocr index 09e2140..65ca392 100755 --- a/ocr +++ b/ocr @@ -1,11 +1,9 @@ #!/usr/bin/env python2.7 # coding=utf-8 -"""OCR pipeline for PDF file processing.""" +''' OCR pipeline for PDF file processing. ''' +__version__ = '0.1.0' -__author__ = 'Patrick Jentsch ,' \ - 'Stephan Porada ' -__version__ = '1.0.0' from argparse import ArgumentParser from pyflow import WorkflowRunner @@ -14,145 +12,402 @@ import os import sys -class OCRPipelineJob: - """An OCR pipeline job class +class PipelineJob: + ''' + OCR pipeline job class. Each input file of the pipeline is represented as an OCR pipeline job, which holds all necessary information for the pipeline to process it. Arguments: file -- Path to the file - output_dir -- Path to a directory, where job results a stored - """ + output_dir -- Path to a directory, where job results are stored + ''' def __init__(self, file, output_dir): self.file = file - self.name = os.path.basename(file).rsplit('.', 1)[0] + self.name = os.path.basename(file)[:-4] self.output_dir = output_dir - self.page_dir = os.path.join(output_dir, 'pages') + self.tmp_dir = os.path.join(output_dir, 'tmp') -class OCRPipeline(WorkflowRunner): - def __init__(self, input_dir, lang, output_dir, binarize, zip): +class SplitInputWorkflow(WorkflowRunner): + def __init__(self, job): + self.job = job + + def workflow(self): + ''' + ' ################################################## + ' # gs # + ' ################################################## + ''' + n_cores = min(2, self.getNCores()) + mem_mb = min(n_cores * 512, self.getMemMb()) + cmd = 'gs' + cmd += ' -dBATCH' + cmd += ' -dNOPAUSE' + cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000) + cmd += ' -dNumRenderingThreads={}'.format(n_cores) + cmd += ' -dQUIET' + cmd += ' -r300' + cmd += ' -sDEVICE=png16m' + cmd += ' -sOutputFile="{}/page-%d.png"'.format( + 
os.path.join(self.job.tmp_dir, 'images') + ) + cmd += ' "{}"'.format(self.job.file) + self.addTask( + 'gs', + command=cmd, + memMb=mem_mb, + nCores=n_cores + ) + + +class BinarizationWorkflow(WorkflowRunner): + def __init__(self, job): + self.job = job + + def workflow(self): + ''' + ' ################################################## + ' # ocropus-nlbin # + ' ################################################## + ''' + # TODO: Update to newer ocropus-nlbin and start one task per page + n_cores = self.getNCores() + mem_mb = min(512 * n_cores, self.getMemMb()) + cmd = 'ls -dv "{}/"* > "{}"'.format( + os.path.join(self.job.tmp_dir, 'images'), + os.path.join(self.job.tmp_dir, 'images', 'inputs.txt') + ) + cmd += ' && ' + cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa + cmd += ' --nocheck' + cmd += ' --output "{}"'.format( + os.path.join(self.job.tmp_dir, 'images')) + cmd += ' --parallel "{}"'.format(n_cores) + cmd += ' && ' + cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa + ocropus_nlbin_task = self.addTask( + 'ocropus_nlbin', + command=cmd, + memMb=mem_mb, + nCores=n_cores + ) + + ''' + ' ################################################## + ' # cleanup # + ' ################################################## + ''' + n_cores = 1 + mem_mb = min(128, self.getMemMb()) + cmd = 'cd "{}"'.format(os.path.join(self.job.tmp_dir, 'images')) + cmd += ' && ' + cmd += 'mkdir tmp' + cmd += ' && ' + cmd += 'mv *.bin.png tmp' + cmd += ' && ' + cmd += 'rm *.png' + cmd += ' && ' + cmd += 'mv tmp/* .' 
+ cmd += ' && ' + cmd += 'rmdir tmp' + cmd += ' && ' + cmd += 'rename \'s/^0*/page-/\' *' + cmd += ' && ' + cmd += 'rename \'s/.bin.png$/.png/\' *' + cmd += ' && ' + cmd += 'cd -' + self.addTask( + 'cleanup', + command=cmd, + dependencies=ocropus_nlbin_task, + memMb=mem_mb, + nCores=n_cores + ) + + +class OCRWorkflow(WorkflowRunner): + def __init__(self, job, lang): + self.job = job + self.lang = lang + + def workflow(self): + ''' + ' ################################################## + ' # tesseract # + ' ################################################## + ''' + tesseract_tasks = [] + n_cores = 1 + mem_mb = min(512, self.getMemMb()) + for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'images'))): # noqa + cmd = 'tesseract "{}" "{}"'.format( + os.path.join(self.job.tmp_dir, 'images', file), + os.path.join(self.job.tmp_dir, file[:-4]) + ) + cmd += ' -l "{}"'.format(self.lang) + cmd += ' hocr pdf txt' + cmd += ' || ' + cmd += 'echo "${?}"' + task = self.addTask( + 'tesseract_-_{}'.format(i), + command=cmd, + env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)}, + memMb=mem_mb, + nCores=n_cores + ) + tesseract_tasks.append(task) + + ''' + ' ################################################## + ' # move_files # + ' ################################################## + ''' + n_cores = 1 + mem_mb = min(128, self.getMemMb()) + for i, file_extension in enumerate(['hocr', 'pdf', 'txt']): + cmd = 'mv "{}/"*.{} "{}"'.format( + self.job.tmp_dir, + file_extension, + os.path.join(self.job.tmp_dir, file_extension) + ) + self.addTask( + 'move_{}_files'.format(file_extension), + command=cmd, + dependencies=tesseract_tasks, + memMb=mem_mb, + nCores=n_cores + ) + cmd = 'mv "{}" "{}"'.format( + os.path.join(self.job.tmp_dir, 'images'), + os.path.join(self.job.output_dir) + ) + self.addTask( + 'move_image_files', + command=cmd, + dependencies=tesseract_tasks, + memMb=mem_mb, + nCores=n_cores + ) + + +class CreateHOCRWorkflow(WorkflowRunner): + def __init__(self, job): + 
self.job = job + + def workflow(self): + ''' + ' ################################################## + ' # fix-hocr # + ' ################################################## + ''' + fix_hocr_tasks = [] + n_cores = 1 + mem_mb = min(256, self.getMemMb()) + for i, file in enumerate(os.listdir(os.path.join(self.job.tmp_dir, 'hocr'))): # noqa + cmd = 'sed -i \'s>{}>images>g\' "{}"'.format( + os.path.join(self.job.tmp_dir, 'images'), + os.path.join(self.job.tmp_dir, 'hocr', file) + ) + cmd += ' && ' + cmd += 'sed -i \'s>ppageno [0-9]\\+>ppageno {}>g\' "{}"'.format( + file[5:-5], + os.path.join(self.job.tmp_dir, 'hocr', file) + ) + cmd += ' && ' + cmd += 'sed -i \'s>page_[0-9]\\+>page_{}>g\' "{}"'.format( + file[5:-5], + os.path.join(self.job.tmp_dir, 'hocr', file) + ) + cmd += ' && ' + cmd += 'sed -i \'s>block_[0-9]\\+>block_{}>g\' "{}"'.format( + file[5:-5], + os.path.join(self.job.tmp_dir, 'hocr', file) + ) + cmd += ' && ' + cmd += 'sed -i \'s>par_[0-9]\\+>par_{}>g\' "{}"'.format( + file[5:-5], + os.path.join(self.job.tmp_dir, 'hocr', file) + ) + cmd += ' && ' + cmd += 'sed -i \'s>line_[0-9]\\+>line_{}>g\' "{}"'.format( + file[5:-5], + os.path.join(self.job.tmp_dir, 'hocr', file) + ) + cmd += ' && ' + cmd += 'sed -i \'s>word_[0-9]\\+>word_{}>g\' "{}"'.format( + file[5:-5], + os.path.join(self.job.tmp_dir, 'hocr', file) + ) + task = self.addTask( + 'fix-hocr_-_{}'.format(i), + command=cmd, + memMb=mem_mb, + nCores=n_cores + ) + fix_hocr_tasks.append(task) + + ''' + ' ################################################## + ' # hocr-combine # + ' ################################################## + ''' + n_cores = 1 + mem_mb = min(512, self.getMemMb()) + cmd = 'ls -dv "{}/"* > "{}"'.format( + os.path.join(self.job.tmp_dir, 'hocr'), + os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt') + ) + cmd += ' && ' + cmd += 'hocr-combine "@{}"'.format( + os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt') + ) + cmd += ' --output-file "{}.hocr"'.format( + 
os.path.join(self.job.output_dir, self.job.name) + ) + cmd += ' && ' + cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'hocr')) + self.addTask( + 'hocr_combine', + command=cmd, + dependencies=fix_hocr_tasks, + memMb=mem_mb, + nCores=n_cores + ) + + +class CreatePDFWorkflow(WorkflowRunner): + def __init__(self, job): + self.job = job + + def workflow(self): + ''' + ' ################################################## + ' # pdf_combine # + ' ################################################## + ''' + n_cores = min(2, self.getNCores()) + mem_mb = min(n_cores * 256, self.getMemMb()) + cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'pdf')) + cmd += ' | ' + cmd += 'xargs gs' + cmd += ' -dBATCH' + cmd += ' -dNOPAUSE' + cmd += ' -dBufferSpace={}'.format(mem_mb * 1000000) + cmd += ' -dNumRenderingThreads={}'.format(n_cores) + cmd += ' -dPDFSETTINGS=/ebook' + cmd += ' -dQUIET' + cmd += ' -sDEVICE=pdfwrite' + cmd += ' -sOutputFile="{}.pdf"'.format( + os.path.join(self.job.output_dir, self.job.name) + ) + cmd += ' && ' + cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf')) + self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores) + + +class CreateTEIWorkflow(WorkflowRunner): + def __init__(self, job): + self.job = job + + def workflow(self): + ''' + ' ################################################## + ' # hocr2tei # + ' ################################################## + ''' + n_cores = 1 + mem_mb = min(512, self.getMemMb()) + cmd = 'hocr2tei "{}.hocr"'.format( + os.path.join(self.job.output_dir, self.job.name) + ) + cmd += ' --output-file "{}.xml"'.format( + os.path.join(self.job.output_dir, self.job.name) + ) + self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores) + + +class CreateTxtWorkflow(WorkflowRunner): + def __init__(self, job): + self.job = job + + def workflow(self): + ''' + ' ################################################## + ' # txt_combine # + ' 
################################################## + ''' + n_cores = 1 + mem_mb = min(512, self.getMemMb()) + cmd = 'ls -dQv "{}"/*'.format(os.path.join(self.job.tmp_dir, 'txt')) + cmd += ' | ' + cmd += 'xargs cat' + cmd += ' > ' + cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name)) # noqa + cmd += ' && ' + cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt')) + self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores) + + +class MainWorkflow(WorkflowRunner): + def __init__(self, input_dir, lang, output_dir, binarize): self.input_dir = input_dir self.lang = lang self.output_dir = output_dir self.binarize = binarize - self.zip = zip - self.jobs = collect_jobs(self.input_dir, self.output_dir) + self.jobs = self.collect_jobs() + + def collect_jobs(self): + jobs = [] + for file in os.listdir(self.input_dir): + if os.path.isdir(os.path.join(self.input_dir, file)): + continue + if file.lower().endswith('.pdf'): + job = PipelineJob( + os.path.join(self.input_dir, file), + os.path.join(self.output_dir, file) + ) + jobs.append(job) + return jobs def workflow(self): if not self.jobs: return - ''' - ' ################################################## - ' # setup output directory # - ' ################################################## - ''' - setup_output_directory_tasks = [] - for i, job in enumerate(self.jobs): - cmd = 'mkdir -p "{}"'.format(job.page_dir) - lbl = 'setup_output_directory_-_{}'.format(i) - task = self.addTask(command=cmd, label=lbl) - setup_output_directory_tasks.append(task) + # Create output and temporary directories + for job in self.jobs: + os.mkdir(job.output_dir) + os.mkdir(job.tmp_dir) + os.mkdir(os.path.join(job.tmp_dir, 'hocr')) + os.mkdir(os.path.join(job.tmp_dir, 'pdf')) + os.mkdir(os.path.join(job.tmp_dir, 'images')) + os.mkdir(os.path.join(job.tmp_dir, 'txt')) ''' ' ################################################## - ' # split input # + ' # split-input # ' 
################################################## ''' - split_input_tasks = [] - n_cores = max(1, int(self.getNCores() / len(self.jobs))) for i, job in enumerate(self.jobs): - input_file = job.file - output_file = '{}/page-%d.tif'.format(job.page_dir) - cmd = 'gs' - cmd += ' -dBATCH' - cmd += ' -dNOPAUSE' - cmd += ' -dNumRenderingThreads={}'.format(n_cores) - cmd += ' -dQUIET' - cmd += ' -r300' - cmd += ' -sDEVICE=tiff24nc' - cmd += ' -sCompression=lzw' - cmd += ' "-sOutputFile={}"'.format(output_file) - cmd += ' "{}"'.format(input_file) - deps = 'setup_output_directory_-_{}'.format(i) - lbl = 'split_input_-_{}'.format(i) - task = self.addTask(command=cmd, dependencies=deps, label=lbl, - nCores=n_cores) - split_input_tasks.append(task) + self.addWorkflowTask( + 'split_input_-_{}'.format(i), + SplitInputWorkflow(job) + ) if self.binarize: - ''' - ' ################################################## - ' # pre binarization # - ' ################################################## - ''' - pre_binarization_tasks = [] - for i, job in enumerate(self.jobs): - input_file = os.path.join(job.output_dir, 'binarization_input_files.txt') # noqa - cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file) - deps = 'split_input_-_{}'.format(i) - lbl = 'pre_binarization_-_{}'.format(i) - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - pre_binarization_tasks.append(task) - ''' ' ################################################## ' # binarization # ' ################################################## ''' - binarization_tasks = [] - n_cores = self.getNCores() - mem_mb = self.getMemMb() for i, job in enumerate(self.jobs): - input_file = os.path.join(job.output_dir, 'binarization_input_files.txt') # noqa - cmd = 'ocropus-nlbin "@{}"'.format(input_file) - cmd += ' --nocheck' - cmd += ' --output "{}"'.format(job.page_dir) - cmd += ' --parallel "{}"'.format(n_cores) - deps = 'pre_binarization_-_{}'.format(i) - lbl = 'binarization_-_{}'.format(i) - task = 
self.addTask(command=cmd, dependencies=deps, label=lbl, - memMb=mem_mb, nCores=n_cores) - binarization_tasks.append(task) - - ''' - ' ################################################## - ' # post binarization # - ' ################################################## - ''' - post_binarization_tasks = [] - for i, job in enumerate(self.jobs): - input_file = os.path.join(job.output_dir, 'binarization_input_files.txt') # noqa - cmd = 'rm "{}"'.format(input_file) - cmd += ' && ' - cmd += 'cd "{}"'.format(job.page_dir) - cmd += ' && ' - cmd += 'rm *.{nrm.png,tif}' - cmd += ' && ' - cmd += 'rename \'s/^0*/page-/\' *' - cmd += ' && ' - cmd += 'cd -' - deps = 'binarization_-_{}'.format(i) - lbl = 'post_binarization_-_{}'.format(i) - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - post_binarization_tasks.append(task) - - ''' - ' ################################################## - ' # pre ocr # - ' ################################################## - ''' - pre_ocr_tasks = [] - for i, job in enumerate(self.jobs): - input_file = os.path.join(job.output_dir, 'ocr_input_files.txt') - cmd = 'ls -dv "{}/"* >> "{}"'.format(job.page_dir, input_file) - deps = 'post_binarization_-_{}'.format(i) if self.binarize else 'split_input_-_{}'.format(i) # noqa - lbl = 'pre_ocr_-_{}'.format(i) - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - pre_ocr_tasks.append(task) + self.addWorkflowTask( + 'binarization_-_{}'.format(i), + BinarizationWorkflow(job), + dependencies='split_input_-_{}'.format(i) + ) ''' ' ################################################## @@ -160,175 +415,117 @@ class OCRPipeline(WorkflowRunner): ' ################################################## ''' ocr_tasks = [] - n_cores = min(4, self.getNCores()) - mem_mb = min(n_cores * 2048, self.getMemMb()) for i, job in enumerate(self.jobs): - input_file = os.path.join(job.output_dir, 'ocr_input_files.txt') - output_file_base = os.path.join(job.output_dir, job.name) - cmd = 'tesseract "{}" 
"{}"'.format(input_file, output_file_base) - cmd += ' -l "{}"'.format(self.lang) - cmd += ' hocr pdf txt' - deps = 'pre_ocr_-_{}'.format(i) - lbl = 'ocr_-_{}'.format(i) - task = self.addTask(command=cmd, dependencies=deps, - env={'OMP_THREAD_LIMIT': '{}'.format(n_cores)}, - label=lbl, memMb=mem_mb, nCores=n_cores) + if self.binarize: + deps = 'binarization_-_{}'.format(i) + else: + deps = 'split_input_-_{}'.format(i) + task = self.addWorkflowTask( + 'ocr_-_{}'.format(i), + OCRWorkflow(job, self.lang), + dependencies=deps + ) ocr_tasks.append(task) ''' ' ################################################## - ' # post ocr # + ' # create-hocr # ' ################################################## ''' - post_ocr_tasks = [] + create_hocr_tasks = [] for i, job in enumerate(self.jobs): - input_file = os.path.join(job.output_dir, 'ocr_input_files.txt') - output_file_base = os.path.join(job.output_dir, job.name) - cmd = 'rm "{}"'.format(input_file) - cmd += ' && ' - cmd += 'sed -i \'s+{}+pages+g\' "{}.hocr"'.format(job.page_dir, output_file_base) # noqa - deps = 'ocr_-_{}'.format(i) - lbl = 'post_ocr_-_{}'.format(i) - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - post_ocr_tasks.append(task) + task = self.addWorkflowTask( + 'create_hocr_-_{}'.format(i), + CreateHOCRWorkflow(job), + dependencies='ocr_-_{}'.format(i) + ) + create_hocr_tasks.append(task) ''' ' ################################################## - ' # hocr to tei # + ' # create-pdf # ' ################################################## ''' - hocr_to_tei_tasks = [] + create_pdf_tasks = [] for i, job in enumerate(self.jobs): - output_file_base = os.path.join(job.output_dir, job.name) - cmd = 'hocrtotei "{}.hocr" "{}.xml"'.format(output_file_base, output_file_base) # noqa - deps = 'post_ocr_-_{}'.format(i) - lbl = 'hocr_to_tei_-_{}'.format(i) - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - hocr_to_tei_tasks.append(task) + task = self.addWorkflowTask( + 'create_pdf_-_{}'.format(i), 
+ CreatePDFWorkflow(job), + dependencies='ocr_-_{}'.format(i) + ) + create_pdf_tasks.append(task) ''' ' ################################################## - ' # zip creation # + ' # create-tei # ' ################################################## ''' - zip_creation_tasks = [] - if self.zip is not None: - # zip all files - cmd = 'cd "{}"'.format(self.output_dir) - cmd += ' && ' - cmd += 'zip' - cmd += ' -r' - cmd += ' "{}.all.zip" .'.format(self.zip) - cmd += ' -x "pyflow.data*" "*tmp*"' - cmd += ' -i "*.pdf" "*.txt" "*.xml" "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif') # noqa - cmd += ' && ' - cmd += 'cd -' - deps = hocr_to_tei_tasks - lbl = 'zip_creation_-_all' - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - zip_creation_tasks.append(task) - # zip PDF files - cmd = 'cd "{}"'.format(self.output_dir) - cmd += ' && ' - cmd += 'zip' - cmd += ' -r' - cmd += ' "{}.pdf.zip" .'.format(self.zip) - cmd += ' -x "pyflow.data*" "*tmp*"' - cmd += ' -i "*.pdf"' - cmd += ' && ' - cmd += 'cd -' - deps = ocr_tasks - lbl = 'zip_creation_-_pdf' - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - zip_creation_tasks.append(task) - # zip TXT files - cmd = 'cd "{}"'.format(self.output_dir) - cmd += ' && ' - cmd += 'zip' - cmd += ' -r' - cmd += ' "{}.txt.zip" .'.format(self.zip) - cmd += ' -x "pyflow.data*" "*tmp*"' - cmd += ' -i "*.txt"' - cmd += ' && ' - cmd += 'cd -' - deps = ocr_tasks - lbl = 'zip_creation_-_txt' - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - zip_creation_tasks.append(task) - # zip XML files - cmd = 'cd "{}"'.format(self.output_dir) - cmd += ' && ' - cmd += 'zip' - cmd += ' -r' - cmd += ' "{}.xml.zip" .'.format(self.zip) - cmd += ' -x "pyflow.data*" "*tmp*"' - cmd += ' -i "*.xml"' - cmd += ' && ' - cmd += 'cd -' - deps = hocr_to_tei_tasks - lbl = 'zip_creation_-_xml' - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - zip_creation_tasks.append(task) - # zip PoCo bundles - cmd = 'cd 
"{}"'.format(self.output_dir) - cmd += ' && ' - cmd += 'zip' - cmd += ' -r' - cmd += ' "{}.poco.zip" .'.format(self.zip) - cmd += ' -x "pyflow.data*" "*tmp*"' - cmd += ' -i "*.hocr" "*.{}"'.format('bin.png' if self.binarize else 'tif') # noqa - cmd += ' && ' - cmd += 'cd -' - deps = post_ocr_tasks - lbl = 'zip_creation_-_poco' - task = self.addTask(command=cmd, dependencies=deps, label=lbl) - zip_creation_tasks.append(task) + create_tei_tasks = [] + for i, job in enumerate(self.jobs): + task = self.addWorkflowTask( + 'create_tei_-_{}'.format(i), + CreateTEIWorkflow(job), + dependencies='create_hocr_-_{}'.format(i) + ) + create_tei_tasks.append(task) + ''' + ' ################################################## + ' # create-txt # + ' ################################################## + ''' + create_txt_tasks = [] + for i, job in enumerate(self.jobs): + task = self.addWorkflowTask( + 'create_txt_-_{}'.format(i), + CreateTxtWorkflow(job), + dependencies='ocr_-_{}'.format(i) + ) + create_txt_tasks.append(task) -def collect_jobs(input_dir, output_dir): - jobs = [] - for file in os.listdir(input_dir): - if os.path.isdir(os.path.join(input_dir, file)): - continue - if file.lower().endswith('.pdf'): - job = OCRPipelineJob(os.path.join(input_dir, file), - os.path.join(output_dir, file)) - jobs.append(job) - return jobs + # Remove temporary directories when all tasks are completed + self.waitForTasks() + for job in self.jobs: + os.rmdir(job.tmp_dir) def parse_args(): - parser = ArgumentParser(description='OCR pipeline for PDF file processing', - prog='OCR pipeline') - parser.add_argument('-i', '--input-dir', - help='Input directory', - required=True) - parser.add_argument('-o', '--output-dir', - help='Output directory', - required=True) - parser.add_argument('-l', '--language', - choices=list(map(lambda x: x[:-12], filter(lambda x: x.endswith('.traineddata'), os.listdir('/usr/local/share/tessdata')))), # noqa - help='Language of the input ' - '(3-character ISO 639-2 language 
codes)', - required=True) - parser.add_argument('--binarize', - action='store_true', - help='Add binarization as a preprocessing step') - parser.add_argument('--log-dir', - help='Logging directory') - parser.add_argument('--mem-mb', - help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))', # noqa - type=int) - parser.add_argument('--n-cores', - default=min(4, multiprocessing.cpu_count()), - help='Number of CPU threads to be used (Default: min(4, number of CPUs))', # noqa - type=int) - parser.add_argument('--zip', - help='Create one zip file per filetype') - parser.add_argument('-v', '--version', - action='version', - help='Returns the current version of the OCR pipeline', - version='%(prog)s {}'.format(__version__)) + parser = ArgumentParser(description='OCR pipeline for PDF file processing') + parser.add_argument( + '-i', '--input-dir', help='Input directory', required=True) + parser.add_argument( + '-o', '--output-dir', help='Output directory', required=True) + parser.add_argument( + '-l', '--language', + choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata') + if x.endswith('.traineddata') and len(x) > 12], + help='Language of the input (3-character ISO 639-2 language codes)', + required=True + ) + parser.add_argument( + '--binarize', + action='store_true', + help='Add binarization as a preprocessing step' + ) + parser.add_argument( + '--log-dir', help='Logging directory (Default: --output-dir)') + parser.add_argument( + '--mem-mb', + help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))', # noqa + type=int + ) + parser.add_argument( + '--n-cores', + default=min(4, multiprocessing.cpu_count()), + help='Number of CPU threads to be used (Default: min(4, CPU count))', + type=int + ) + parser.add_argument( + '-v', '--version', + action='version', + help='Returns the current version of the OCR pipeline', + version='%(prog)s {}'.format(__version__) + ) args = 
parser.parse_args() # Set some tricky default values and check for insufficient input @@ -338,20 +535,18 @@ def parse_args(): raise Exception('--n-cores must be greater or equal 1') if args.mem_mb is None: max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0]) - args.mem_mb = min(args.n_cores * 2048, max_mem_mb) - if args.mem_mb < 2048: - raise Exception('--mem-mb must be greater or equal 2048') - if args.zip is not None and args.zip.lower().endswith('.zip'): - # Remove .zip file extension if provided - args.zip = args.zip[:-4] - args.zip = args.zip if args.zip else 'output' + args.mem_mb = min(args.n_cores * 512, max_mem_mb) + if args.mem_mb < 512: + raise Exception('--mem-mb must be greater or equal 512') return args def main(): args = parse_args() - ocr_pipeline = OCRPipeline(args.input_dir, args.language, args.output_dir, args.binarize, args.zip) # noqa - retval = ocr_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) # noqa + ocr_pipeline = MainWorkflow( + args.input_dir, args.language, args.output_dir, args.binarize) + retval = ocr_pipeline.run( + dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) sys.exit(retval) diff --git a/wrapper/ocr b/wrapper/ocr index c38b894..37dad98 100755 --- a/wrapper/ocr +++ b/wrapper/ocr @@ -6,9 +6,10 @@ import os import subprocess import sys -CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:1.0.0' +CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:0.1.0' CONTAINER_INPUT_DIR = '/input' CONTAINER_OUTPUT_DIR = '/output' +CONTAINER_MODELS_DIR = '/usr/local/share/tessdata' CONTAINER_LOG_DIR = '/logs' UID = str(os.getuid()) GID = str(os.getgid()) @@ -16,20 +17,25 @@ GID = str(os.getgid()) parser = ArgumentParser(add_help=False) parser.add_argument('-i', '--input-dir') parser.add_argument('-o', '--output-dir') +parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+') parser.add_argument('--log-dir') args, remaining_args = 
parser.parse_known_args() -cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] +cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}'] if args.input_dir is not None: - mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR + mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}' cmd += ['-v', mapping] remaining_args += ['-i', CONTAINER_INPUT_DIR] if args.output_dir is not None: - mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR + mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}' cmd += ['-v', mapping] remaining_args += ['-o', CONTAINER_OUTPUT_DIR] +if args.models is not None: + for model in args.models: + mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}' # noqa + cmd += ['-v', mapping] if args.log_dir is not None: - mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR + mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}' cmd += ['-v', mapping] remaining_args += ['--log-dir', CONTAINER_LOG_DIR] cmd.append(CONTAINER_IMAGE)