ocr/ocr

#!/usr/bin/env python2.7
# coding=utf-8


"""
ocr

Usage:  For usage instructions run with option --help
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
"""


import argparse
import multiprocessing
import os
import re
import sys
from pyflow import WorkflowRunner


''' TODO:
' Implement --end-page: Last page to ocr
' Implement --memMb: Total amount of memory (RAM) available for this workflow.
'                    Default: 2048 * n_cores
' Implement --rotate: Rotate pages from input (90, 180, 270)
' Implement --split-pages: Split pages in half after possible rotation
' Implement --start-page: First page to ocr
'''


def parse_arguments():
    """
    Parse the command line options of the OCR workflow.

    Returns the argparse namespace with the attributes input_dir, lang,
    output_dir, skip_binarisation, keep_intermediates and n_cores.
    argparse terminates the program with a usage message when a required
    option is missing or a value is invalid.
    """
    parser = argparse.ArgumentParser(
        description='Performs OCR of (historical) documents utilizing OCRopus for preprocessing and Tesseract OCR for OCR. The results are served as hOCR, PDF, raw text and TEI compliant XML files.\nSoftware requirements: imagemagick, ocropus, pdftoppm, pdfunite, poppler-utils, pyflow, python2.7, python3.5, tesseract'
    )
    parser.add_argument(
        '-i',
        dest='input_dir',
        help='input directory, searched recursively for .pdf, .tif and .tiff files',
        required=True
    )
    parser.add_argument(
        '-l',
        choices=[
            'deu', 'deu_frak', 'eng', 'enm', 'fra', 'frm', 'ita', 'por', 'spa'
        ],
        dest='lang',
        help='language model passed to tesseract',
        required=True
    )
    parser.add_argument(
        '-o',
        dest='output_dir',
        help='output directory',
        required=True
    )
    parser.add_argument(
        '--skip-binarisation',
        action='store_true',
        default=False,
        dest='skip_binarisation',
        help='skip ocropy binarisation',
        required=False
    )
    parser.add_argument(
        '--keep-intermediates',
        action='store_true',
        default=False,
        dest='keep_intermediates',
        help='keep intermediate files',
        required=False
    )
    parser.add_argument(
        '--nCores',
        # Default to at most four cores, fewer on smaller machines.
        default=min(4, multiprocessing.cpu_count()),
        dest='n_cores',
        help='total number of cores available',
        required=False,
        type=int
    )
    return parser.parse_args()


class OCRWorkflow(WorkflowRunner):
    """
    pyflow workflow running the OCR pipeline for every input document.

    For each job (one input PDF or TIFF file) tasks are added that create
    the output directories, split the input into single page TIFF files,
    optionally binarise the pages with ocropus-nlbin, run tesseract on
    every page, merge the page results into one TEI XML, PDF and text file
    and finally move or remove the intermediate files.
    """

    def __init__(self, args):
        """
        Collect the jobs and remember the command line options.

        args -- namespace as returned by parse_arguments(); the attributes
                input_dir, output_dir, skip_binarisation,
                keep_intermediates, lang and n_cores are read.
        """
        self.jobs = analyze_jobs(args.input_dir, args.output_dir)
        self.skip_binarisation = args.skip_binarisation
        self.keep_intermediates = args.keep_intermediates
        self.lang = args.lang
        self.n_cores = args.n_cores

    def workflow(self):
        """
        Add all tasks of the OCR pipeline.

        Stages whose commands are built from the files written by an
        earlier stage are separated by waitForTasks(), because the
        intermediate directory is listed while the tasks are created.
        Nothing is added when no job was found.
        """
        if not self.jobs:
            return

        # NOTE(review): paths are only wrapped in double quotes when they
        # are inserted into the command strings. File names containing
        # '"', '$' or backticks are not escaped. Presumably the commands
        # run through a shell (the text merge relies on '>' redirection)
        # - confirm and escape properly if input names are untrusted.

        # ##################################################
        # # Create output directories                      #
        # ##################################################
        for index, job in enumerate(self.jobs):
            tmp_dir = self._ocr_tmp_dir(job)
            directories = [tmp_dir]
            # The sub directories are only filled by the cleanup stage
            # when the intermediate files are kept.
            if self.keep_intermediates:
                directories.extend(
                    os.path.join(tmp_dir, name)
                    for name in ('hocr', 'pdf', 'tiff', 'txt')
                )
                if not self.skip_binarisation:
                    directories.append(os.path.join(tmp_dir, 'bin.png'))
            self.addTask(
                command='mkdir -p ' + ' '.join(
                    '"%s"' % directory for directory in directories
                ),
                label='create_output_directories_job_-_%i' % index
            )

        # ##################################################
        # # Split                                          #
        # ##################################################
        # Share the available cores between the split tasks, but reserve
        # at least one core per task.
        split_job_n_cores = min(
            self.n_cores,
            max(1, self.n_cores // len(self.jobs))
        )
        for index, job in enumerate(self.jobs):
            tmp_dir = self._ocr_tmp_dir(job)
            if job['filename'].endswith(('.tif', '.tiff')):
                # This command also works for PDF input but ocropus-nlbin
                # is not able to handle the TIFF output of it.
                cmd = 'convert -density 300 "%s" -compress LZW -scene 1 "%s/page-%%d.tif"' % (
                    job['path'],
                    tmp_dir
                )
            else:
                cmd = 'pdftoppm -r 300 -tiff -tiffcompression lzw "%s" "%s"' % (
                    job['path'],
                    os.path.join(tmp_dir, 'page')
                )
            self.addTask(
                command=cmd,
                dependencies='create_output_directories_job_-_%i' % index,
                label='split_job_-_%i' % index,
                nCores=split_job_n_cores
            )

        if not self.skip_binarisation:
            # The binarisation jobs are created based on the output files
            # of the split jobs. So wait until they are finished.
            self.waitForTasks()

            # ##################################################
            # # Binarise                                       #
            # ##################################################
            # We run ocropus-nlbin with either four or, if there are fewer
            # than four cores available for this workflow, the available
            # core number.
            binarisation_job_n_cores = min(4, self.n_cores)
            for index, job in enumerate(self.jobs):
                tmp_dir = self._ocr_tmp_dir(job)
                pages = self._ocr_numbered_files(tmp_dir, '.tif')
                cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % (
                    tmp_dir,
                    binarisation_job_n_cores,
                    ' '.join(
                        '"%s"' % os.path.join(tmp_dir, page) for page in pages
                    )
                )
                self.addTask(
                    command=cmd,
                    dependencies='split_job_-_%i' % index,
                    label='binarisation_job_-_%i' % index,
                    nCores=binarisation_job_n_cores
                )

            # The post binarisation jobs are created based on the output
            # files of the binarisation jobs. So wait until they are
            # finished.
            self.waitForTasks()

            # ##################################################
            # # Normalise file names from binarisation         #
            # ##################################################
            post_binarisation_jobs = []
            for index, job in enumerate(self.jobs):
                tmp_dir = self._ocr_tmp_dir(job)
                pages = self._ocr_numbered_files(tmp_dir, '.bin.png')
                for number, page in enumerate(pages):
                    # Rename "<number>.bin.png" to "page-<number>.bin.png"
                    # without leading zeros.
                    cmd = 'mv "%s" "%s"' % (
                        os.path.join(tmp_dir, page),
                        os.path.join(
                            tmp_dir,
                            'page-%i.bin.png' % int(page.split('.', 1)[0])
                        )
                    )
                    post_binarisation_jobs.append(
                        self.addTask(
                            command=cmd,
                            dependencies='binarisation_job_-_%i' % index,
                            label='post_binarisation_job_-_%i-%i' % (
                                index,
                                number
                            )
                        )
                    )

        # The OCR jobs are created based on the output files of either the
        # split jobs or the post binarisation jobs. So wait until they are
        # finished.
        self.waitForTasks()

        # ##################################################
        # # Optical Character Recognition                  #
        # ##################################################
        # Tesseract runs fastest with four cores. So we run it with either
        # four or, if there are fewer than four cores available for this
        # workflow, the available core number.
        ocr_job_n_cores = min(4, self.n_cores)
        # WORKAROUND: Tesseract only uses one core for the deu_frak
        # language model, so the workflow will also only reserve one in
        # this case.
        if self.lang == "deu_frak":
            ocr_job_n_cores = 1
        page_suffix = '.tif' if self.skip_binarisation else '.bin.png'
        ocr_jobs = []
        for index, job in enumerate(self.jobs):
            tmp_dir = self._ocr_tmp_dir(job)
            pages = self._ocr_numbered_files(tmp_dir, page_suffix)
            for number, page in enumerate(pages):
                page_path = os.path.join(tmp_dir, page)
                # Tesseract appends the extensions itself. The base is the
                # page path without its suffix; joining it onto the
                # temporary directory again would duplicate the prefix for
                # relative output directories.
                output_base = page_path[:-len(page_suffix)]
                cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % (
                    page_path,
                    output_base,
                    self.lang
                )
                if self.skip_binarisation:
                    ocr_job_dependencies = 'split_job_-_%i' % index
                else:
                    rename_label = 'post_binarisation_job_-_%i-%i' % (
                        index,
                        number
                    )
                    ocr_job_dependencies = [
                        label for label in post_binarisation_jobs
                        if label == rename_label
                    ]
                ocr_jobs.append(
                    self.addTask(
                        command=cmd,
                        dependencies=ocr_job_dependencies,
                        label='ocr_job_-_%i-%i' % (index, number),
                        nCores=ocr_job_n_cores
                    )
                )

        # The following jobs are created based on the output files of the
        # OCR jobs. So wait until they are finished.
        self.waitForTasks()

        # ##################################################
        # # Create TEI P5 files, merge PDF and text files   #
        # ##################################################
        # (label format, page file suffix, command format, result suffix)
        merge_stages = (
            ('hocr_to_tei_job_-_%i', '.hocr', 'hocrtotei %s "%s"', '.xml'),
            ('pdf_merge_job_-_%i', '.pdf', 'pdfunite %s "%s"', '.pdf'),
            ('txt_merge_job_-_%i', '.txt', 'cat %s > "%s"', '.txt')
        )
        for label_format, suffix, cmd_format, result_suffix in merge_stages:
            for index, job in enumerate(self.jobs):
                tmp_dir = self._ocr_tmp_dir(job)
                pages = self._ocr_numbered_files(tmp_dir, suffix)
                cmd = cmd_format % (
                    ' '.join(
                        '"%s"' % os.path.join(tmp_dir, page) for page in pages
                    ),
                    os.path.join(job['output_dir'], job['name'] + result_suffix)
                )
                # The trailing '-' keeps job 1 from also matching the OCR
                # labels of the jobs 10, 11, ...
                ocr_label_prefix = 'ocr_job_-_%i-' % index
                self.addTask(
                    command=cmd,
                    dependencies=[
                        label for label in ocr_jobs
                        if label.startswith(ocr_label_prefix)
                    ],
                    label=label_format % index
                )

        # ##################################################
        # # Cleanup                                        #
        # ##################################################
        for index, job in enumerate(self.jobs):
            tmp_dir = self._ocr_tmp_dir(job)
            if self.keep_intermediates:
                # Sort the intermediate files into one directory per type.
                moves = [
                    ('*.hocr', 'hocr'),
                    ('*.pdf', 'pdf'),
                    ('*.tif', 'tiff'),
                    ('*.txt', 'txt')
                ]
                if not self.skip_binarisation:
                    moves.append(('*.bin.png', 'bin.png'))
                cmd = ' && '.join(
                    'mv "%s"/%s "%s"' % (
                        tmp_dir,
                        pattern,
                        os.path.join(tmp_dir, directory)
                    )
                    for pattern, directory in moves
                )
                if not self.skip_binarisation:
                    cmd += ' && rm "%s"/*.nrm.png' % tmp_dir
            else:
                cmd = 'rm -r "%s"' % tmp_dir
            self.addTask(
                command=cmd,
                dependencies=[
                    'hocr_to_tei_job_-_%i' % index,
                    'pdf_merge_job_-_%i' % index,
                    'txt_merge_job_-_%i' % index
                ],
                label='cleanup_job_-_%i' % index
            )

    @staticmethod
    def _ocr_tmp_dir(job):
        """
        Return the directory that holds the intermediate files of job.
        """
        return os.path.join(job['output_dir'], 'tmp')

    @staticmethod
    def _ocr_numbered_files(directory, suffix):
        """
        Return the names of the files in directory ending with suffix,
        ordered by the first integer that occurs in each name.
        """
        names = [
            name for name in os.listdir(directory) if name.endswith(suffix)
        ]
        return sorted(
            names,
            key=lambda name: int(re.search(r'\d+', name).group(0))
        )


def analyze_jobs(input_dir, output_dir):
    """
    Recursively collect one job per PDF or TIFF file below input_dir.

    input_dir  -- directory that is searched for .pdf, .tif and .tiff files
    output_dir -- directory that mirrors input_dir for the results

    Returns a list of dictionaries with the keys
      filename   -- file name including its extension
      name       -- file name without its last extension
      output_dir -- output_dir extended by the file name
      path       -- path of the input file
    Directory entries are visited in sorted order so that the position of
    a job in the list does not depend on the order os.listdir() happens
    to return.
    """
    jobs = []

    for entry in sorted(os.listdir(input_dir)):
        entry_path = os.path.join(input_dir, entry)
        if os.path.isdir(entry_path):
            # Descend into sub directories and mirror them in the output.
            jobs.extend(
                analyze_jobs(entry_path, os.path.join(output_dir, entry))
            )
        elif entry.endswith(('.pdf', '.tif', '.tiff')):
            jobs.append(
                {
                    'filename': entry,
                    'name': entry.rsplit('.', 1)[0],
                    'output_dir': os.path.join(output_dir, entry),
                    'path': entry_path
                }
            )

    return jobs


def main():
    """
    Parse the command line, run the OCR workflow and exit with the value
    returned by the workflow run.
    """
    options = parse_arguments()
    exit_code = OCRWorkflow(options).run(
        dataDirRoot=options.output_dir,
        nCores=options.n_cores
    )
    sys.exit(exit_code)


# Only start the workflow when the file is executed as a script.
if __name__ == '__main__':
    main()