diff --git a/hocrtotei b/hocrtotei index 96a4045..a762d99 100755 --- a/hocrtotei +++ b/hocrtotei @@ -2,25 +2,15 @@ # coding=utf-8 from xml.sax.saxutils import escape -import argparse +from argparse import ArgumentParser import xml.etree.ElementTree as ET -parser = argparse.ArgumentParser( - description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.' -) -parser.add_argument( - 'i', - metavar='hOCR-sourcefile', - nargs='+' -) -parser.add_argument( - 'o', - metavar='TEI-destfile', -) +parser = ArgumentParser(description='Merges hOCR files to one P5 file.') +parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+') +parser.add_argument('o', metavar='TEI-destfile',) args = parser.parse_args() output_file = open(args.o, 'w') - output_file.write( '\n' + '\n' @@ -54,5 +44,4 @@ output_file.write( ' \n' + ' \n' + '') - output_file.close() diff --git a/ocr b/ocr index cf3dae1..52bba3e 100755 --- a/ocr +++ b/ocr @@ -10,292 +10,206 @@ Author: Patrick Jentsch """ -import argparse +from argparse import ArgumentParser +from pyflow import WorkflowRunner import multiprocessing import os import re import sys -from pyflow import WorkflowRunner -def parse_arguments(): - parser = argparse.ArgumentParser( - description='''Performs OCR of (historical) documents utilizing OCRopus - for preprocessing and Tesseract OCR for OCR. The results - are served as hOCR, PDF, raw text and TEI compliant XML - files.\n - Software requirements: imagemagick, ocropus, pdftoppm, - pdfunite, poppler-utils, pyflow, python2.7, python3.5, - tesseract''' - ) - parser.add_argument( - '-i', - dest='input_dir', - required=True - ) - parser.add_argument( - '-l', - choices=[ - 'deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', 'spa' - ], - dest='lang', - required=True - ) - parser.add_argument( - '-o', - dest='output_dir', - required=True - ) - parser.add_argument( - '--skip-binarisation', - action='store_true', - default=False, - dest='skip_binarisation', - help='skip ocropy binarisation', - required=False - ) - parser.add_argument( - '--keep-intermediates', - action='store_true', - default=False, - dest='keep_intermediates', - help='keep intermediate files', - required=False - ) - parser.add_argument( - '--nCores', - default=min(4, multiprocessing.cpu_count()), - dest='n_cores', - help='total number of cores available', - required=False, - type=int - ) - parser.add_argument( - '--zip', - default='ocr-result-files', - dest='zip', - type=str, - help='package result files in zip bundles and asign an filename prefix', - required=False - ) +TESSERACT_MODELS = ['deu', 'eng', 'enm', 'fra', 'frk', 'frm', 'ita', 'por', + 'spa'] + + +def parse_args(): + parser = ArgumentParser(description='OCR Pipeline utilizing tesseract.') + parser.add_argument('i') + parser.add_argument('o') + parser.add_argument('-l', '--language', choices=TESSERACT_MODELS, + required=True) + parser.add_argument('--binarize', action='store_true', + help='use ocropy binarisation') + parser.add_argument('--keep-intermediates', action='store_true', + help='keep intermediate files') + parser.add_argument('--n-cores', + default=min(4, multiprocessing.cpu_count()), + help='total number of cores available', type=int) + parser.add_argument('--log-dir') + parser.add_argument('--zip') return parser.parse_args() -class OCRWorkflow(WorkflowRunner): - def __init__(self, args): - self.jobs = analyze_jobs(args.input_dir, args.output_dir) - self.skip_binarisation = args.skip_binarisation - self.keep_intermediates = args.keep_intermediates - self.lang = args.lang - self.n_cores = args.n_cores - self.output_dir = args.output_dir - self.zip = args.zip +class OCRPipelineJob: + def __init__(self, file, output_dir): + self.file = file + self.name = os.path.basename(file).rsplit('.', 1)[0] + self.output_dir = output_dir + + +class OCRPipeline(WorkflowRunner): + def __init__(self, binarize, jobs, keep_intermediates, lang, n_cores, + output_dir, zip): + self.binarize = binarize + self.jobs = jobs + self.keep_intermediates = keep_intermediates + self.lang = lang + self.n_cores = n_cores + self.output_dir = output_dir + self.zip = zip def workflow(self): - if len(self.jobs) == 0: + if not self.jobs: return ''' ' ################################################## - ' # Create output directories # + ' # mkdir_jobs # ' ################################################## ''' - create_output_directories_jobs = [] - for index, job in enumerate(self.jobs): - cmd = 'mkdir -p "%s"' % ( - os.path.join(job['output_dir'], 'tmp') - ) + mkdir_jobs = [] + for i, job in enumerate(self.jobs): + output_dir = os.path.join(job.output_dir, 'tmp') + cmd = 'mkdir' + cmd += ' -p' + cmd += ' "{}"'.format(output_dir) if self.keep_intermediates: - cmd += ' "%s" "%s" "%s" "%s"' % ( - os.path.join(job['output_dir'], 'tmp', 'hocr'), - os.path.join(job['output_dir'], 'tmp', 'pdf'), - os.path.join(job['output_dir'], 'tmp', 'tiff'), - os.path.join(job['output_dir'], 'tmp', 'txt') - ) - if not self.skip_binarisation: - cmd += ' "%s"' % ( - os.path.join(job['output_dir'], 'tmp', 'bin.png') - ) - create_output_directories_jobs.append( - self.addTask( - command=cmd, - label='create_output_directories_job_-_%i' % (index) - ) - ) + cmd += ' "{}"'.format(os.path.join(output_dir, 'hocr')) + cmd += ' "{}"'.format(os.path.join(output_dir, 'pdf')) + cmd += ' "{}"'.format(os.path.join(output_dir, 'tiff')) + cmd += ' "{}"'.format(os.path.join(output_dir, 'txt')) + if self.binarize: + cmd += ' "{}"'.format(os.path.join(output_dir, 'bin.png')) + cmd += ' "{}"'.format(os.path.join(output_dir, 'nrm.png')) + lbl = 'mkdir_job_-_{}'.format(i) + mkdir_jobs.append(self.addTask(command=cmd, label=lbl)) ''' ' ################################################## - ' # Split # + ' # pdftoppm_jobs # ' ################################################## ''' - split_jobs = [] - split_job_n_cores = min( - self.n_cores, - max(1, int(self.n_cores / len(self.jobs))) - ) - for index, job in enumerate(self.jobs): - if job['filename'].endswith(('.tif', '.tiff')): - ''' - ' This command also works for PDF input but ocropus-nlbin - ' is not able to handle the TIFF output of it. - ''' - cmd = 'convert -density 300 "%s" -compress LZW -scene 1 "%s/page-%%d.tif"' % ( - job['path'], - os.path.join(job['output_dir'], 'tmp') - ) - else: - cmd = 'pdftoppm -r 300 -tiff -tiffcompression lzw "%s" "%s"' % ( - job['path'], - os.path.join(job['output_dir'], 'tmp', 'page') - ) + pdftoppm_jobs = [] + n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs)))) + for i, job in enumerate(self.jobs): + output_dir = os.path.join(job.output_dir, 'tmp') + output_file_base = os.path.join(output_dir, 'page') + cmd = 'pdftoppm' + cmd += ' -r 300' + cmd += ' -tiff' + cmd += ' -tiffcompression lzw' + cmd += ' "{}" "{}"'.format(job.file, output_file_base) + deps = 'mkdir_job_-_{}'.format(i) + lbl = 'pdftoppm_job_-_{}'.format(i) + pdftoppm_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl, nCores=n_cores)) - split_jobs.append( - self.addTask( - command=cmd, - dependencies='create_output_directories_job_-_%i' % (index), - label='split_job_-_%i' % (index), - nCores=split_job_n_cores - ) - ) - - if not self.skip_binarisation: + if self.binarize: ''' - ' The binarisation_jobs are created based of the output files of - ' the split_jobs. So wait until they are finished. + ' The ocropus_nlbin_jobs list is created based on the output files + ' of the pdftoppm_jobs. So wait until they are finished. ''' self.waitForTasks() ''' ' ################################################## - ' # Binarise # + ' # ocropus_nlbin_jobs # ' ################################################## ''' - binarisation_jobs = [] + ocropus_nlbin_jobs = [] ''' ' We run ocropus-nlbin with either four or, if there are less then ' four cores available for this workflow, the available core ' number. ''' - binarisation_job_n_cores = min(4, self.n_cores) - for index, job in enumerate(self.jobs): - files = os.listdir(os.path.join(job['output_dir'], 'tmp')) - files = filter(lambda x: x.endswith('.tif'), files) + n_cores = min(4, self.n_cores) + for i, job in enumerate(self.jobs): + input_dir = os.path.join(job.output_dir, 'tmp') + output_dir = input_dir + files = filter(lambda x: x.endswith('.tif'), + os.listdir(input_dir)) files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) - files = map( - lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"', - files - ) - cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % ( - os.path.join(job['output_dir'], 'tmp'), - binarisation_job_n_cores, - ' '.join(files) - ) - binarisation_jobs.append( - self.addTask( - command=cmd, - dependencies='split_job_-_%i' % (index), - label='binarisation_job_-_%i' % (index), - nCores=binarisation_job_n_cores - ) - ) + files = map(lambda x: os.path.join(input_dir, x), files) + cmd = 'ocropus-nlbin "{}"'.format('" "'.join(files)) + cmd += ' -o "{}"'.format(output_dir) + cmd += ' -Q "{}"'.format(n_cores) + deps = 'pdftoppm_job_-_{}'.format(i) + lbl = 'ocropus_nlbin_job_-_{}'.format(i) + ocropus_nlbin_jobs.append( + self.addTask(command=cmd, dependencies=deps, label=lbl, + nCores=n_cores)) ''' - ' The post_binarisation_jobs are created based of the output files - ' of the binarisation_jobs. So wait until they are finished. + ' The post_ocropus_nlbin_jobs are created based on the output files + ' of the ocropus_nlbin_jobs. So wait until they are finished. ''' self.waitForTasks() ''' ' ################################################## - ' # Normalise file names from binarisation # + ' # post_ocropus_nlbin_jobs # ' ################################################## ''' - post_binarisation_jobs = [] - for index, job in enumerate(self.jobs): + post_ocropus_nlbin_jobs = [] + for i, job in enumerate(self.jobs): + input_dir = os.path.join(job.output_dir, 'tmp') + output_dir = input_dir number = 0 - files = os.listdir(os.path.join(job['output_dir'], 'tmp')) - files = filter(lambda x: x.endswith('.bin.png'), files) + files = filter(lambda x: x.endswith('.bin.png'), + os.listdir(input_dir)) files.sort() for file in files: - cmd = 'mv "%s" "%s"' % ( - os.path.join(job['output_dir'], 'tmp', file), - os.path.join( - job['output_dir'], - 'tmp', - 'page-%i.bin.png' % (int(file.split('.', 1)[0])) - ) - ) - post_binarisation_jobs.append( - self.addTask( - command=cmd, - dependencies='binarisation_job_-_%i' % (index), - label='post_binarisation_job_-_%i-%i' % ( - index, - number - ) - ) - ) + # int conversion is done in order to trim leading zeros + output_file = os.path.join(output_dir, 'page-{}.bin.png'.format(int(file.split('.', 1)[0]))) # noqa + cmd = 'mv "{}" "{}"'.format(os.path.join(output_dir, file), + output_file) + deps = 'ocropus_nlbin_job_-_{}'.format(i) + lbl = 'post_ocropus_nlbin_job_-_{}-{}'.format(i, number) + post_ocropus_nlbin_jobs.append( + self.addTask(command=cmd, dependencies=deps, + label=lbl)) number += 1 ''' - ' The ocr_jobs are created based of the output files of either the - ' split_jobs or post_binarisation_jobs. So wait until they are + ' The tesseract_jobs are created based of the output files of either + ' the pdftoppm_jobs or post_ocropus_nlbin_jobs. So wait until they are ' finished. ''' self.waitForTasks() ''' ' ################################################## - ' # Optical Character Recognition # + ' # tesseract_jobs # ' ################################################## ''' - ocr_jobs = [] + tesseract_jobs = [] ''' ' Tesseract runs fastest with four cores. So we run it with either four ' or, if there are less then four cores available for this workflow, ' the available core number. ''' - ocr_job_n_cores = min(4, self.n_cores) - for index, job in enumerate(self.jobs): - files = os.listdir(os.path.join(job['output_dir'], 'tmp')) - if self.skip_binarisation: - files = filter(lambda x: x.endswith('.tif'), files) - else: - files = filter(lambda x: x.endswith('.bin.png'), files) + n_cores = min(4, self.n_cores) + for i, job in enumerate(self.jobs): + input_dir = os.path.join(job.output_dir, 'tmp') + output_dir = input_dir + files = filter(lambda x: x.endswith('.bin.png' if self.binarize else '.tif'), # noqa + os.listdir(input_dir)) files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) - files = map( - lambda x: os.path.join(job['output_dir'], 'tmp', x), - files - ) + files = map(lambda x: os.path.join(input_dir, x), files) number = 0 for file in files: - cmd = 'tesseract "%s" "%s" -l "%s" hocr pdf txt' % ( - file, - os.path.join( - job['output_dir'], - 'tmp', - file.rsplit('.', 1 if self.skip_binarisation else 2)[0] - ), - self.lang - ) - if self.skip_binarisation: - ocr_job_dependencies = 'split_job_-_%i' % (index) + output_file_base = os.path.join(output_dir, file.rsplit('.', 2 if self.binarize else 1)[0]) # noqa + cmd = 'tesseract "{}" "{}"'.format(file, output_file_base) + cmd += ' -l "{}"'.format(self.lang) + cmd += ' hocr pdf txt' + if self.binarize: + deps = 'post_ocropus_nlbin_job_-_{}-{}'.format(i, number) else: - ocr_job_dependencies = filter( - lambda x: x == 'post_binarisation_job_-_%i-%i' % ( - index, - number - ), - post_binarisation_jobs - ) - ocr_jobs.append( - self.addTask( - command=cmd, - dependencies=ocr_job_dependencies, - label='ocr_job_-_%i-%i' % (index, number), - nCores=ocr_job_n_cores - ) - ) + deps = 'pdftoppm_job_-_{}'.format(i) + label = 'tesseract_jobs_-_{}-{}'.format(i, number) + tesseract_jobs.append( + self.addTask(command=cmd, dependencies=deps, label=label, + nCores=n_cores)) number += 1 ''' @@ -306,251 +220,191 @@ class OCRWorkflow(WorkflowRunner): ''' ' ################################################## - ' # Create TEI P5 files # + ' # hocrtotei_jobs # ' ################################################## ''' - hocr_to_tei_jobs = [] - for index, job in enumerate(self.jobs): - files = os.listdir(os.path.join(job['output_dir'], 'tmp')) - files = filter(lambda x: x.endswith('.hocr'), files) + hocrtotei_jobs = [] + for i, job in enumerate(self.jobs): + input_dir = os.path.join(job.output_dir, 'tmp') + files = filter(lambda x: x.endswith('.hocr'), + os.listdir(input_dir)) files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) - files = map( - lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"', - files - ) - cmd = 'hocrtotei %s "%s"' % ( - ' '.join(files), - os.path.join( - job['output_dir'], - os.path.join(job['output_dir'], job['name'] + '.xml') - ) - ) - hocr_to_tei_jobs.append( - self.addTask( - command=cmd, - dependencies=filter( - lambda x: x.startswith('ocr_job_-_%i' % (index)), - ocr_jobs - ), - label='hocr_to_tei_job_-_%i' % (index) - ) - ) + files = map(lambda x: os.path.join(input_dir, x), files) + output_file = os.path.join(job.output_dir, + '{}.xml'.format(job.name)) + cmd = 'hocrtotei "{}" "{}"'.format('" "'.join(files), output_file) + deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)), + tesseract_jobs) + lbl = 'hocrtotei_job_-_{}'.format(i) + hocrtotei_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl)) ''' ' ################################################## - ' # Merge PDF files # + ' # pdfunite_jobs # ' ################################################## ''' - pdf_merge_jobs = [] - for index, job in enumerate(self.jobs): - files = os.listdir(os.path.join(job['output_dir'], 'tmp')) - files = filter(lambda x: x.endswith('.pdf'), files) + pdfunite_jobs = [] + for i, job in enumerate(self.jobs): + input_dir = os.path.join(job.output_dir, 'tmp') + files = filter(lambda x: x.endswith('.pdf'), os.listdir(input_dir)) files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) - files = map( - lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"', - files - ) - cmd = 'pdfunite %s "%s"' % ( - ' '.join(files), - os.path.join( - job['output_dir'], - os.path.join(job['output_dir'], job['name'] + '.pdf') - ) - ) - pdf_merge_jobs.append( - self.addTask( - command=cmd, - dependencies=filter( - lambda x: x.startswith('ocr_job_-_%i' % (index)), - ocr_jobs - ), - label='pdf_merge_job_-_%i' % (index) - ) - ) + files = map(lambda x: os.path.join(input_dir, x), files) + output_file = os.path.join(job.output_dir, + '{}.pdf'.format(job.name)) + cmd = 'pdfunite "{}" "{}"'.format('" "'.join(files), output_file) + deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)), + tesseract_jobs) + lbl = 'pdfunite_job_-_{}'.format(i) + pdfunite_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl)) ''' ' ################################################## - ' # Merge text files # + ' # cat_jobs # ' ################################################## ''' - txt_merge_jobs = [] - for index, job in enumerate(self.jobs): - files = os.listdir(os.path.join(job['output_dir'], 'tmp')) - files = filter(lambda x: x.endswith('.txt'), files) + cat_jobs = [] + for i, job in enumerate(self.jobs): + input_dir = os.path.join(job.output_dir, 'tmp') + files = filter(lambda x: x.endswith('.txt'), os.listdir(input_dir)) files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) - files = map( - lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"', - files - ) - cmd = 'cat %s > "%s"' % ( - ' '.join(files), - os.path.join( - job['output_dir'], - os.path.join(job['output_dir'], job['name'] + '.txt') - ) - ) - txt_merge_jobs.append( - self.addTask( - command=cmd, - dependencies=filter( - lambda x: x.startswith('ocr_job_-_%i' % (index)), - ocr_jobs - ), - label='txt_merge_job_-_%i' % (index) - ) - ) - - if self.zip: - all_zip_jobs = [] - all_zip_job_dependencies = (hocr_to_tei_jobs - + pdf_merge_jobs - + txt_merge_jobs) - cmd = 'cd "%s" && zip "%s"-all-ocr-files.zip */*.{pdf,txt,xml} -x "pyflow.data*" && cd -' % ( - self.output_dir, - self.zip - ) - all_zip_jobs.append( - self.addTask( - command=cmd, - dependencies=all_zip_job_dependencies, - label='all_zip_job' - ) - ) - - pdf_zip_jobs = [] - pdf_zip_job_dependencies = all_zip_jobs - cmd = 'cd "%s" && zip -m "%s"-ocr-pdf.zip */*.pdf -x "pyflow.data*" && cd -' % ( - self.output_dir, - self.zip - ) - pdf_zip_jobs.append( - self.addTask( - command=cmd, - dependencies=pdf_zip_job_dependencies, - label='pdf_zip_job' - ) - ) - - txt_zip_jobs = [] - txt_zip_job_dependencies = all_zip_jobs - cmd = 'cd "%s" && zip -m "%s"-ocr-txt.zip */*.txt -x "pyflow.data*" && cd -' % ( - self.output_dir, - self.zip - ) - txt_zip_jobs.append( - self.addTask( - command=cmd, - dependencies=txt_zip_job_dependencies, - label='txt_zip_job' - ) - ) - - xml_zip_jobs = [] - xml_zip_job_dependencies = all_zip_jobs - cmd = 'cd "%s" && zip -m "%s"-ocr-xml.zip */*.xml -x "pyflow.data*" && cd -' % ( - self.output_dir, - self.zip - ) - xml_zip_jobs.append( - self.addTask( - command=cmd, - dependencies=xml_zip_job_dependencies, - label='xml_zip_job' - ) - ) + files = map(lambda x: os.path.join(input_dir, x), files) + output_file = os.path.join(job.output_dir, + '{}.txt'.format(job.name)) + cmd = 'cat "{}" > "{}"'.format('" "'.join(files), output_file) + deps = filter(lambda x: x.startswith('ocr_job_-_{}'.format(i)), + tesseract_jobs) + lbl = 'cat_job_-_{}'.format(i) + cat_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl)) ''' ' ################################################## - ' # Cleanup # + ' # zip_jobs # ' ################################################## ''' - cleanup_jobs = [] + zip_jobs = [] + if self.zip is not None: + cmd = 'cd "{}"'.format(self.output_dir) + cmd += ' && ' + cmd += 'zip' + cmd += ' -r' + cmd += ' "{}_-_all" .'.format(self.zip) + cmd += ' -x "pyflow.data*" "*tmp*"' + cmd += ' -i "*.pdf" "*.txt" "*.xml"' + cmd += ' && ' + cmd += 'cd -' + deps = hocrtotei_jobs + pdfunite_jobs + cat_jobs + lbl = 'zip_job_-_all' + zip_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl)) + + cmd = 'cd "{}"'.format(self.output_dir) + cmd += ' && ' + cmd += 'zip' + cmd += ' -m' + cmd += ' -r' + cmd += ' "{}_-_pdf" .'.format(self.zip) + cmd += ' -x "pyflow.data*" "*tmp*"' + cmd += ' -i "*.pdf"' + cmd += ' && ' + cmd += 'cd -' + deps = 'zip_job_-_all' + lbl = 'zip_job_-_pdf' + zip_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl)) + + cmd = 'cd "{}"'.format(self.output_dir) + cmd += ' && ' + cmd += 'zip' + cmd += ' -m' + cmd += ' -r' + cmd += ' "{}_-_txt" .'.format(self.zip) + cmd += ' -x "pyflow.data*" "*tmp*"' + cmd += ' -i "*.txt"' + cmd += ' && ' + cmd += 'cd -' + deps = 'zip_job_-_all' + lbl = 'zip_job_-_txt' + zip_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl)) + + cmd = 'cd "{}"'.format(self.output_dir) + cmd += ' && ' + cmd += 'zip' + cmd += ' -m' + cmd += ' -r' + cmd += ' "{}_-_xml" .'.format(self.zip) + cmd += ' -x "pyflow.data*" "*tmp*"' + cmd += ' -i "*.xml"' + cmd += ' && ' + cmd += 'cd -' + deps = 'zip_job_-_all' + lbl = 'zip_job_-_xml' + zip_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl)) + + ''' + ' ################################################## + ' # mv_jobs # + ' ################################################## + ''' + mv_jobs = [] if self.keep_intermediates: - for index, job in enumerate(self.jobs): - cleanup_job_dependencies = [ - 'hocr_to_tei_job_-_%i' % (index), - 'pdf_merge_job_-_%i' % (index), - 'txt_merge_job_-_%i' % (index) - ] - cmd = 'mv "%s"/*.hocr "%s"' % ( - os.path.join(job['output_dir'], 'tmp'), - os.path.join(job['output_dir'], 'tmp', 'hocr'), - ) - cmd += ' && mv "%s"/*.pdf "%s"' % ( - os.path.join(job['output_dir'], 'tmp'), - os.path.join(job['output_dir'], 'tmp', 'pdf'), - ) - cmd += ' && mv "%s"/*.tif "%s"' % ( - os.path.join(job['output_dir'], 'tmp'), - os.path.join(job['output_dir'], 'tmp', 'tiff'), - ) - cmd += ' && mv "%s"/*.txt "%s"' % ( - os.path.join(job['output_dir'], 'tmp'), - os.path.join(job['output_dir'], 'tmp', 'txt'), - ) - if not self.skip_binarisation: - cmd += ' && mv "%s"/*.bin.png "%s"' % ( - os.path.join(job['output_dir'], 'tmp'), - os.path.join(job['output_dir'], 'tmp', 'bin.png'), - ) - cmd += ' && rm "%s"/*.nrm.png' % ( - os.path.join(job['output_dir'], 'tmp') - ) - cleanup_jobs.append( - self.addTask( - command=cmd, - dependencies=cleanup_job_dependencies, - label='cleanup_job_-_%i' % (index) - ) - ) + for i, job in enumerate(self.jobs): + input_dir = os.path.join(job.output_dir, 'tmp') + output_dir = input_dir + cmd = 'mv "{}"/*.hocr "{}"'.format( + input_dir, os.path.join(output_dir, 'hocr')) + cmd += ' && ' + cmd += 'mv "{}"/*.pdf "{}"'.format(input_dir, os.path.join(output_dir, 'pdf')) # noqa + cmd += ' && ' + cmd += 'mv "{}"/*.tif "{}"'.format(input_dir, os.path.join(output_dir, 'tiff')) # noqa + cmd += ' && ' + cmd += 'mv "{}"/*.txt "{}"'.format(input_dir, os.path.join(output_dir, 'txt')) # noqa + if self.binarize: + cmd += ' && ' + cmd += 'mv "{}"/*.bin.png "{}"'.format(input_dir, os.path.join(output_dir, 'bin.png')) # noqa + cmd += ' && ' + cmd += 'mv "{}"/*.nrm.png "{}"'.format(input_dir, os.path.join(output_dir, 'nrm.png')) # noqa + deps = ['hocrtotei_job_-_{}'.format(i), + 'pdfunite_job_-_{}'.format(i), + 'cat_job_-_{}'.format(i)] + lbl = 'mv_job_-_{}'.format(i) + mv_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl)) else: - for index, job in enumerate(self.jobs): - cleanup_job_dependencies = [ - 'hocr_to_tei_job_-_%i' % (index), - 'pdf_merge_job_-_%i' % (index), - 'txt_merge_job_-_%i' % (index) - ] - cmd = 'rm -r "%s"' % ( - os.path.join(job['output_dir'], 'tmp') - ) - cleanup_jobs.append( - self.addTask( - command=cmd, - dependencies=cleanup_job_dependencies, - label='cleanup_job_-_%i' % (index) - ) - ) + for i, job in enumerate(self.jobs): + input_dir = os.path.join(job.output_dir, 'tmp') + cmd = 'rm -r "{}"'.format(input_dir) + deps = ['hocrtotei_job_-_{}'.format(i), + 'pdfunite_job_-_{}'.format(i), + 'cat_job_-_{}'.format(i)] + lbl = 'mv_job_-_{}'.format(i) + mv_jobs.append(self.addTask(command=cmd, dependencies=deps, + label=lbl)) -def analyze_jobs(input_dir, output_dir): +def collect_jobs(input_dir, output_dir): jobs = [] - for file in os.listdir(input_dir): if os.path.isdir(os.path.join(input_dir, file)): - jobs += analyze_jobs( - os.path.join(input_dir, file), - os.path.join(output_dir, file) - ) - elif file.endswith(('.pdf', '.tif', '.tiff')): - jobs.append( - { - 'filename': file, - 'name': file.rsplit('.', 1)[0], - 'output_dir': os.path.join(output_dir, file), - 'path': os.path.join(input_dir, file) - } - ) - + jobs += collect_jobs(os.path.join(input_dir, file), + os.path.join(output_dir, file)) + elif file.endswith('.pdf'): + jobs.append(OCRPipelineJob(os.path.join(input_dir, file), + os.path.join(output_dir, file))) return jobs def main(): - args = parse_arguments() - - wflow = OCRWorkflow(args) - - retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores) - + args = parse_args() + jobs = collect_jobs(args.i, args.o) + ocr_pipeline = OCRPipeline(args.binarize, jobs, args.keep_intermediates, + args.language, args.n_cores, args.o, args.zip) + retval = ocr_pipeline.run(dataDirRoot=(args.log_dir or args.o), + nCores=args.n_cores) sys.exit(retval) diff --git a/wrapper/ocr b/wrapper/ocr index d168210..4c38f25 100755 --- a/wrapper/ocr +++ b/wrapper/ocr @@ -1,39 +1,29 @@ #!/usr/bin/env python3 # coding=utf-8 -import argparse +from argparse import ArgumentParser import os import subprocess -container_image = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest' -container_input_dir = '/input' -container_output_dir = '/output' -uid = str(os.getuid()) -gid = str(os.getgid()) +CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/ocr:latest' +CONTAINER_INPUT_DIR = '/input' +CONTAINER_OUTPUT_DIR = '/output' +UID = str(os.getuid()) +GID = str(os.getgid()) -parser = argparse.ArgumentParser(add_help=False) -parser.add_argument( - '-i', - dest='input_dir', - required=False -) -parser.add_argument( - '-o', - dest='output_dir', - required=False -) +parser = ArgumentParser(add_help=False) +parser.add_argument('-i') +parser.add_argument('-o') args, remaining_args = parser.parse_known_args() -cmd = ['docker', 'run', '--rm', '-it', '-u', uid + ':' + gid] -if args.input_dir is not None: - host_input_dir = os.path.abspath(args.input_dir) - cmd += ['-v', host_input_dir + ':' + container_input_dir] - remaining_args += ['-i', container_input_dir] -if args.output_dir is not None: - host_output_dir = os.path.abspath(args.output_dir) - cmd += ['-v', host_output_dir + ':' + container_output_dir] - remaining_args += ['-o', container_output_dir] -cmd.append(container_image) +cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] +if args.o is not None: + cmd += ['-v', '{}:{}'.format(os.path.abspath(args.o), CONTAINER_OUTPUT_DIR)] + remaining_args.insert(0, CONTAINER_OUTPUT_DIR) +if args.i is not None: + cmd += ['-v', '{}:{}'.format(os.path.abspath(args.i), CONTAINER_INPUT_DIR)] + remaining_args.insert(0, CONTAINER_INPUT_DIR) +cmd.append(CONTAINER_IMAGE) cmd += remaining_args subprocess.run(cmd)