diff --git a/README.md b/README.md index 98834e5..a9ec050 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,24 @@ This software implements a heavily parallelized pipeline to recognize text in PD 3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details. ```bash cd / -ocr -i input -o output -m models/ -l -# or -ocr -i input -o output -m models/* -l +# The -m value is the model filename without the ".traineddata" suffix +ocr \ + --input-dir input \ + --output-dir output \ + --model-file models/ + -m +# More than one model +ocr \ + --input-dir input \ + --output-dir output \ + --model-file models/ + --model-file models/ + -m + +# Instead of multiple --model-file statements, you can also use +ocr \ + --input-dir input \ + --output-dir output \ + --model-file models/* + -m + ``` 4. Check your results in the `//output` directory. diff --git a/hocr-combine b/hocr-combine index 4008890..d4b606c 100755 --- a/hocr-combine +++ b/hocr-combine @@ -1,33 +1,42 @@ #!/usr/bin/env python3.7 # coding=utf-8 -""""Combine multiple hOCR files.""" +''' Combine multiple hOCR files. 
''' from argparse import ArgumentParser from lxml import html parser = ArgumentParser(description='Combine multiple hOCR files.') -parser.add_argument('file', help='Input file(s)', nargs='+') -parser.add_argument('-o', '--output-file', help='Output file', required=True) +parser.add_argument( + '-i', '--input-file', + help='Input file', + nargs='+', + required=True +) +parser.add_argument( + '-o', '--output-file', + help='Output file', + required=True +) args = parser.parse_args() +print(args) - -for file in args.file: - files = [] - if file.startswith('@'): - with open(file[1:], 'r') as f: - files += [x for x in f.read().split("\n") if x != ''] +for input_file in args.input_file: + input_files = [] + if input_file.startswith('@'): + with open(input_file[1:], 'r') as f: + input_files += [x for x in f.read().split("\n") if x != ''] else: - files.append(file) -if len(files) == 0: + input_files.append(input_file) +if len(input_files) == 0: exit(1) -hocr = html.parse(files[0]) +hocr = html.parse(input_files[0]) hocr_body = hocr.find('body') -for file in files[1:]: - for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'): +for input_file in input_files[1:]: + for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'): hocr_body.append(ocr_page) diff --git a/hocr2tei b/hocr2tei index 04a3db7..0350efb 100755 --- a/hocr2tei +++ b/hocr2tei @@ -1,7 +1,7 @@ #!/usr/bin/env python3.7 # coding=utf-8 -""""Convert hOCR to TEI XML.""" +''' Convert hOCR to TEI XML. 
''' from argparse import ArgumentParser from lxml import html @@ -10,8 +10,15 @@ import re parser = ArgumentParser(description='Convert hOCR to TEI XML.') -parser.add_argument('file', help='Input file') -parser.add_argument('-o', '--output-file', help='Output file', required=True) +parser.add_argument( + '-i', '--input-file', + help='Input file' +) +parser.add_argument( + '-o', '--output-file', + help='Output file', + required=True +) args = parser.parse_args() @@ -32,7 +39,7 @@ tei += ' \n' tei += ' \n' tei += ' \n' tei += ' \n' -hocr = html.parse(args.file) +hocr = html.parse(args.input_file) for ocr_page in hocr.findall('.//div[@class="ocr_page"]'): ocr_page_title_attrib = ocr_page.attrib.get('title') facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1) @@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'): tei += '

\n' for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'): tei += ' ' - indent = '' + is_first_word_in_line = True for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'): if ocrx_word.text is not None: - tei += indent + escape(ocrx_word.text) - indent = ' ' + if not is_first_word_in_line: + tei += ' ' + tei += escape(ocrx_word.text) + is_first_word_in_line = False tei += '\n' tei += '

\n' tei += ' \n' diff --git a/ocr b/ocr index 5e13c5f..f8a31fe 100755 --- a/ocr +++ b/ocr @@ -8,7 +8,6 @@ __version__ = '0.1.0' from argparse import ArgumentParser from pyflow import WorkflowRunner import json -import multiprocessing import os import sys @@ -52,8 +51,8 @@ class SplitInputWorkflow(WorkflowRunner): cmd += ' -dQUIET' cmd += ' -r300' cmd += ' -sDEVICE=png16m' - cmd += ' -sOutputFile="{}/page-%d.png"'.format( - os.path.join(self.job.tmp_dir, 'images') + cmd += ' -sOutputFile="{}"'.format( + os.path.join(self.job.tmp_dir, 'images', 'page-%d.png') ) cmd += ' "{}"'.format(self.job.file) self.addTask( @@ -82,13 +81,18 @@ class BinarizationWorkflow(WorkflowRunner): os.path.join(self.job.tmp_dir, 'images', 'inputs.txt') ) cmd += ' && ' - cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa + cmd += 'ocropus-nlbin "@{}"'.format( + os.path.join(self.job.tmp_dir, 'images', 'inputs.txt') + ) cmd += ' --nocheck' cmd += ' --output "{}"'.format( - os.path.join(self.job.tmp_dir, 'images')) + os.path.join(self.job.tmp_dir, 'images') + ) cmd += ' --parallel "{}"'.format(n_cores) cmd += ' && ' - cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa + cmd += 'rm "{}"'.format( + os.path.join(self.job.tmp_dir, 'images', 'inputs.txt') + ) ocropus_nlbin_task = self.addTask( 'ocropus_nlbin', command=cmd, @@ -130,9 +134,9 @@ class BinarizationWorkflow(WorkflowRunner): class OCRWorkflow(WorkflowRunner): - def __init__(self, job, lang): + def __init__(self, job, model): self.job = job - self.lang = lang + self.model = model def workflow(self): ''' @@ -148,7 +152,7 @@ class OCRWorkflow(WorkflowRunner): os.path.join(self.job.tmp_dir, 'images', file), os.path.join(self.job.tmp_dir, file[:-4]) ) - cmd += ' -l "{}"'.format(self.lang) + cmd += ' -l "{}"'.format(self.model) cmd += ' hocr pdf txt' cmd += ' || ' cmd += 'echo "${?}"' @@ -166,6 +170,7 @@ class OCRWorkflow(WorkflowRunner): ' # move_files # ' 
################################################## ''' + move_files_tasks = [] n_cores = 1 mem_mb = min(128, self.getMemMb()) for i, file_extension in enumerate(['hocr', 'pdf', 'txt']): @@ -174,24 +179,26 @@ class OCRWorkflow(WorkflowRunner): file_extension, os.path.join(self.job.tmp_dir, file_extension) ) - self.addTask( + task = self.addTask( 'move_{}_files'.format(file_extension), command=cmd, dependencies=tesseract_tasks, memMb=mem_mb, nCores=n_cores ) + move_files_tasks.append(task) cmd = 'mv "{}" "{}"'.format( os.path.join(self.job.tmp_dir, 'images'), os.path.join(self.job.output_dir) ) - self.addTask( + task = self.addTask( 'move_image_files', command=cmd, dependencies=tesseract_tasks, memMb=mem_mb, nCores=n_cores ) + move_files_tasks.append(task) class CreateHOCRWorkflow(WorkflowRunner): @@ -256,13 +263,14 @@ class CreateHOCRWorkflow(WorkflowRunner): ' ################################################## ''' n_cores = 1 - mem_mb = min(512, self.getMemMb()) + mem_mb = min(256, self.getMemMb()) cmd = 'ls -dv "{}/"* > "{}"'.format( os.path.join(self.job.tmp_dir, 'hocr'), os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt') ) cmd += ' && ' - cmd += 'hocr-combine "@{}"'.format( + cmd += 'hocr-combine' + cmd += ' --input-file "@{}"'.format( os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt') ) cmd += ' --output-file "{}.hocr"'.format( @@ -301,12 +309,17 @@ class CreatePDFWorkflow(WorkflowRunner): cmd += ' -dPDFSETTINGS=/ebook' cmd += ' -dQUIET' cmd += ' -sDEVICE=pdfwrite' - cmd += ' -sOutputFile="{}.pdf"'.format( - os.path.join(self.job.output_dir, self.job.name) + cmd += ' -sOutputFile="{}"'.format( + os.path.join(self.job.output_dir, '{}.pdf'.format(self.job.name)) ) cmd += ' && ' cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf')) - self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores) + self.addTask( + 'pdf_combine', + command=cmd, + memMb=mem_mb, + nCores=n_cores + ) class CreateTEIWorkflow(WorkflowRunner): @@ -320,14 +333,23 @@ 
class CreateTEIWorkflow(WorkflowRunner): ' ################################################## ''' n_cores = 1 - mem_mb = min(512, self.getMemMb()) - cmd = 'hocr2tei "{}.hocr"'.format( - os.path.join(self.job.output_dir, self.job.name) + mem_mb = min(256, self.getMemMb()) + cmd = 'hocr2tei' + cmd += ' --input-file "{}"'.format( + os.path.join(self.job.output_dir, '{}.hocr'.format(self.job.name)) ) - cmd += ' --output-file "{}.tei.xml"'.format( - os.path.join(self.job.output_dir, self.job.name) + cmd += ' --output-file "{}"'.format( + os.path.join( + self.job.output_dir, + '{}.tei.xml'.format(self.job.name) + ) + ) + self.addTask( + 'hocr2tei', + command=cmd, + memMb=mem_mb, + nCores=n_cores ) - self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores) class CreatePoCoZipWorkflow(WorkflowRunner): @@ -354,7 +376,12 @@ class CreatePoCoZipWorkflow(WorkflowRunner): cmd += 'rm -r images' cmd += ' && ' cmd += 'cd -' - task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores) + task = self.addTask( + 'zip', + command=cmd, + memMb=mem_mb, + nCores=n_cores + ) zip_tasks.append(task) @@ -377,13 +404,18 @@ class CreateTxtWorkflow(WorkflowRunner): cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name)) # noqa cmd += ' && ' cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt')) - self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores) + self.addTask( + 'txt_combine', + command=cmd, + memMb=mem_mb, + nCores=n_cores + ) class MainWorkflow(WorkflowRunner): - def __init__(self, input_dir, lang, output_dir, binarize): + def __init__(self, input_dir, model, output_dir, binarize): self.input_dir = input_dir - self.lang = lang + self.model = model self.output_dir = output_dir self.binarize = binarize self.jobs = [] @@ -419,11 +451,13 @@ class MainWorkflow(WorkflowRunner): ' # split-input # ' ################################################## ''' + split_input_tasks = [] for i, job in enumerate(self.jobs): - 
self.addWorkflowTask( + task = self.addWorkflowTask( 'split_input_-_{}'.format(i), SplitInputWorkflow(job) ) + split_input_tasks.append(task) if self.binarize: ''' @@ -431,12 +465,14 @@ class MainWorkflow(WorkflowRunner): ' # binarization # ' ################################################## ''' + binarization_tasks = [] for i, job in enumerate(self.jobs): - self.addWorkflowTask( + task = self.addWorkflowTask( 'binarization_-_{}'.format(i), BinarizationWorkflow(job), dependencies='split_input_-_{}'.format(i) ) + binarization_tasks.append(task) ''' ' ################################################## @@ -451,7 +487,7 @@ class MainWorkflow(WorkflowRunner): deps = 'split_input_-_{}'.format(i) task = self.addWorkflowTask( 'ocr_-_{}'.format(i), - OCRWorkflow(job, self.lang), + OCRWorkflow(job, self.model), dependencies=deps ) ocr_tasks.append(task) @@ -527,55 +563,80 @@ class MainWorkflow(WorkflowRunner): create_txt_tasks.append(task) self.waitForTasks() - output_files = [] + outputs = [] for job in self.jobs: # Remove temporary directory os.rmdir(job.tmp_dir) # Track output files - relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa - output_files.append( + relative_output_dir = os.path.relpath( + job.output_dir, + start=self.output_dir + ) + outputs.append( { 'description': 'Post correction package (.png and .hocr).', - 'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa + 'file': os.path.join( + relative_output_dir, + '{}.poco.zip'.format(job.name) + ), 'mimetype': 'application/zip' } ) - output_files.append( + outputs.append( { 'description': 'PDF file with text layer.', - 'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa + 'file': os.path.join( + relative_output_dir, + '{}.pdf'.format(job.name) + ), 'mimetype': 'application/pdf' } ) - output_files.append( + outputs.append( { 'description': 'Plain text file.', - 'file': os.path.join(relative_output_dir, 
'{}.txt'.format(job.name)), # noqa + 'file': os.path.join( + relative_output_dir, + '{}.txt'.format(job.name) + ), 'mimetype': 'text/plain' } ) - output_files.append( + outputs.append( { 'description': 'TEI compliant XML file.', - 'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa + 'file': os.path.join( + relative_output_dir, + '{}.tei.xml'.format(job.name) + ), 'mimetype': 'application/tei+xml' } ) - with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa - json.dump(output_files, f, indent=4) + with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f: + json.dump(outputs, f, indent=4) def parse_args(): - parser = ArgumentParser(description='Pipeline for PDF file OCR processing') + parser = ArgumentParser( + description='Pipeline for PDF file OCR processing' + ) parser.add_argument( - '-i', '--input-dir', help='Input directory', required=True) + '-i', '--input-dir', + help='Input directory', + required=True + ) parser.add_argument( - '-o', '--output-dir', help='Output directory', required=True) + '-o', '--output-dir', + help='Output directory', + required=True + ) parser.add_argument( - '-l', '--language', - choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata') - if x.endswith('.traineddata') and len(x) > 12], - help='Language of the input (3-character ISO 639-2 language codes)', + '-m', '--model', + choices=[ + x[:-12] for x in os.listdir('/usr/local/share/tessdata') + if x.endswith('.traineddata') and len(x) > 12 + ], + help='Name of the model to be used', required=True ) parser.add_argument( @@ -584,16 +645,19 @@ def parse_args(): help='Add binarization as a preprocessing step' ) parser.add_argument( - '--log-dir', help='Logging directory (Default: --output-dir)') + '--log-dir', + help='Logging directory (Default: --output-dir)' + ) parser.add_argument( '--mem-mb', - help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))', # noqa + 
help='Amount of system memory to be used ' + '(Default: min(--n-cores * 512, available system memory))', type=int ) parser.add_argument( '--n-cores', - default=min(4, multiprocessing.cpu_count()), - help='Number of CPU threads to be used (Default: min(4, CPU count))', + default=1, + help='Number of CPU threads to be used', type=int ) parser.add_argument( @@ -620,10 +684,17 @@ def parse_args(): def main(): args = parse_args() main_workflow = MainWorkflow( - args.input_dir, args.language, args.output_dir, args.binarize) + args.input_dir, + args.model, + args.output_dir, + args.binarize + ) main_workflow.collect_jobs() retval = main_workflow.run( - dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores) + dataDirRoot=args.log_dir, + memMb=args.mem_mb, + nCores=args.n_cores + ) sys.exit(retval) diff --git a/wrapper/ocr b/wrapper/ocr index 58a0bca..d36b129 100755 --- a/wrapper/ocr +++ b/wrapper/ocr @@ -17,7 +17,7 @@ GID = str(os.getgid()) parser = ArgumentParser(add_help=False) parser.add_argument('-i', '--input-dir') parser.add_argument('-o', '--output-dir') -parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+') +parser.add_argument('-t', '--model-file', action='extend', nargs='+') parser.add_argument('--log-dir') args, remaining_args = parser.parse_known_args() @@ -30,9 +30,9 @@ if args.output_dir is not None: mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}' cmd += ['-v', mapping] remaining_args += ['-o', CONTAINER_OUTPUT_DIR] -if args.models is not None: - for model in args.models: - mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}' # noqa +if args.model_file is not None: + for model_file in args.model_file: + mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}' # noqa cmd += ['-v', mapping] if args.log_dir is not None: mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'