mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-13 12:20:35 +00:00
Codestyle enhancements
This commit is contained in:
parent
aeab9b7802
commit
4518ca1c83
22
README.md
22
README.md
@ -26,8 +26,24 @@ This software implements a heavily parallelized pipeline to recognize text in PD
|
|||||||
3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
|
3. Start the pipeline process. Check the pipeline help (`ocr --help`) for more details.
|
||||||
```bash
|
```bash
|
||||||
cd /<my_data_location>
|
cd /<my_data_location>
|
||||||
ocr -i input -o output -m models/<model_name> -l <language_code> <optional_pipeline_arguments>
|
# <model_code> is the model filename without the ".traineddata" suffix
|
||||||
# or
|
ocr \
|
||||||
ocr -i input -o output -m models/* -l <language_code> <optional_pipeline_arguments>
|
--input-dir input \
|
||||||
|
--output-dir output \
|
||||||
|
--model-file models/<model>
|
||||||
|
-m <model_code> <optional_pipeline_arguments>
|
||||||
|
# More than one model
|
||||||
|
ocr \
|
||||||
|
--input-dir input \
|
||||||
|
--output-dir output \
|
||||||
|
--model-file models/<model1>
|
||||||
|
--model-file models/<model2>
|
||||||
|
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
|
||||||
|
# Instead of multiple --model-file statements, you can also use
|
||||||
|
ocr \
|
||||||
|
--input-dir input \
|
||||||
|
--output-dir output \
|
||||||
|
--model-file models/*
|
||||||
|
-m <model1_code>+<model2_code> <optional_pipeline_arguments>
|
||||||
```
|
```
|
||||||
4. Check your results in the `/<my_data_location>/output` directory.
|
4. Check your results in the `/<my_data_location>/output` directory.
|
||||||
|
37
hocr-combine
37
hocr-combine
@ -1,33 +1,42 @@
|
|||||||
#!/usr/bin/env python3.7
|
#!/usr/bin/env python3.7
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
""""Combine multiple hOCR files."""
|
''' Combine multiple hOCR files. '''
|
||||||
|
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
|
||||||
|
|
||||||
parser = ArgumentParser(description='Combine multiple hOCR files.')
|
parser = ArgumentParser(description='Combine multiple hOCR files.')
|
||||||
parser.add_argument('file', help='Input file(s)', nargs='+')
|
parser.add_argument(
|
||||||
parser.add_argument('-o', '--output-file', help='Output file', required=True)
|
'-i', '--input-file',
|
||||||
|
help='Input file',
|
||||||
|
nargs='+',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'-o', '--output-file',
|
||||||
|
help='Output file',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
print(args)
|
||||||
|
|
||||||
|
for input_file in args.input_file:
|
||||||
for file in args.file:
|
input_files = []
|
||||||
files = []
|
if input_file.startswith('@'):
|
||||||
if file.startswith('@'):
|
with open(input_file[1:], 'r') as f:
|
||||||
with open(file[1:], 'r') as f:
|
input_files += [x for x in f.read().split("\n") if x != '']
|
||||||
files += [x for x in f.read().split("\n") if x != '']
|
|
||||||
else:
|
else:
|
||||||
files.append(file)
|
input_files.append(input_file)
|
||||||
if len(files) == 0:
|
if len(input_files) == 0:
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
hocr = html.parse(files[0])
|
hocr = html.parse(input_files[0])
|
||||||
hocr_body = hocr.find('body')
|
hocr_body = hocr.find('body')
|
||||||
for file in files[1:]:
|
for input_file in input_files[1:]:
|
||||||
for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
|
for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
|
||||||
hocr_body.append(ocr_page)
|
hocr_body.append(ocr_page)
|
||||||
|
|
||||||
|
|
||||||
|
23
hocr2tei
23
hocr2tei
@ -1,7 +1,7 @@
|
|||||||
#!/usr/bin/env python3.7
|
#!/usr/bin/env python3.7
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
""""Convert hOCR to TEI XML."""
|
''' Convert hOCR to TEI XML. '''
|
||||||
|
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from lxml import html
|
from lxml import html
|
||||||
@ -10,8 +10,15 @@ import re
|
|||||||
|
|
||||||
|
|
||||||
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
|
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
|
||||||
parser.add_argument('file', help='Input file')
|
parser.add_argument(
|
||||||
parser.add_argument('-o', '--output-file', help='Output file', required=True)
|
'-i', '--input-file',
|
||||||
|
help='Input file'
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'-o', '--output-file',
|
||||||
|
help='Output file',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
@ -32,7 +39,7 @@ tei += ' </fileDesc>\n'
|
|||||||
tei += ' </teiHeader>\n'
|
tei += ' </teiHeader>\n'
|
||||||
tei += ' <text>\n'
|
tei += ' <text>\n'
|
||||||
tei += ' <body>\n'
|
tei += ' <body>\n'
|
||||||
hocr = html.parse(args.file)
|
hocr = html.parse(args.input_file)
|
||||||
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
|
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
|
||||||
ocr_page_title_attrib = ocr_page.attrib.get('title')
|
ocr_page_title_attrib = ocr_page.attrib.get('title')
|
||||||
facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
|
facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
|
||||||
@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
|
|||||||
tei += ' <p>\n'
|
tei += ' <p>\n'
|
||||||
for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
|
for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
|
||||||
tei += ' <lb/>'
|
tei += ' <lb/>'
|
||||||
indent = ''
|
is_first_word_in_line = True
|
||||||
for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
|
for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
|
||||||
if ocrx_word.text is not None:
|
if ocrx_word.text is not None:
|
||||||
tei += indent + escape(ocrx_word.text)
|
if not is_first_word_in_line:
|
||||||
indent = ' '
|
tei += ' '
|
||||||
|
tei += escape(ocrx_word.text)
|
||||||
|
is_first_word_in_line = False
|
||||||
tei += '\n'
|
tei += '\n'
|
||||||
tei += ' </p>\n'
|
tei += ' </p>\n'
|
||||||
tei += ' </body>\n'
|
tei += ' </body>\n'
|
||||||
|
179
ocr
179
ocr
@ -8,7 +8,6 @@ __version__ = '0.1.0'
|
|||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from pyflow import WorkflowRunner
|
from pyflow import WorkflowRunner
|
||||||
import json
|
import json
|
||||||
import multiprocessing
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
@ -52,8 +51,8 @@ class SplitInputWorkflow(WorkflowRunner):
|
|||||||
cmd += ' -dQUIET'
|
cmd += ' -dQUIET'
|
||||||
cmd += ' -r300'
|
cmd += ' -r300'
|
||||||
cmd += ' -sDEVICE=png16m'
|
cmd += ' -sDEVICE=png16m'
|
||||||
cmd += ' -sOutputFile="{}/page-%d.png"'.format(
|
cmd += ' -sOutputFile="{}"'.format(
|
||||||
os.path.join(self.job.tmp_dir, 'images')
|
os.path.join(self.job.tmp_dir, 'images', 'page-%d.png')
|
||||||
)
|
)
|
||||||
cmd += ' "{}"'.format(self.job.file)
|
cmd += ' "{}"'.format(self.job.file)
|
||||||
self.addTask(
|
self.addTask(
|
||||||
@ -82,13 +81,18 @@ class BinarizationWorkflow(WorkflowRunner):
|
|||||||
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
|
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
|
||||||
)
|
)
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'ocropus-nlbin "@{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa
|
cmd += 'ocropus-nlbin "@{}"'.format(
|
||||||
|
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
|
||||||
|
)
|
||||||
cmd += ' --nocheck'
|
cmd += ' --nocheck'
|
||||||
cmd += ' --output "{}"'.format(
|
cmd += ' --output "{}"'.format(
|
||||||
os.path.join(self.job.tmp_dir, 'images'))
|
os.path.join(self.job.tmp_dir, 'images')
|
||||||
|
)
|
||||||
cmd += ' --parallel "{}"'.format(n_cores)
|
cmd += ' --parallel "{}"'.format(n_cores)
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'rm "{}"'.format(os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')) # noqa
|
cmd += 'rm "{}"'.format(
|
||||||
|
os.path.join(self.job.tmp_dir, 'images', 'inputs.txt')
|
||||||
|
)
|
||||||
ocropus_nlbin_task = self.addTask(
|
ocropus_nlbin_task = self.addTask(
|
||||||
'ocropus_nlbin',
|
'ocropus_nlbin',
|
||||||
command=cmd,
|
command=cmd,
|
||||||
@ -130,9 +134,9 @@ class BinarizationWorkflow(WorkflowRunner):
|
|||||||
|
|
||||||
|
|
||||||
class OCRWorkflow(WorkflowRunner):
|
class OCRWorkflow(WorkflowRunner):
|
||||||
def __init__(self, job, lang):
|
def __init__(self, job, model):
|
||||||
self.job = job
|
self.job = job
|
||||||
self.lang = lang
|
self.model = model
|
||||||
|
|
||||||
def workflow(self):
|
def workflow(self):
|
||||||
'''
|
'''
|
||||||
@ -148,7 +152,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
os.path.join(self.job.tmp_dir, 'images', file),
|
os.path.join(self.job.tmp_dir, 'images', file),
|
||||||
os.path.join(self.job.tmp_dir, file[:-4])
|
os.path.join(self.job.tmp_dir, file[:-4])
|
||||||
)
|
)
|
||||||
cmd += ' -l "{}"'.format(self.lang)
|
cmd += ' -l "{}"'.format(self.model)
|
||||||
cmd += ' hocr pdf txt'
|
cmd += ' hocr pdf txt'
|
||||||
cmd += ' || '
|
cmd += ' || '
|
||||||
cmd += 'echo "${?}"'
|
cmd += 'echo "${?}"'
|
||||||
@ -166,6 +170,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
' # move_files #
|
' # move_files #
|
||||||
' ##################################################
|
' ##################################################
|
||||||
'''
|
'''
|
||||||
|
move_files_tasks = []
|
||||||
n_cores = 1
|
n_cores = 1
|
||||||
mem_mb = min(128, self.getMemMb())
|
mem_mb = min(128, self.getMemMb())
|
||||||
for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
|
for i, file_extension in enumerate(['hocr', 'pdf', 'txt']):
|
||||||
@ -174,24 +179,26 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
file_extension,
|
file_extension,
|
||||||
os.path.join(self.job.tmp_dir, file_extension)
|
os.path.join(self.job.tmp_dir, file_extension)
|
||||||
)
|
)
|
||||||
self.addTask(
|
task = self.addTask(
|
||||||
'move_{}_files'.format(file_extension),
|
'move_{}_files'.format(file_extension),
|
||||||
command=cmd,
|
command=cmd,
|
||||||
dependencies=tesseract_tasks,
|
dependencies=tesseract_tasks,
|
||||||
memMb=mem_mb,
|
memMb=mem_mb,
|
||||||
nCores=n_cores
|
nCores=n_cores
|
||||||
)
|
)
|
||||||
|
move_files_tasks.append(task)
|
||||||
cmd = 'mv "{}" "{}"'.format(
|
cmd = 'mv "{}" "{}"'.format(
|
||||||
os.path.join(self.job.tmp_dir, 'images'),
|
os.path.join(self.job.tmp_dir, 'images'),
|
||||||
os.path.join(self.job.output_dir)
|
os.path.join(self.job.output_dir)
|
||||||
)
|
)
|
||||||
self.addTask(
|
task = self.addTask(
|
||||||
'move_image_files',
|
'move_image_files',
|
||||||
command=cmd,
|
command=cmd,
|
||||||
dependencies=tesseract_tasks,
|
dependencies=tesseract_tasks,
|
||||||
memMb=mem_mb,
|
memMb=mem_mb,
|
||||||
nCores=n_cores
|
nCores=n_cores
|
||||||
)
|
)
|
||||||
|
move_files_tasks.append(task)
|
||||||
|
|
||||||
|
|
||||||
class CreateHOCRWorkflow(WorkflowRunner):
|
class CreateHOCRWorkflow(WorkflowRunner):
|
||||||
@ -256,13 +263,14 @@ class CreateHOCRWorkflow(WorkflowRunner):
|
|||||||
' ##################################################
|
' ##################################################
|
||||||
'''
|
'''
|
||||||
n_cores = 1
|
n_cores = 1
|
||||||
mem_mb = min(512, self.getMemMb())
|
mem_mb = min(256, self.getMemMb())
|
||||||
cmd = 'ls -dv "{}/"* > "{}"'.format(
|
cmd = 'ls -dv "{}/"* > "{}"'.format(
|
||||||
os.path.join(self.job.tmp_dir, 'hocr'),
|
os.path.join(self.job.tmp_dir, 'hocr'),
|
||||||
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
|
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
|
||||||
)
|
)
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'hocr-combine "@{}"'.format(
|
cmd += 'hocr-combine'
|
||||||
|
cmd += ' --input-file "@{}"'.format(
|
||||||
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
|
os.path.join(self.job.tmp_dir, 'hocr', 'inputs.txt')
|
||||||
)
|
)
|
||||||
cmd += ' --output-file "{}.hocr"'.format(
|
cmd += ' --output-file "{}.hocr"'.format(
|
||||||
@ -301,12 +309,17 @@ class CreatePDFWorkflow(WorkflowRunner):
|
|||||||
cmd += ' -dPDFSETTINGS=/ebook'
|
cmd += ' -dPDFSETTINGS=/ebook'
|
||||||
cmd += ' -dQUIET'
|
cmd += ' -dQUIET'
|
||||||
cmd += ' -sDEVICE=pdfwrite'
|
cmd += ' -sDEVICE=pdfwrite'
|
||||||
cmd += ' -sOutputFile="{}.pdf"'.format(
|
cmd += ' -sOutputFile="{}"'.format(
|
||||||
os.path.join(self.job.output_dir, self.job.name)
|
os.path.join(self.job.output_dir, '{}.pdf'.format(self.job.name))
|
||||||
)
|
)
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
|
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'pdf'))
|
||||||
self.addTask('pdf_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
|
self.addTask(
|
||||||
|
'pdf_combine',
|
||||||
|
command=cmd,
|
||||||
|
memMb=mem_mb,
|
||||||
|
nCores=n_cores
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class CreateTEIWorkflow(WorkflowRunner):
|
class CreateTEIWorkflow(WorkflowRunner):
|
||||||
@ -320,14 +333,23 @@ class CreateTEIWorkflow(WorkflowRunner):
|
|||||||
' ##################################################
|
' ##################################################
|
||||||
'''
|
'''
|
||||||
n_cores = 1
|
n_cores = 1
|
||||||
mem_mb = min(512, self.getMemMb())
|
mem_mb = min(256, self.getMemMb())
|
||||||
cmd = 'hocr2tei "{}.hocr"'.format(
|
cmd = 'hocr2tei'
|
||||||
os.path.join(self.job.output_dir, self.job.name)
|
cmd += ' --input-file "{}"'.format(
|
||||||
|
os.path.join(self.job.output_dir, '{}.hocr'.format(self.job.name))
|
||||||
)
|
)
|
||||||
cmd += ' --output-file "{}.tei.xml"'.format(
|
cmd += ' --output-file "{}"'.format(
|
||||||
os.path.join(self.job.output_dir, self.job.name)
|
os.path.join(
|
||||||
|
self.job.output_dir,
|
||||||
|
'{}.tei.xml'.format(self.job.name)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.addTask(
|
||||||
|
'hocr2tei',
|
||||||
|
command=cmd,
|
||||||
|
memMb=mem_mb,
|
||||||
|
nCores=n_cores
|
||||||
)
|
)
|
||||||
self.addTask('hocr2tei', command=cmd, memMb=mem_mb, nCores=n_cores)
|
|
||||||
|
|
||||||
|
|
||||||
class CreatePoCoZipWorkflow(WorkflowRunner):
|
class CreatePoCoZipWorkflow(WorkflowRunner):
|
||||||
@ -354,7 +376,12 @@ class CreatePoCoZipWorkflow(WorkflowRunner):
|
|||||||
cmd += 'rm -r images'
|
cmd += 'rm -r images'
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'cd -'
|
cmd += 'cd -'
|
||||||
task = self.addTask('zip', command=cmd, memMb=mem_mb, nCores=n_cores)
|
task = self.addTask(
|
||||||
|
'zip',
|
||||||
|
command=cmd,
|
||||||
|
memMb=mem_mb,
|
||||||
|
nCores=n_cores
|
||||||
|
)
|
||||||
zip_tasks.append(task)
|
zip_tasks.append(task)
|
||||||
|
|
||||||
|
|
||||||
@ -377,13 +404,18 @@ class CreateTxtWorkflow(WorkflowRunner):
|
|||||||
cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name)) # noqa
|
cmd += '"{}.txt"'.format(os.path.join(self.job.output_dir, self.job.name)) # noqa
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
|
cmd += 'rm -r "{}"'.format(os.path.join(self.job.tmp_dir, 'txt'))
|
||||||
self.addTask('txt_combine', command=cmd, memMb=mem_mb, nCores=n_cores)
|
self.addTask(
|
||||||
|
'txt_combine',
|
||||||
|
command=cmd,
|
||||||
|
memMb=mem_mb,
|
||||||
|
nCores=n_cores
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class MainWorkflow(WorkflowRunner):
|
class MainWorkflow(WorkflowRunner):
|
||||||
def __init__(self, input_dir, lang, output_dir, binarize):
|
def __init__(self, input_dir, model, output_dir, binarize):
|
||||||
self.input_dir = input_dir
|
self.input_dir = input_dir
|
||||||
self.lang = lang
|
self.model = model
|
||||||
self.output_dir = output_dir
|
self.output_dir = output_dir
|
||||||
self.binarize = binarize
|
self.binarize = binarize
|
||||||
self.jobs = []
|
self.jobs = []
|
||||||
@ -419,11 +451,13 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
' # split-input #
|
' # split-input #
|
||||||
' ##################################################
|
' ##################################################
|
||||||
'''
|
'''
|
||||||
|
split_input_tasks = []
|
||||||
for i, job in enumerate(self.jobs):
|
for i, job in enumerate(self.jobs):
|
||||||
self.addWorkflowTask(
|
task = self.addWorkflowTask(
|
||||||
'split_input_-_{}'.format(i),
|
'split_input_-_{}'.format(i),
|
||||||
SplitInputWorkflow(job)
|
SplitInputWorkflow(job)
|
||||||
)
|
)
|
||||||
|
split_input_tasks.append(task)
|
||||||
|
|
||||||
if self.binarize:
|
if self.binarize:
|
||||||
'''
|
'''
|
||||||
@ -431,12 +465,14 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
' # binarization #
|
' # binarization #
|
||||||
' ##################################################
|
' ##################################################
|
||||||
'''
|
'''
|
||||||
|
binarization_tasks = []
|
||||||
for i, job in enumerate(self.jobs):
|
for i, job in enumerate(self.jobs):
|
||||||
self.addWorkflowTask(
|
task = self.addWorkflowTask(
|
||||||
'binarization_-_{}'.format(i),
|
'binarization_-_{}'.format(i),
|
||||||
BinarizationWorkflow(job),
|
BinarizationWorkflow(job),
|
||||||
dependencies='split_input_-_{}'.format(i)
|
dependencies='split_input_-_{}'.format(i)
|
||||||
)
|
)
|
||||||
|
binarization_tasks.append(task)
|
||||||
|
|
||||||
'''
|
'''
|
||||||
' ##################################################
|
' ##################################################
|
||||||
@ -451,7 +487,7 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
deps = 'split_input_-_{}'.format(i)
|
deps = 'split_input_-_{}'.format(i)
|
||||||
task = self.addWorkflowTask(
|
task = self.addWorkflowTask(
|
||||||
'ocr_-_{}'.format(i),
|
'ocr_-_{}'.format(i),
|
||||||
OCRWorkflow(job, self.lang),
|
OCRWorkflow(job, self.model),
|
||||||
dependencies=deps
|
dependencies=deps
|
||||||
)
|
)
|
||||||
ocr_tasks.append(task)
|
ocr_tasks.append(task)
|
||||||
@ -527,55 +563,80 @@ class MainWorkflow(WorkflowRunner):
|
|||||||
create_txt_tasks.append(task)
|
create_txt_tasks.append(task)
|
||||||
|
|
||||||
self.waitForTasks()
|
self.waitForTasks()
|
||||||
output_files = []
|
outputs = []
|
||||||
for job in self.jobs:
|
for job in self.jobs:
|
||||||
# Remove temporary directory
|
# Remove temporary directory
|
||||||
os.rmdir(job.tmp_dir)
|
os.rmdir(job.tmp_dir)
|
||||||
# Track output files
|
# Track output files
|
||||||
relative_output_dir = os.path.relpath(job.output_dir, start=self.output_dir) # noqa
|
relative_output_dir = os.path.relpath(
|
||||||
output_files.append(
|
job.output_dir,
|
||||||
|
start=self.output_dir
|
||||||
|
)
|
||||||
|
outputs.append(
|
||||||
{
|
{
|
||||||
'description': 'Post correction package (.png and .hocr).',
|
'description': 'Post correction package (.png and .hocr).',
|
||||||
'file': os.path.join(relative_output_dir, '{}.poco.zip'.format(job.name)), # noqa
|
'file': os.path.join(
|
||||||
|
relative_output_dir,
|
||||||
|
'{}.poco.zip'.format(job.name)
|
||||||
|
),
|
||||||
'mimetype': 'application/zip'
|
'mimetype': 'application/zip'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
output_files.append(
|
outputs.append(
|
||||||
{
|
{
|
||||||
'description': 'PDF file with text layer.',
|
'description': 'PDF file with text layer.',
|
||||||
'file': os.path.join(relative_output_dir, '{}.pdf'.format(job.name)), # noqa
|
'file': os.path.join(
|
||||||
|
relative_output_dir,
|
||||||
|
'{}.pdf'.format(job.name)
|
||||||
|
),
|
||||||
'mimetype': 'application/pdf'
|
'mimetype': 'application/pdf'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
output_files.append(
|
outputs.append(
|
||||||
{
|
{
|
||||||
'description': 'Plain text file.',
|
'description': 'Plain text file.',
|
||||||
'file': os.path.join(relative_output_dir, '{}.txt'.format(job.name)), # noqa
|
'file': os.path.join(
|
||||||
|
relative_output_dir,
|
||||||
|
'{}.txt'.format(job.name)
|
||||||
|
),
|
||||||
'mimetype': 'text/plain'
|
'mimetype': 'text/plain'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
output_files.append(
|
outputs.append(
|
||||||
{
|
{
|
||||||
'description': 'TEI compliant XML file.',
|
'description': 'TEI compliant XML file.',
|
||||||
'file': os.path.join(relative_output_dir, '{}.tei.xml'.format(job.name)), # noqa
|
'file': os.path.join(
|
||||||
|
relative_output_dir,
|
||||||
|
'{}.tei.xml'.format(job.name)
|
||||||
|
),
|
||||||
'mimetype': 'application/tei+xml'
|
'mimetype': 'application/tei+xml'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
with open(os.path.join(self.output_dir, 'output_records.json'), 'w') as f: # noqa
|
with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f:
|
||||||
json.dump(output_files, f, indent=4)
|
json.dump(outputs, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = ArgumentParser(description='Pipeline for PDF file OCR processing')
|
parser = ArgumentParser(
|
||||||
|
description='Pipeline for PDF file OCR processing'
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-i', '--input-dir', help='Input directory', required=True)
|
'-i', '--input-dir',
|
||||||
|
help='Input directory',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-o', '--output-dir', help='Output directory', required=True)
|
'-o', '--output-dir',
|
||||||
|
help='Output directory',
|
||||||
|
required=True
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-l', '--language',
|
'-m', '--model',
|
||||||
choices=[x[:-12] for x in os.listdir('/usr/local/share/tessdata')
|
choices=[
|
||||||
if x.endswith('.traineddata') and len(x) > 12],
|
x[:-12] for x in os.listdir('/usr/local/share/tessdata')
|
||||||
help='Language of the input (3-character ISO 639-2 language codes)',
|
if x.endswith('.traineddata') and len(x) > 12
|
||||||
|
],
|
||||||
|
help='Name of the model to be used',
|
||||||
required=True
|
required=True
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -584,16 +645,19 @@ def parse_args():
|
|||||||
help='Add binarization as a preprocessing step'
|
help='Add binarization as a preprocessing step'
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--log-dir', help='Logging directory (Default: --output-dir)')
|
'--log-dir',
|
||||||
|
help='Logging directory (Default: --output-dir)'
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--mem-mb',
|
'--mem-mb',
|
||||||
help='Amount of system memory to be used (Default: min(--n-cores * 512, available system memory))', # noqa
|
help='Amount of system memory to be used '
|
||||||
|
'(Default: min(--n-cores * 512, available system memory))',
|
||||||
type=int
|
type=int
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--n-cores',
|
'--n-cores',
|
||||||
default=min(4, multiprocessing.cpu_count()),
|
default=1,
|
||||||
help='Number of CPU threads to be used (Default: min(4, CPU count))',
|
help='Number of CPU threads to be used',
|
||||||
type=int
|
type=int
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -620,10 +684,17 @@ def parse_args():
|
|||||||
def main():
|
def main():
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
main_workflow = MainWorkflow(
|
main_workflow = MainWorkflow(
|
||||||
args.input_dir, args.language, args.output_dir, args.binarize)
|
args.input_dir,
|
||||||
|
args.model,
|
||||||
|
args.output_dir,
|
||||||
|
args.binarize
|
||||||
|
)
|
||||||
main_workflow.collect_jobs()
|
main_workflow.collect_jobs()
|
||||||
retval = main_workflow.run(
|
retval = main_workflow.run(
|
||||||
dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)
|
dataDirRoot=args.log_dir,
|
||||||
|
memMb=args.mem_mb,
|
||||||
|
nCores=args.n_cores
|
||||||
|
)
|
||||||
sys.exit(retval)
|
sys.exit(retval)
|
||||||
|
|
||||||
|
|
||||||
|
@ -17,7 +17,7 @@ GID = str(os.getgid())
|
|||||||
parser = ArgumentParser(add_help=False)
|
parser = ArgumentParser(add_help=False)
|
||||||
parser.add_argument('-i', '--input-dir')
|
parser.add_argument('-i', '--input-dir')
|
||||||
parser.add_argument('-o', '--output-dir')
|
parser.add_argument('-o', '--output-dir')
|
||||||
parser.add_argument('-m', '--model', action='extend', dest='models', nargs='+')
|
parser.add_argument('-t', '--model-file', action='extend', nargs='+')
|
||||||
parser.add_argument('--log-dir')
|
parser.add_argument('--log-dir')
|
||||||
args, remaining_args = parser.parse_known_args()
|
args, remaining_args = parser.parse_known_args()
|
||||||
|
|
||||||
@ -30,9 +30,9 @@ if args.output_dir is not None:
|
|||||||
mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
|
mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}'
|
||||||
cmd += ['-v', mapping]
|
cmd += ['-v', mapping]
|
||||||
remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
|
remaining_args += ['-o', CONTAINER_OUTPUT_DIR]
|
||||||
if args.models is not None:
|
if args.model_file is not None:
|
||||||
for model in args.models:
|
for model_file in args.model_file:
|
||||||
mapping = f'{os.path.abspath(model)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model)}' # noqa
|
mapping = f'{os.path.abspath(model_file)}:{CONTAINER_MODELS_DIR}/{os.path.basename(model_file)}' # noqa
|
||||||
cmd += ['-v', mapping]
|
cmd += ['-v', mapping]
|
||||||
if args.log_dir is not None:
|
if args.log_dir is not None:
|
||||||
mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
|
mapping = '{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user