mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-27 07:34:18 +00:00
Codestyle
This commit is contained in:
parent
93de923b4e
commit
e1462152fe
12
hocrtotei
12
hocrtotei
@ -5,7 +5,9 @@ from xml.sax.saxutils import escape
|
|||||||
import argparse
|
import argparse
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='hocrtotei merges several hOCR files in order of their occurrence on command line to one TEI result file.')
|
parser = argparse.ArgumentParser(
|
||||||
|
description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'i',
|
'i',
|
||||||
metavar='hOCR-sourcefile',
|
metavar='hOCR-sourcefile',
|
||||||
@ -17,7 +19,7 @@ parser.add_argument(
|
|||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
output_file = open(args.o, "w")
|
output_file = open(args.o, 'w')
|
||||||
|
|
||||||
output_file.write(
|
output_file.write(
|
||||||
'<?xml version="1.0" encoding="UTF-8"?>\n'
|
'<?xml version="1.0" encoding="UTF-8"?>\n'
|
||||||
@ -37,11 +39,11 @@ output_file.write(
|
|||||||
for index, input_file in enumerate(args.i):
|
for index, input_file in enumerate(args.i):
|
||||||
tree = ET.parse(input_file)
|
tree = ET.parse(input_file)
|
||||||
output_file.write(' <pb n="%i"/>\n' % (index + 1))
|
output_file.write(' <pb n="%i"/>\n' % (index + 1))
|
||||||
for para in tree.findall(".//*[@class='ocr_par']"):
|
for para in tree.findall('.//*[@class="ocr_par"]'):
|
||||||
output_file.write(' <p>\n')
|
output_file.write(' <p>\n')
|
||||||
for line in para.findall(".//*[@class='ocr_line']"):
|
for line in para.findall('.//*[@class="ocr_line"]'):
|
||||||
first_word_in_line = True
|
first_word_in_line = True
|
||||||
for word in line.findall(".//*[@class='ocrx_word']"):
|
for word in line.findall('.//*[@class="ocrx_word"]'):
|
||||||
if word.text is not None:
|
if word.text is not None:
|
||||||
output_file.write((' ' if first_word_in_line else ' ') + escape(word.text.strip()))
|
output_file.write((' ' if first_word_in_line else ' ') + escape(word.text.strip()))
|
||||||
first_word_in_line = False
|
first_word_in_line = False
|
||||||
|
18
ocr
18
ocr
@ -21,7 +21,7 @@ from pyflow import WorkflowRunner
|
|||||||
''' TODO:
|
''' TODO:
|
||||||
' Implement --end-page: Last page to ocr
|
' Implement --end-page: Last page to ocr
|
||||||
' Implement --memMb: Total amount of memory (RAM) available for this workflow.
|
' Implement --memMb: Total amount of memory (RAM) available for this workflow.
|
||||||
' Default: 2048 * nCores
|
' Default: 2048 * n_cores
|
||||||
' Implement --rotate: Rotate pages from input (90, 180, 270)
|
' Implement --rotate: Rotate pages from input (90, 180, 270)
|
||||||
' Implement --split-pages: Split pages in half after possible rotation
|
' Implement --split-pages: Split pages in half after possible rotation
|
||||||
' Implement --start-page: First page to ocr
|
' Implement --start-page: First page to ocr
|
||||||
@ -123,7 +123,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
' ##################################################
|
' ##################################################
|
||||||
'''
|
'''
|
||||||
split_jobs = []
|
split_jobs = []
|
||||||
split_job_nCores = min(
|
split_job_n_cores = min(
|
||||||
self.n_cores,
|
self.n_cores,
|
||||||
max(1, int(self.n_cores / len(self.jobs)))
|
max(1, int(self.n_cores / len(self.jobs)))
|
||||||
)
|
)
|
||||||
@ -148,7 +148,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
command=cmd,
|
command=cmd,
|
||||||
dependencies='create_output_directories_job_-_%i' % (index),
|
dependencies='create_output_directories_job_-_%i' % (index),
|
||||||
label='split_job_-_%i' % (index),
|
label='split_job_-_%i' % (index),
|
||||||
nCores=split_job_nCores
|
nCores=split_job_n_cores
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -170,7 +170,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
' four cores available for this workflow, the available core
|
' four cores available for this workflow, the available core
|
||||||
' number.
|
' number.
|
||||||
'''
|
'''
|
||||||
binarisation_job_nCores = min(4, self.n_cores)
|
binarisation_job_n_cores = min(4, self.n_cores)
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
|
files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
|
||||||
files = filter(lambda x: x.endswith('.tif'), files)
|
files = filter(lambda x: x.endswith('.tif'), files)
|
||||||
@ -181,7 +181,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
)
|
)
|
||||||
cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % (
|
cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % (
|
||||||
os.path.join(job['output_dir'], 'tmp'),
|
os.path.join(job['output_dir'], 'tmp'),
|
||||||
binarisation_job_nCores,
|
binarisation_job_n_cores,
|
||||||
' '.join(files)
|
' '.join(files)
|
||||||
)
|
)
|
||||||
binarisation_jobs.append(
|
binarisation_jobs.append(
|
||||||
@ -189,7 +189,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
command=cmd,
|
command=cmd,
|
||||||
dependencies='split_job_-_%i' % (index),
|
dependencies='split_job_-_%i' % (index),
|
||||||
label='binarisation_job_-_%i' % (index),
|
label='binarisation_job_-_%i' % (index),
|
||||||
nCores=binarisation_job_nCores
|
nCores=binarisation_job_n_cores
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -249,13 +249,13 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
' or, if there are fewer than four cores available for this workflow,
|
' or, if there are fewer than four cores available for this workflow,
|
||||||
' the available core number.
|
' the available core number.
|
||||||
'''
|
'''
|
||||||
ocr_job_nCores = min(4, self.n_cores)
|
ocr_job_n_cores = min(4, self.n_cores)
|
||||||
'''
|
'''
|
||||||
' WORKAROUND: Tesseract only uses one core for the deu_frak language
|
' WORKAROUND: Tesseract only uses one core for the deu_frak language
|
||||||
' model, so the workflow will also only reserve one in this case.
|
' model, so the workflow will also only reserve one in this case.
|
||||||
'''
|
'''
|
||||||
if self.lang == "deu_frak":
|
if self.lang == "deu_frak":
|
||||||
ocr_job_nCores = 1
|
ocr_job_n_cores = 1
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
|
files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
|
||||||
if self.skip_binarisation:
|
if self.skip_binarisation:
|
||||||
@ -293,7 +293,7 @@ class OCRWorkflow(WorkflowRunner):
|
|||||||
command=cmd,
|
command=cmd,
|
||||||
dependencies=ocr_job_dependencies,
|
dependencies=ocr_job_dependencies,
|
||||||
label='ocr_job_-_%i-%i' % (index, number),
|
label='ocr_job_-_%i-%i' % (index, number),
|
||||||
nCores=ocr_job_nCores
|
nCores=ocr_job_n_cores
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
number += 1
|
number += 1
|
||||||
|
Loading…
Reference in New Issue
Block a user