mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-26 05:04:17 +00:00
Codestyle
This commit is contained in:
parent
93de923b4e
commit
e1462152fe
12
hocrtotei
12
hocrtotei
@ -5,7 +5,9 @@ from xml.sax.saxutils import escape
|
||||
import argparse
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
parser = argparse.ArgumentParser(description='hocrtotei merges several hOCR files in order of their occurrence on command line to one TEI result file.')
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'i',
|
||||
metavar='hOCR-sourcefile',
|
||||
@ -17,7 +19,7 @@ parser.add_argument(
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
output_file = open(args.o, "w")
|
||||
output_file = open(args.o, 'w')
|
||||
|
||||
output_file.write(
|
||||
'<?xml version="1.0" encoding="UTF-8"?>\n'
|
||||
@ -37,11 +39,11 @@ output_file.write(
|
||||
for index, input_file in enumerate(args.i):
|
||||
tree = ET.parse(input_file)
|
||||
output_file.write(' <pb n="%i"/>\n' % (index + 1))
|
||||
for para in tree.findall(".//*[@class='ocr_par']"):
|
||||
for para in tree.findall('.//*[@class="ocr_par"]'):
|
||||
output_file.write(' <p>\n')
|
||||
for line in para.findall(".//*[@class='ocr_line']"):
|
||||
for line in para.findall('.//*[@class="ocr_line"]'):
|
||||
first_word_in_line = True
|
||||
for word in line.findall(".//*[@class='ocrx_word']"):
|
||||
for word in line.findall('.//*[@class="ocrx_word"]'):
|
||||
if word.text is not None:
|
||||
output_file.write((' ' if first_word_in_line else ' ') + escape(word.text.strip()))
|
||||
first_word_in_line = False
|
||||
|
18
ocr
18
ocr
@ -21,7 +21,7 @@ from pyflow import WorkflowRunner
|
||||
''' TODO:
|
||||
' Implement --end-page: Last page to ocr
|
||||
' Implement --memMb: Total amount of memory (RAM) available for this workflow.
|
||||
' Default: 2048 * nCores
|
||||
' Default: 2048 * n_cores
|
||||
' Implement --rotate: Rotate pages from input (90, 180, 270)
|
||||
' Implement --split-pages: Split pages in half after possible rotation
|
||||
' Implement --start-page: First page to ocr
|
||||
@ -123,7 +123,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
' ##################################################
|
||||
'''
|
||||
split_jobs = []
|
||||
split_job_nCores = min(
|
||||
split_job_n_cores = min(
|
||||
self.n_cores,
|
||||
max(1, int(self.n_cores / len(self.jobs)))
|
||||
)
|
||||
@ -148,7 +148,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
command=cmd,
|
||||
dependencies='create_output_directories_job_-_%i' % (index),
|
||||
label='split_job_-_%i' % (index),
|
||||
nCores=split_job_nCores
|
||||
nCores=split_job_n_cores
|
||||
)
|
||||
)
|
||||
|
||||
@ -170,7 +170,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
' four cores available for this workflow, the available core
|
||||
' number.
|
||||
'''
|
||||
binarisation_job_nCores = min(4, self.n_cores)
|
||||
binarisation_job_n_cores = min(4, self.n_cores)
|
||||
for index, job in enumerate(self.jobs):
|
||||
files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
|
||||
files = filter(lambda x: x.endswith('.tif'), files)
|
||||
@ -181,7 +181,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
)
|
||||
cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % (
|
||||
os.path.join(job['output_dir'], 'tmp'),
|
||||
binarisation_job_nCores,
|
||||
binarisation_job_n_cores,
|
||||
' '.join(files)
|
||||
)
|
||||
binarisation_jobs.append(
|
||||
@ -189,7 +189,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
command=cmd,
|
||||
dependencies='split_job_-_%i' % (index),
|
||||
label='binarisation_job_-_%i' % (index),
|
||||
nCores=binarisation_job_nCores
|
||||
nCores=binarisation_job_n_cores
|
||||
)
|
||||
)
|
||||
|
||||
@ -249,13 +249,13 @@ class OCRWorkflow(WorkflowRunner):
|
||||
' or, if there are fewer than four cores available for this workflow,
|
||||
' the available core number.
|
||||
'''
|
||||
ocr_job_nCores = min(4, self.n_cores)
|
||||
ocr_job_n_cores = min(4, self.n_cores)
|
||||
'''
|
||||
' WORKAROUND: Tesseract only uses one core for the deu_frak language
|
||||
' model, so the workflow will also only reserve one in this case.
|
||||
'''
|
||||
if self.lang == "deu_frak":
|
||||
ocr_job_nCores = 1
|
||||
ocr_job_n_cores = 1
|
||||
for index, job in enumerate(self.jobs):
|
||||
files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
|
||||
if self.skip_binarisation:
|
||||
@ -293,7 +293,7 @@ class OCRWorkflow(WorkflowRunner):
|
||||
command=cmd,
|
||||
dependencies=ocr_job_dependencies,
|
||||
label='ocr_job_-_%i-%i' % (index, number),
|
||||
nCores=ocr_job_nCores
|
||||
nCores=ocr_job_n_cores
|
||||
)
|
||||
)
|
||||
number += 1
|
||||
|
Loading…
Reference in New Issue
Block a user