diff --git a/hocrtotei b/hocrtotei
index efdf179..96a4045 100755
--- a/hocrtotei
+++ b/hocrtotei
@@ -5,7 +5,9 @@ from xml.sax.saxutils import escape
import argparse
import xml.etree.ElementTree as ET
-parser = argparse.ArgumentParser(description='hocrtotei merges several hOCR files in order of their occurrence on command line to one TEI result file.')
+parser = argparse.ArgumentParser(
+ description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
+)
parser.add_argument(
'i',
metavar='hOCR-sourcefile',
@@ -17,7 +19,7 @@ parser.add_argument(
)
args = parser.parse_args()
-output_file = open(args.o, "w")
+output_file = open(args.o, 'w')
output_file.write(
'\n'
@@ -37,11 +39,11 @@ output_file.write(
for index, input_file in enumerate(args.i):
tree = ET.parse(input_file)
output_file.write('
\n') - for line in para.findall(".//*[@class='ocr_line']"): + for line in para.findall('.//*[@class="ocr_line"]'): first_word_in_line = True - for word in line.findall(".//*[@class='ocrx_word']"): + for word in line.findall('.//*[@class="ocrx_word"]'): if word.text is not None: output_file.write((' ' if first_word_in_line else ' ') + escape(word.text.strip())) first_word_in_line = False diff --git a/ocr b/ocr index 5783263..ffe63aa 100755 --- a/ocr +++ b/ocr @@ -21,7 +21,7 @@ from pyflow import WorkflowRunner ''' TODO: ' Implement --end-page: Last page to ocr ' Implement --memMb: Total amount of memory (RAM) available for this workflow. -' Default: 2048 * nCores +' Default: 2048 * n_cores ' Implement --rotate: Rotate pages from input (90, 180, 270) ' Implement --split-pages: Split pages in half after possible rotation ' Implement --start-page: First page to ocr @@ -123,7 +123,7 @@ class OCRWorkflow(WorkflowRunner): ' ################################################## ''' split_jobs = [] - split_job_nCores = min( + split_job_n_cores = min( self.n_cores, max(1, int(self.n_cores / len(self.jobs))) ) @@ -148,7 +148,7 @@ class OCRWorkflow(WorkflowRunner): command=cmd, dependencies='create_output_directories_job_-_%i' % (index), label='split_job_-_%i' % (index), - nCores=split_job_nCores + nCores=split_job_n_cores ) ) @@ -170,7 +170,7 @@ class OCRWorkflow(WorkflowRunner): ' four cores available for this workflow, the available core ' number. ''' - binarisation_job_nCores = min(4, self.n_cores) + binarisation_job_n_cores = min(4, self.n_cores) for index, job in enumerate(self.jobs): files = os.listdir(os.path.join(job['output_dir'], 'tmp')) files = filter(lambda x: x.endswith('.tif'), files) @@ -181,7 +181,7 @@ class OCRWorkflow(WorkflowRunner): ) cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % ( os.path.join(job['output_dir'], 'tmp'), - binarisation_job_nCores, + binarisation_job_n_cores, ' '.join(files) ) binarisation_jobs.append( @@ -189,7 +189,7 @@ class OCRWorkflow(WorkflowRunner): command=cmd, dependencies='split_job_-_%i' % (index), label='binarisation_job_-_%i' % (index), - nCores=binarisation_job_nCores + nCores=binarisation_job_n_cores ) ) @@ -249,13 +249,13 @@ class OCRWorkflow(WorkflowRunner): ' or, if there are less then four cores available for this workflow, ' the available core number. ''' - ocr_job_nCores = min(4, self.n_cores) + ocr_job_n_cores = min(4, self.n_cores) ''' ' WORKAROUND: Tesseract only uses one core for the deu_frak language ' model, so the workflow will also only reserve one in this case. ''' if self.lang == "deu_frak": - ocr_job_nCores = 1 + ocr_job_n_cores = 1 for index, job in enumerate(self.jobs): files = os.listdir(os.path.join(job['output_dir'], 'tmp')) if self.skip_binarisation: @@ -293,7 +293,7 @@ class OCRWorkflow(WorkflowRunner): command=cmd, dependencies=ocr_job_dependencies, label='ocr_job_-_%i-%i' % (index, number), - nCores=ocr_job_nCores + nCores=ocr_job_n_cores ) ) number += 1