From e1462152fe437c1b8bf5dd4c582d125c71ce8fb0 Mon Sep 17 00:00:00 2001 From: Patrick Jentsch Date: Mon, 20 May 2019 11:10:40 +0200 Subject: [PATCH] Codestyle --- hocrtotei | 12 +++++++----- ocr | 18 +++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/hocrtotei b/hocrtotei index efdf179..96a4045 100755 --- a/hocrtotei +++ b/hocrtotei @@ -5,7 +5,9 @@ from xml.sax.saxutils import escape import argparse import xml.etree.ElementTree as ET -parser = argparse.ArgumentParser(description='hocrtotei merges several hOCR files in order of their occurrence on command line to one TEI result file.') +parser = argparse.ArgumentParser( + description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.' +) parser.add_argument( 'i', metavar='hOCR-sourcefile', @@ -17,7 +19,7 @@ parser.add_argument( ) args = parser.parse_args() -output_file = open(args.o, "w") +output_file = open(args.o, 'w') output_file.write( '\n' @@ -37,11 +39,11 @@ output_file.write( for index, input_file in enumerate(args.i): tree = ET.parse(input_file) output_file.write(' \n' % (index + 1)) - for para in tree.findall(".//*[@class='ocr_par']"): + for para in tree.findall('.//*[@class="ocr_par"]'): output_file.write('

\n') - for line in para.findall(".//*[@class='ocr_line']"): + for line in para.findall('.//*[@class="ocr_line"]'): first_word_in_line = True - for word in line.findall(".//*[@class='ocrx_word']"): + for word in line.findall('.//*[@class="ocrx_word"]'): if word.text is not None: output_file.write((' ' if first_word_in_line else ' ') + escape(word.text.strip())) first_word_in_line = False diff --git a/ocr b/ocr index 5783263..ffe63aa 100755 --- a/ocr +++ b/ocr @@ -21,7 +21,7 @@ from pyflow import WorkflowRunner ''' TODO: ' Implement --end-page: Last page to ocr ' Implement --memMb: Total amount of memory (RAM) available for this workflow. -' Default: 2048 * nCores +' Default: 2048 * n_cores ' Implement --rotate: Rotate pages from input (90, 180, 270) ' Implement --split-pages: Split pages in half after possible rotation ' Implement --start-page: First page to ocr @@ -123,7 +123,7 @@ class OCRWorkflow(WorkflowRunner): ' ################################################## ''' split_jobs = [] - split_job_nCores = min( + split_job_n_cores = min( self.n_cores, max(1, int(self.n_cores / len(self.jobs))) ) @@ -148,7 +148,7 @@ class OCRWorkflow(WorkflowRunner): command=cmd, dependencies='create_output_directories_job_-_%i' % (index), label='split_job_-_%i' % (index), - nCores=split_job_nCores + nCores=split_job_n_cores ) ) @@ -170,7 +170,7 @@ class OCRWorkflow(WorkflowRunner): ' four cores available for this workflow, the available core ' number. ''' - binarisation_job_nCores = min(4, self.n_cores) + binarisation_job_n_cores = min(4, self.n_cores) for index, job in enumerate(self.jobs): files = os.listdir(os.path.join(job['output_dir'], 'tmp')) files = filter(lambda x: x.endswith('.tif'), files) @@ -181,7 +181,7 @@ class OCRWorkflow(WorkflowRunner): ) cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % ( os.path.join(job['output_dir'], 'tmp'), - binarisation_job_nCores, + binarisation_job_n_cores, ' '.join(files) ) binarisation_jobs.append( @@ -189,7 +189,7 @@ class OCRWorkflow(WorkflowRunner): command=cmd, dependencies='split_job_-_%i' % (index), label='binarisation_job_-_%i' % (index), - nCores=binarisation_job_nCores + nCores=binarisation_job_n_cores ) ) @@ -249,13 +249,13 @@ class OCRWorkflow(WorkflowRunner): ' or, if there are less then four cores available for this workflow, ' the available core number. ''' - ocr_job_nCores = min(4, self.n_cores) + ocr_job_n_cores = min(4, self.n_cores) ''' ' WORKAROUND: Tesseract only uses one core for the deu_frak language ' model, so the workflow will also only reserve one in this case. ''' if self.lang == "deu_frak": - ocr_job_nCores = 1 + ocr_job_n_cores = 1 for index, job in enumerate(self.jobs): files = os.listdir(os.path.join(job['output_dir'], 'tmp')) if self.skip_binarisation: @@ -293,7 +293,7 @@ class OCRWorkflow(WorkflowRunner): command=cmd, dependencies=ocr_job_dependencies, label='ocr_job_-_%i-%i' % (index, number), - nCores=ocr_job_nCores + nCores=ocr_job_n_cores ) ) number += 1