Codestyle

2026-08-02 13:48:04 +00:00 · 2019-05-20 11:10:40 +02:00
parent 93de923b4e
commit e1462152fe
2 changed files with 16 additions and 14 deletions
@@ -5,7 +5,9 @@ from xml.sax.saxutils import escape
 import argparse
 import xml.etree.ElementTree as ET

-parser = argparse.ArgumentParser(description='hocrtotei merges several hOCR files in order of their occurrence on command line to one TEI result file.')
+parser = argparse.ArgumentParser(
+    description='Merges several hOCR files in order of their occurrence on command line to one TEI result file.'
+)
 parser.add_argument(
    'i',
    metavar='hOCR-sourcefile',
@@ -17,7 +19,7 @@ parser.add_argument(
 )
 args = parser.parse_args()

-output_file = open(args.o, "w")
+output_file = open(args.o, 'w')

 output_file.write(
      '<?xml version="1.0" encoding="UTF-8"?>\n'
@@ -37,11 +39,11 @@ output_file.write(
 for index, input_file in enumerate(args.i):
    tree = ET.parse(input_file)
    output_file.write('            <pb n="%i"/>\n' % (index + 1))
-    for para in tree.findall(".//*[@class='ocr_par']"):
+    for para in tree.findall('.//*[@class="ocr_par"]'):
        output_file.write('            <p>\n')
-        for line in para.findall(".//*[@class='ocr_line']"):
+        for line in para.findall('.//*[@class="ocr_line"]'):
            first_word_in_line = True
-            for word in line.findall(".//*[@class='ocrx_word']"):
+            for word in line.findall('.//*[@class="ocrx_word"]'):
                if word.text is not None:
                    output_file.write(('                ' if first_word_in_line else ' ') + escape(word.text.strip()))
                    first_word_in_line = False
@@ -21,7 +21,7 @@ from pyflow import WorkflowRunner
 ''' TODO:
 ' Implement --end-page: Last page to ocr
 ' Implement --memMb: Total amount of memory (RAM) available for this workflow.
-'                    Default: 2048 * nCores
+'                    Default: 2048 * n_cores
 ' Implement --rotate: Rotate pages from input (90, 180, 270)
 ' Implement --split-pages: Split pages in half after possible rotation
 ' Implement --start-page: First page to ocr
@@ -123,7 +123,7 @@ class OCRWorkflow(WorkflowRunner):
        ' ##################################################
        '''
        split_jobs = []
-        split_job_nCores = min(
+        split_job_n_cores = min(
            self.n_cores,
            max(1, int(self.n_cores / len(self.jobs)))
        )
@@ -148,7 +148,7 @@ class OCRWorkflow(WorkflowRunner):
                    command=cmd,
                    dependencies='create_output_directories_job_-_%i' % (index),
                    label='split_job_-_%i' % (index),
-                    nCores=split_job_nCores
+                    nCores=split_job_n_cores
                )
            )

@@ -170,7 +170,7 @@ class OCRWorkflow(WorkflowRunner):
            ' four cores available for this workflow, the available core
            ' number.
            '''
-            binarisation_job_nCores = min(4, self.n_cores)
+            binarisation_job_n_cores = min(4, self.n_cores)
            for index, job in enumerate(self.jobs):
                files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
                files = filter(lambda x: x.endswith('.tif'), files)
@@ -181,7 +181,7 @@ class OCRWorkflow(WorkflowRunner):
                )
                cmd = 'ocropus-nlbin --output "%s" --parallel "%i" %s' % (
                    os.path.join(job['output_dir'], 'tmp'),
-                    binarisation_job_nCores,
+                    binarisation_job_n_cores,
                    ' '.join(files)
                )
                binarisation_jobs.append(
@@ -189,7 +189,7 @@ class OCRWorkflow(WorkflowRunner):
                        command=cmd,
                        dependencies='split_job_-_%i' % (index),
                        label='binarisation_job_-_%i' % (index),
-                        nCores=binarisation_job_nCores
+                        nCores=binarisation_job_n_cores
                    )
                )

@@ -249,13 +249,13 @@ class OCRWorkflow(WorkflowRunner):
        ' or, if there are fewer than four cores available for this workflow,
        ' the available core number.
        '''
-        ocr_job_nCores = min(4, self.n_cores)
+        ocr_job_n_cores = min(4, self.n_cores)
        '''
        ' WORKAROUND: Tesseract only uses one core for the deu_frak language
        ' model, so the workflow will also only reserve one in this case.
        '''
        if self.lang == "deu_frak":
-            ocr_job_nCores = 1
+            ocr_job_n_cores = 1
        for index, job in enumerate(self.jobs):
            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
            if self.skip_binarisation:
@@ -293,7 +293,7 @@ class OCRWorkflow(WorkflowRunner):
                        command=cmd,
                        dependencies=ocr_job_dependencies,
                        label='ocr_job_-_%i-%i' % (index, number),
-                        nCores=ocr_job_nCores
+                        nCores=ocr_job_n_cores
                    )
                )
                number += 1