Codestyle enhancements

2026-02-12 12:02:03 +00:00 · 2022-01-27 13:40:23 +01:00
parent aeab9b7802
commit 4518ca1c83
5 changed files with 187 additions and 82 deletions
--- a/23
+++ b/23
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3.7
 # coding=utf-8

-""""Convert hOCR to TEI XML."""
+''' Convert hOCR to TEI XML. '''

 from argparse import ArgumentParser
 from lxml import html
@@ -10,8 +10,15 @@ import re


 parser = ArgumentParser(description='Convert hOCR to TEI XML.')
-parser.add_argument('file', help='Input file')
-parser.add_argument('-o', '--output-file', help='Output file', required=True)
+parser.add_argument(
+    '-i', '--input-file',
+    help='Input file'
+)
+parser.add_argument(
+    '-o', '--output-file',
+    help='Output file',
+    required=True
+)
 args = parser.parse_args()


@@ -32,7 +39,7 @@ tei += '    </fileDesc>\n'
 tei += '  </teiHeader>\n'
 tei += '  <text>\n'
 tei += '    <body>\n'
-hocr = html.parse(args.file)
+hocr = html.parse(args.input_file)
 for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
    ocr_page_title_attrib = ocr_page.attrib.get('title')
    facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
@@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
        tei += '      <p>\n'
        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
            tei += '        <lb/>'
-            indent = ''
+            is_first_word_in_line = True
            for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
                if ocrx_word.text is not None:
-                    tei += indent + escape(ocrx_word.text)
-                    indent = ' '
+                    if not is_first_word_in_line:
+                        tei += ' '
+                    tei += escape(ocrx_word.text)
+                    is_first_word_in_line = False
            tei += '\n'
        tei += '      </p>\n'
 tei += '    </body>\n'