Codestyle enhancements

This commit is contained in:
Patrick Jentsch
2022-01-27 13:40:23 +01:00
parent aeab9b7802
commit 4518ca1c83
5 changed files with 187 additions and 82 deletions

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python3.7
# coding=utf-8
""""Convert hOCR to TEI XML."""
''' Convert hOCR to TEI XML. '''
from argparse import ArgumentParser
from lxml import html
@ -10,8 +10,15 @@ import re
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument('file', help='Input file')
parser.add_argument('-o', '--output-file', help='Output file', required=True)
parser.add_argument(
'-i', '--input-file',
help='Input file'
)
parser.add_argument(
'-o', '--output-file',
help='Output file',
required=True
)
args = parser.parse_args()
@ -32,7 +39,7 @@ tei += ' </fileDesc>\n'
tei += ' </teiHeader>\n'
tei += ' <text>\n'
tei += ' <body>\n'
hocr = html.parse(args.file)
hocr = html.parse(args.input_file)
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
ocr_page_title_attrib = ocr_page.attrib.get('title')
facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
tei += ' <p>\n'
for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
tei += ' <lb/>'
indent = ''
is_first_word_in_line = True
for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
if ocrx_word.text is not None:
tei += indent + escape(ocrx_word.text)
indent = ' '
if not is_first_word_in_line:
tei += ' '
tei += escape(ocrx_word.text)
is_first_word_in_line = False
tei += '\n'
tei += ' </p>\n'
tei += ' </body>\n'