mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-07-01 17:20:33 +00:00
Codestyle enhancements
This commit is contained in:
23
hocr2tei
23
hocr2tei
@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3.7
|
||||
# coding=utf-8
|
||||
|
||||
""""Convert hOCR to TEI XML."""
|
||||
''' Convert hOCR to TEI XML. '''
|
||||
|
||||
from argparse import ArgumentParser
|
||||
from lxml import html
|
||||
@ -10,8 +10,15 @@ import re
|
||||
|
||||
|
||||
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
|
||||
parser.add_argument('file', help='Input file')
|
||||
parser.add_argument('-o', '--output-file', help='Output file', required=True)
|
||||
parser.add_argument(
|
||||
'-i', '--input-file',
|
||||
help='Input file'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-o', '--output-file',
|
||||
help='Output file',
|
||||
required=True
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@ -32,7 +39,7 @@ tei += ' </fileDesc>\n'
|
||||
tei += ' </teiHeader>\n'
|
||||
tei += ' <text>\n'
|
||||
tei += ' <body>\n'
|
||||
hocr = html.parse(args.file)
|
||||
hocr = html.parse(args.input_file)
|
||||
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
|
||||
ocr_page_title_attrib = ocr_page.attrib.get('title')
|
||||
facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)
|
||||
@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
|
||||
tei += ' <p>\n'
|
||||
for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
|
||||
tei += ' <lb/>'
|
||||
indent = ''
|
||||
is_first_word_in_line = True
|
||||
for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):
|
||||
if ocrx_word.text is not None:
|
||||
tei += indent + escape(ocrx_word.text)
|
||||
indent = ' '
|
||||
if not is_first_word_in_line:
|
||||
tei += ' '
|
||||
tei += escape(ocrx_word.text)
|
||||
is_first_word_in_line = False
|
||||
tei += '\n'
|
||||
tei += ' </p>\n'
|
||||
tei += ' </body>\n'
|
||||
|
Reference in New Issue
Block a user