2020-04-06 07:21:52 +00:00
|
|
|
#!/usr/bin/env python3.7
|
2018-10-09 12:43:23 +00:00
|
|
|
# coding=utf-8
|
|
|
|
|
2022-01-27 12:40:23 +00:00
|
|
|
''' Convert hOCR to TEI XML. '''
|
2021-02-19 12:04:03 +00:00
|
|
|
|
2020-04-03 15:35:30 +00:00
|
|
|
from argparse import ArgumentParser
|
2022-01-04 10:42:55 +00:00
|
|
|
from lxml import html
|
|
|
|
from xml.sax.saxutils import escape
|
2021-03-15 11:45:05 +00:00
|
|
|
import re
|
2022-01-04 10:42:55 +00:00
|
|
|
|
2018-10-09 12:43:23 +00:00
|
|
|
|
2021-03-17 15:58:13 +00:00
|
|
|
# Command-line interface: both the input and the output path are mandatory.
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument('-i', '--input-file', required=True, help='Input file')
parser.add_argument('-o', '--output-file', required=True, help='Output file')
# Exposes the paths as args.input_file and args.output_file.
args = parser.parse_args()
|
|
|
|
|
2022-01-04 10:42:55 +00:00
|
|
|
|
2021-03-17 15:58:13 +00:00
|
|
|
# Opening part of the TEI document: an otherwise empty teiHeader followed by
# the start tags of <text> and <body>.
tei = ''.join((
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n',
    ' <teiHeader>\n',
    ' <fileDesc>\n',
    ' <titleStmt>\n',
    ' <title></title>\n',
    ' </titleStmt>\n',
    ' <publicationStmt>\n',
    ' <p></p>\n',
    ' </publicationStmt>\n',
    ' <sourceDesc>\n',
    ' <p></p>\n',
    ' </sourceDesc>\n',
    ' </fileDesc>\n',
    ' </teiHeader>\n',
    ' <text>\n',
    ' <body>\n',
))
|
2022-01-27 12:40:23 +00:00
|
|
|
# Parse the hOCR input and append its page, paragraph and line structure to
# the TEI document: one <pb/> per ocr_page, one <p> per ocr_par and one
# <lb/>-prefixed text line per ocr_line.
hocr = html.parse(args.input_file)
# Compiled once up front instead of being looked up again for every page.
image_pattern = re.compile(r'image \"(.*?)\"')
page_number_pattern = re.compile(r'ppageno (\d+)')
# Fragments are collected in a list and joined once at the end rather than
# growing the document string inside the nested loops.
body_parts = []
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
    # A page without a title attribute is treated like one with an empty
    # title, so the searches below never receive None.
    ocr_page_title_attrib = ocr_page.attrib.get('title') or ''
    # Each <pb/> attribute is emitted only when the corresponding property
    # is present in the title; a missing property no longer aborts the run.
    pb_attributes = ''
    image_match = image_pattern.search(ocr_page_title_attrib)
    if image_match is not None:
        # The value is interpolated into an XML attribute, so markup
        # characters in the image name must be escaped.
        facsimile = escape(image_match.group(1), {'"': '&quot;'})
        pb_attributes += f' facs="{facsimile}"'
    page_number_match = page_number_pattern.search(ocr_page_title_attrib)
    if page_number_match is not None:
        page_number = page_number_match.group(1)
        pb_attributes += f' n="{page_number}"'
    body_parts.append(f' <pb{pb_attributes}/>\n')
    for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
        body_parts.append(' <p>\n')
        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
            # Words whose text is None are skipped; the remaining words are
            # escaped and separated by single spaces.
            words = [
                escape(ocrx_word.text)
                for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]')
                if ocrx_word.text is not None
            ]
            body_parts.append(' <lb/>' + ' '.join(words) + '\n')
        body_parts.append(' </p>\n')
tei += ''.join(body_parts)
|
|
|
|
# Close <body>, <text> and the <TEI> root element, innermost first.
tei += (
    ' </body>\n'
    ' </text>\n'
    '</TEI>\n'
)
|
|
|
|
|
2022-01-04 10:42:55 +00:00
|
|
|
|
|
|
|
# Write the finished TEI document to the requested output path.
# The document carries no XML declaration, so XML readers will assume UTF-8;
# the encoding is therefore set explicitly instead of relying on the
# platform's locale default, which may be unable to encode OCR text.
with open(args.output_file, 'w', encoding='utf-8') as f:
    f.write(tei)
|