ocr/hocr2tei

59 lines
1.8 KiB
Plaintext
Raw Normal View History

2020-04-06 09:21:52 +02:00
#!/usr/bin/env python3.7
2018-10-09 14:43:23 +02:00
# coding=utf-8
2021-03-17 16:58:13 +01:00
"""Convert hOCR to TEI XML."""
2021-02-19 13:04:03 +01:00
2020-04-03 17:35:30 +02:00
from argparse import ArgumentParser
from lxml import html
from xml.sax.saxutils import escape
import re
2018-10-09 14:43:23 +02:00
2021-03-17 16:58:13 +01:00
# Properties read from the "title" attribute of an ocr_page element, which
# looks for example like: image "page-1.png"; bbox 0 0 100 100; ppageno 0
IMAGE_PATTERN = re.compile(r'image \"(.*?)\"')
PAGE_NUMBER_PATTERN = re.compile(r'ppageno (\d+)')

# Static TEI skeleton emitted before the converted pages.
TEI_HEAD = (
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
    ' <teiHeader>\n'
    ' <fileDesc>\n'
    ' <titleStmt>\n'
    ' <title></title>\n'
    ' </titleStmt>\n'
    ' <publicationStmt>\n'
    ' <p></p>\n'
    ' </publicationStmt>\n'
    ' <sourceDesc>\n'
    ' <p></p>\n'
    ' </sourceDesc>\n'
    ' </fileDesc>\n'
    ' </teiHeader>\n'
    ' <text>\n'
    ' <body>\n'
)

# Static TEI skeleton emitted after the converted pages.
TEI_TAIL = (
    ' </body>\n'
    ' </text>\n'
    '</TEI>\n'
)


def _page_break(title):
    """Return the TEI ``<pb/>`` line for an ocr_page ``title`` value.

    ``facs`` is taken from the ``image "..."`` property and ``n`` from the
    ``ppageno`` property. A property that is not present in *title* simply
    leaves its attribute out instead of aborting the conversion.
    """
    attributes = ''
    image = IMAGE_PATTERN.search(title)
    if image is not None:
        # The parser hands back the attribute value unescaped, so it must be
        # escaped again before it is written into an XML attribute.
        facsimile = escape(image.group(1), {'"': '&quot;'})
        attributes += f' facs="{facsimile}"'
    page_number = PAGE_NUMBER_PATTERN.search(title)
    if page_number is not None:
        attributes += f' n="{page_number.group(1)}"'
    return f' <pb{attributes}/>\n'


def _line_text(ocr_line):
    """Return the escaped, space-separated words of one ocr_line element."""
    # itertext() also collects text wrapped in child elements such as
    # <strong> or <em>; reading only ``.text`` would drop those words.
    words = (
        ''.join(ocrx_word.itertext())
        for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]')
    )
    return ' '.join(escape(word) for word in words if word)


def convert_hocr_to_tei(hocr):
    """Convert a parsed hOCR document to a TEI XML string.

    Args:
        hocr: Parsed hOCR tree (or root element) supporting ``findall``.

    Returns:
        The TEI document: one ``<pb/>`` per ocr_page, one ``<p>`` per
        ocr_par and one ``<lb/>``-prefixed text line per ocr_line.
    """
    parts = [TEI_HEAD]
    for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
        # A page without a title attribute is treated like an empty title.
        parts.append(_page_break(ocr_page.attrib.get('title') or ''))
        for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
            parts.append(' <p>\n')
            for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
                parts.append(' <lb/>' + _line_text(ocr_line) + '\n')
            parts.append(' </p>\n')
    parts.append(TEI_TAIL)
    return ''.join(parts)


def main():
    """Parse the command line, convert the input file, write the output."""
    parser = ArgumentParser(description='Convert hOCR to TEI XML.')
    parser.add_argument('file', help='Input file')
    parser.add_argument('-o', '--output-file', help='Output file',
                        required=True)
    args = parser.parse_args()

    tei = convert_hocr_to_tei(html.parse(args.file))
    # The document carries no XML declaration, so readers assume UTF-8;
    # write it as UTF-8 regardless of the locale's default encoding.
    with open(args.output_file, 'w', encoding='utf-8') as f:
        f.write(tei)


if __name__ == '__main__':
    main()