ocr/hocrtotei

58 lines
1.8 KiB
Plaintext
Raw Normal View History

2020-04-06 07:21:52 +00:00
#!/usr/bin/env python3.7
# coding=utf-8

"""Convert hOCR to TEI XML."""

from xml.sax.saxutils import escape
from argparse import ArgumentParser
import re
import xml.etree.ElementTree as ET


# Page-level metadata is read from the `title` attribute of each
# `ocr_page` element; these patterns pull out the two values we need.
_IMAGE_PATTERN = re.compile(r'image \"(.*?)\"')
_PAGE_NUMBER_PATTERN = re.compile(r'ppageno (\d+)')

# Static TEI skeleton emitted before the converted pages.
_TEI_PROLOGUE = (
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
    ' <teiHeader>\n'
    ' <fileDesc>\n'
    ' <titleStmt>\n'
    ' <title></title>\n'
    ' </titleStmt>\n'
    ' <publicationStmt>\n'
    ' <p></p>\n'
    ' </publicationStmt>\n'
    ' <sourceDesc>\n'
    ' <p></p>\n'
    ' </sourceDesc>\n'
    ' </fileDesc>\n'
    ' </teiHeader>\n'
    ' <text>\n'
    ' <body>\n'
)

# Static TEI skeleton emitted after the converted pages.
_TEI_EPILOGUE = (
    ' </body>\n'
    ' </text>\n'
    '</TEI>\n'
)


def _page_property(pattern, properties, name):
    """Return the first capture group of *pattern* found in *properties*.

    Raises:
        ValueError: if the property is absent, so that a malformed page is
            reported by name instead of failing with an error on ``None``.
    """
    match = pattern.search(properties)
    if match is None:
        raise ValueError(
            'hOCR page is missing the "{}" property in its title '
            'attribute: {!r}'.format(name, properties)
        )
    return match.group(1)


def _word_text(word):
    """Return the stripped text of an ``ocrx_word`` element.

    ``Element.text`` is ``None`` when the text lives inside a child element
    (for example inline markup around the word), which would drop the word.
    ``itertext()`` also collects the text of nested elements.
    """
    return ''.join(word.itertext()).strip()


def _hocr_to_tei(hocr):
    """Build the TEI document for a parsed hOCR tree and return it as a string.

    Args:
        hocr: an ``ElementTree`` or ``Element`` holding the hOCR document.

    Returns:
        The complete TEI XML text: one ``<pb/>`` per ``ocr_page``, one
        ``<p>`` per ``ocr_par`` and one ``<lb/>`` per ``ocr_line`` followed
        by the line's words separated by single spaces.

    Raises:
        ValueError: if a page lacks the ``image`` or ``ppageno`` property.
    """
    # Collect fragments and join once instead of growing a string with +=.
    parts = [_TEI_PROLOGUE]
    for page in hocr.findall('.//*[@class="ocr_page"]'):
        page_properties = page.attrib.get('title', '')
        facsimile = _page_property(_IMAGE_PATTERN, page_properties, 'image')
        page_number = _page_property(
            _PAGE_NUMBER_PATTERN, page_properties, 'ppageno'
        )
        # The image path ends up inside an attribute value, so it has to be
        # escaped just like the word text; the page number is digits only.
        parts.append(' <pb facs="{}" n="{}"/>\n'.format(
            escape(facsimile, {'"': '&quot;'}), page_number
        ))
        for para in page.findall('.//*[@class="ocr_par"]'):
            parts.append(' <p>\n')
            for line in para.findall('.//*[@class="ocr_line"]'):
                texts = (
                    _word_text(word)
                    for word in line.findall('.//*[@class="ocrx_word"]')
                )
                # Words that are empty after stripping are skipped so they
                # do not produce stray separator spaces.
                words = [escape(text) for text in texts if text]
                parts.append(' <lb/>' + ' '.join(words) + '\n')
            parts.append(' </p>\n')
    parts.append(_TEI_EPILOGUE)
    return ''.join(parts)


def main(argv=None):
    """Parse the command line, convert the hOCR input and write the TEI file.

    Args:
        argv: argument list to parse; ``None`` makes argparse use
            ``sys.argv[1:]``.
    """
    parser = ArgumentParser(description='Convert hOCR to TEI XML.')
    parser.add_argument('input', metavar='Path to hOCR input file')
    parser.add_argument('output', metavar='Path to TEI output file')
    args = parser.parse_args(argv)

    tei = _hocr_to_tei(ET.parse(args.input))

    # The document carries no XML declaration, so it must be UTF-8; do not
    # depend on the locale's default encoding.
    with open(args.output, 'w', encoding='utf-8') as tei_file:
        tei_file.write(tei)


if __name__ == '__main__':
    main()