ocr/hocrtotei

58 lines
1.8 KiB
Python
Executable File

#!/usr/bin/env python3.7
# coding=utf-8
"""Convert hOCR to TEI XML."""
from xml.sax.saxutils import escape
from argparse import ArgumentParser
import re
import xml.etree.ElementTree as ET
# Patterns for the hOCR page properties stored in an ocr_page "title"
# attribute, e.g.: image "page-1.png"; bbox 0 0 2480 3508; ppageno 0
IMAGE_RE = re.compile(r'image \"(.*?)\"')
PAGE_NUMBER_RE = re.compile(r'ppageno (\d+)')

# Static TEI skeleton; the converted pages are placed between both halves.
TEI_HEAD = (
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
    ' <teiHeader>\n'
    ' <fileDesc>\n'
    ' <titleStmt>\n'
    ' <title></title>\n'
    ' </titleStmt>\n'
    ' <publicationStmt>\n'
    ' <p></p>\n'
    ' </publicationStmt>\n'
    ' <sourceDesc>\n'
    ' <p></p>\n'
    ' </sourceDesc>\n'
    ' </fileDesc>\n'
    ' </teiHeader>\n'
    ' <text>\n'
    ' <body>\n'
)
TEI_TAIL = (
    ' </body>\n'
    ' </text>\n'
    '</TEI>\n'
)


def _attribute(value):
    """Escape *value* for use inside a double-quoted XML attribute."""
    return escape(value, {'"': '&quot;'})


def _page_break(page):
    """Return the ``<pb/>`` line for one ``ocr_page`` element.

    The ``image`` and ``ppageno`` properties are read from the element's
    ``title`` attribute. Either may be absent, in which case the matching
    ``facs`` / ``n`` attribute is left out instead of aborting the run.
    """
    properties = page.attrib.get('title') or ''
    attributes = ''
    image = IMAGE_RE.search(properties)
    if image is not None:
        attributes += ' facs="{}"'.format(_attribute(image.group(1)))
    page_number = PAGE_NUMBER_RE.search(properties)
    if page_number is not None:
        attributes += ' n="{}"'.format(page_number.group(1))
    return ' <pb{}/>\n'.format(attributes)


def _line_text(line):
    """Return the escaped, space-separated words of one ``ocr_line``."""
    words = []
    for word in line.findall('.//*[@class="ocrx_word"]'):
        # itertext() also collects text nested in child markup such as
        # <strong> or <em>, which word.text alone would miss.
        text = ''.join(word.itertext()).strip()
        if text:
            words.append(escape(text))
    return ' '.join(words)


def hocr_to_tei(hocr):
    """Convert a parsed hOCR document to a TEI XML string.

    *hocr* is an ``ElementTree`` or ``Element`` holding the hOCR markup.
    Every ``ocr_page`` yields a ``<pb/>``, every ``ocr_par`` a ``<p>`` and
    every ``ocr_line`` an ``<lb/>`` followed by the words of that line.
    """
    parts = [TEI_HEAD]
    # Conversion start
    for page in hocr.findall('.//*[@class="ocr_page"]'):
        parts.append(_page_break(page))
        for para in page.findall('.//*[@class="ocr_par"]'):
            parts.append(' <p>\n')
            for line in para.findall('.//*[@class="ocr_line"]'):
                parts.append(' <lb/>' + _line_text(line) + '\n')
            parts.append(' </p>\n')
    # Conversion end
    parts.append(TEI_TAIL)
    return ''.join(parts)


def main():
    """Parse the command line, convert the input and write the output."""
    parser = ArgumentParser(description='Convert hOCR to TEI XML.')
    parser.add_argument('input', help='Path to hOCR input file')
    parser.add_argument('output', help='Path to TEI output file')
    args = parser.parse_args()
    tei = hocr_to_tei(ET.parse(args.input))
    # The document carries no XML declaration, so it has to be UTF-8;
    # do not fall back to the platform's locale encoding.
    with open(args.output, 'w', encoding='utf-8') as tei_file:
        tei_file.write(tei)


if __name__ == '__main__':
    main()