#!/usr/bin/env python3.7
# coding=utf-8
"""Convert hOCR to TEI XML.

Each hOCR page (class ``ocr_page``) becomes a ``<pb/>`` milestone, each
paragraph (``ocr_par``) a ``<p>`` element, and each line (``ocr_line``) an
``<lb/>`` milestone followed by the space-separated text of its words
(``ocrx_word``).
"""

from xml.sax.saxutils import escape, quoteattr
from argparse import ArgumentParser
from typing import List, Optional, Sequence, Union
import re
import xml.etree.ElementTree as ET

# Patterns for the two page properties read from an hOCR page's title.
_IMAGE_RE = re.compile(r'image \"(.*?)\"')
_PAGENO_RE = re.compile(r'ppageno (\d+)')

# NOTE(review): the element names of this skeleton were reconstructed; the
# reviewed copy of the script had lost the markup inside its string
# literals. Confirm against the TEI profile the project expects.
_TEI_PROLOGUE = (
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">',
    '  <teiHeader>',
    '    <fileDesc>',
    '      <titleStmt>',
    '        <title></title>',
    '      </titleStmt>',
    '      <publicationStmt>',
    '        <p></p>',
    '      </publicationStmt>',
    '      <sourceDesc>',
    '        <p></p>',
    '      </sourceDesc>',
    '    </fileDesc>',
    '  </teiHeader>',
    '  <text>',
    '    <body>',
)
_TEI_EPILOGUE = (
    '    </body>',
    '  </text>',
    '</TEI>',
)


def _page_break(title: Optional[str]) -> str:
    """Return a ``<pb/>`` element built from an hOCR page title.

    The ``facs`` and ``n`` attributes are taken from the ``image`` and
    ``ppageno`` properties of *title*; an attribute is left out when its
    property is absent, and a bare ``<pb/>`` is returned when *title* is
    missing or empty.
    """
    attributes = ''
    if title:
        image = _IMAGE_RE.search(title)
        if image is not None:
            # quoteattr escapes the value and supplies the surrounding quotes.
            attributes += ' facs=' + quoteattr(image.group(1))
        page_number = _PAGENO_RE.search(title)
        if page_number is not None:
            attributes += ' n="{}"'.format(page_number.group(1))
    return '<pb{}/>'.format(attributes)


def _line_text(line: ET.Element) -> str:
    """Return the escaped, space-joined text of the words in an hOCR line."""
    words = []
    for word in line.findall('.//*[@class="ocrx_word"]'):
        # itertext() also collects text nested in inline children of the
        # word element, which ``word.text`` alone would miss.
        text = ''.join(word.itertext()).strip()
        if text:
            words.append(escape(text))
    return ' '.join(words)


def convert_hocr_to_tei(hocr: Union[ET.Element, ET.ElementTree]) -> str:
    """Return a TEI XML document for a parsed hOCR document.

    Args:
        hocr: The parsed hOCR document (an element or an element tree).

    Returns:
        The complete TEI document as a string ending in a newline.
    """
    out = list(_TEI_PROLOGUE)  # type: List[str]
    for page in hocr.findall('.//*[@class="ocr_page"]'):
        out.append('      ' + _page_break(page.attrib.get('title')))
        for para in page.findall('.//*[@class="ocr_par"]'):
            out.append('      <p>')
            for line in para.findall('.//*[@class="ocr_line"]'):
                out.append('        <lb/>' + _line_text(line))
            out.append('      </p>')
    out.extend(_TEI_EPILOGUE)
    return '\n'.join(out) + '\n'


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Parse the command line, convert the input file, write the output.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv``.
    """
    parser = ArgumentParser(description='Convert hOCR to TEI XML.')
    parser.add_argument('input', metavar='INPUT',
                        help='Path to hOCR input file')
    parser.add_argument('output', metavar='OUTPUT',
                        help='Path to TEI output file')
    args = parser.parse_args(argv)

    tei = convert_hocr_to_tei(ET.parse(args.input))
    # The XML declaration announces UTF-8, so do not rely on the locale's
    # default encoding when writing.
    with open(args.output, 'w', encoding='utf-8') as tei_file:
        tei_file.write(tei)


if __name__ == '__main__':
    main()