#!/usr/bin/env python3.7
# coding=utf-8
"""Merges hOCR files into a TEI file.

Usage::

    <this script> hOCR-sourcefile TEI-destfile

Every element with class ``ocr_page`` in the source becomes a page break,
every ``ocr_par`` inside it a paragraph, and every ``ocr_line`` one output
line made of its ``ocrx_word`` texts joined by single spaces.
"""

from argparse import ArgumentParser
import re
import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape, quoteattr

# NOTE(review): in the copy of this script that was reviewed, the XML tags
# inside the string literals had been stripped (only whitespace and '\n' were
# left, and the page-break format string had lost its two placeholders).
# The markup below is a reconstruction: a minimal TEI skeleton, <pb/> for
# pages, <p> for paragraphs and <lb/> for lines.  Confirm it against the
# intended output (or restore the original literals) before relying on it.
_TEI_HEADER = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
    ' <teiHeader>\n'
    '  <fileDesc>\n'
    '   <titleStmt><title/></titleStmt>\n'
    '   <publicationStmt><p/></publicationStmt>\n'
    '   <sourceDesc><p/></sourceDesc>\n'
    '  </fileDesc>\n'
    ' </teiHeader>\n'
    ' <text>\n'
    '  <body>\n'
)
_TEI_FOOTER = (
    '  </body>\n'
    ' </text>\n'
    '</TEI>'
)

# Properties read from the ``title`` attribute of an ``ocr_page`` element.
_IMAGE_RE = re.compile(r'image \"(.*?)\"')
_PAGENO_RE = re.compile(r'ppageno (\d+)')


def _page_break(title):
    """Return the page-break line for a page with the given ``title`` text.

    ``title`` is the raw value of the page element's ``title`` attribute
    (an empty string when the attribute is absent).  The ``image`` value
    becomes the ``facs`` attribute and the ``ppageno`` value the ``n``
    attribute; a property that is not present is simply left out instead
    of aborting the whole conversion.
    """
    attributes = []
    for name, pattern in (('facs', _IMAGE_RE), ('n', _PAGENO_RE)):
        match = pattern.search(title)
        if match is not None:
            # quoteattr() escapes &, < and quotes, so an unusual image path
            # cannot break the well-formedness of the output.
            attributes.append(' %s=%s' % (name, quoteattr(match.group(1))))
    return '   <pb%s/>\n' % ''.join(attributes)


def _line_text(line):
    """Return the escaped words of one ``ocr_line`` joined by single spaces.

    NOTE(review): as in the original, a word whose ``.text`` is ``None``
    (for instance because its text sits inside a child element) is skipped.
    Confirm that this is intended.
    """
    return ' '.join(
        escape(word.text.strip())
        for word in line.findall('.//*[@class="ocrx_word"]')
        if word.text is not None
    )


def _body_lines(tree):
    """Yield the TEI body, one output line at a time, for a parsed hOCR tree."""
    for page in tree.findall('.//*[@class="ocr_page"]'):
        yield _page_break(page.get('title', ''))
        for para in page.findall('.//*[@class="ocr_par"]'):
            yield '   <p>\n'
            for line in para.findall('.//*[@class="ocr_line"]'):
                yield '    <lb/>%s\n' % _line_text(line)
            yield '   </p>\n'


def main(argv=None):
    """Convert the hOCR file named on the command line into a TEI file.

    Args:
        argv: Argument list ``[hOCR-sourcefile, TEI-destfile]``; defaults to
            ``sys.argv[1:]`` when ``None``.

    Raises:
        xml.etree.ElementTree.ParseError: if the source is not well-formed XML.
        OSError: if the source cannot be read or the destination written.
    """
    parser = ArgumentParser(description='Merges hOCR files into a TEI file.')
    parser.add_argument('i', metavar='hOCR-sourcefile')
    parser.add_argument('o', metavar='TEI-destfile')
    args = parser.parse_args(argv)

    # Parse before touching the destination so that an unreadable or
    # malformed source does not leave a truncated output file behind.
    tree = ET.parse(args.i)

    # Explicit UTF-8: the output must not depend on the platform's locale.
    with open(args.o, 'w', encoding='utf-8') as output_file:
        output_file.write(_TEI_HEADER)
        output_file.writelines(_body_lines(tree))
        output_file.write(_TEI_FOOTER)


if __name__ == '__main__':
    main()