#!/usr/bin/env python3.7
# coding=utf-8
"""Merge one or more hOCR files into a single TEI P5 document.

Usage::

    script.py hOCR-sourcefile [hOCR-sourcefile ...] TEI-destfile

Each input file becomes one page (``<pb n="..."/>``, numbered from 1 in
argument order).  Every element with class ``ocr_par`` becomes a ``<p>``,
every ``ocr_line`` inside it becomes a run of words terminated by ``<lb/>``,
and every ``ocrx_word`` contributes its XML-escaped text.
"""
from xml.sax.saxutils import escape
from argparse import ArgumentParser
import xml.etree.ElementTree as ET

# Fixed document prologue: XML declaration, empty TEI header, opening of body.
_TEI_PROLOGUE = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
    ' <teiHeader>\n'
    ' <fileDesc>\n'
    ' <titleStmt/>\n'
    ' <publicationStmt/>\n'
    ' <sourceDesc/>\n'
    ' </fileDesc>\n'
    ' <encodingDesc/>\n'
    ' <profileDesc/>\n'
    ' </teiHeader>\n'
    ' <text>\n'
    ' <body>\n'
)

# Fixed document epilogue (no trailing newline after the root end tag).
_TEI_EPILOGUE = (
    ' </body>\n'
    ' </text>\n'
    '</TEI>'
)

# Prefix written before the first word of a line and before every later word.
# They are kept as separate constants so the line indentation can be changed
# independently of the inter-word separator.
_LINE_INDENT = ' '
_WORD_SEPARATOR = ' '


def _write_page(output_file, tree, page_number):
    """Write one hOCR page as a ``<pb/>`` followed by its paragraphs.

    Args:
        output_file: Writable text stream receiving the TEI markup.
        tree: Parsed hOCR document (``xml.etree.ElementTree.ElementTree``).
        page_number: Value written to the ``n`` attribute of ``<pb/>``.
    """
    output_file.write(' <pb n="%i"/>\n' % page_number)
    for para in tree.findall('.//*[@class="ocr_par"]'):
        output_file.write(' <p>\n')
        for line in para.findall('.//*[@class="ocr_line"]'):
            first_word_in_line = True
            for word in line.findall('.//*[@class="ocrx_word"]'):
                # itertext() also collects text nested in child markup
                # (e.g. <strong>/<em> inside the word span); reading only
                # ``word.text`` would silently drop such words.
                text = ''.join(word.itertext()).strip()
                if not text:
                    continue
                prefix = _LINE_INDENT if first_word_in_line else _WORD_SEPARATOR
                output_file.write(prefix + escape(text))
                first_word_in_line = False
            # Only lines that produced at least one word get a line break.
            if not first_word_in_line:
                output_file.write('<lb/>\n')
        output_file.write(' </p>\n')


def merge_hocr(input_paths, output_path):
    """Merge the hOCR files *input_paths* into one TEI file at *output_path*.

    Args:
        input_paths: Iterable of hOCR file paths; each becomes one page,
            numbered from 1 in iteration order.
        output_path: Destination path; an existing file is overwritten.

    Raises:
        OSError: If an input cannot be read or the output cannot be written.
        xml.etree.ElementTree.ParseError: If an input is not well-formed XML.
    """
    # The XML declaration promises UTF-8, so the file must be written as
    # UTF-8 regardless of the platform's locale encoding.  The context
    # manager guarantees the handle is closed even if parsing fails.
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(_TEI_PROLOGUE)
        for index, input_file in enumerate(input_paths):
            tree = ET.parse(input_file)
            _write_page(output_file, tree, index + 1)
        output_file.write(_TEI_EPILOGUE)


def main(argv=None):
    """Parse the command line and run the merge.

    Args:
        argv: Argument list without the program name; ``None`` makes
            argparse read ``sys.argv[1:]``.
    """
    parser = ArgumentParser(description='Merges hOCR files to one P5 file.')
    parser.add_argument('i', metavar='hOCR-sourcefile', nargs='+')
    parser.add_argument('o', metavar='TEI-destfile',)
    args = parser.parse_args(argv)
    merge_hocr(args.i, args.o)


if __name__ == '__main__':
    main()