#!/usr/bin/env python3.5
# coding=utf-8

"""Merge several hOCR files into one TEI result file.

Each hOCR input file is treated as one page.  Pages are written in the
order the files are given on the command line; within a page, every
``ocr_par`` element becomes a paragraph and every ``ocr_line`` element
becomes one line of XML-escaped, space-separated words.
"""

from xml.sax.saxutils import escape
import argparse
import xml.etree.ElementTree as ET

# NOTE(review): the markup inside the output literals below was missing from
# the reviewed copy of this script (only whitespace and newlines were left,
# and the page-break format string had no conversion specifier).  The tags
# and indentation were reconstructed as a minimal TEI skeleton -- confirm
# against the TEI schema the downstream consumers expect.
TEI_HEADER = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
    '    <teiHeader>\n'
    '        <fileDesc>\n'
    '            <titleStmt/>\n'
    '            <publicationStmt/>\n'
    '            <sourceDesc/>\n'
    '        </fileDesc>\n'
    '        <encodingDesc/>\n'
    '        <profileDesc/>\n'
    '    </teiHeader>\n'
    '    <text>\n'
    '        <body>\n'
)
TEI_FOOTER = (
    '        </body>\n'
    '    </text>\n'
    '</TEI>'
)
PAGE_BREAK = '            <pb n="%d"/>\n'
PARAGRAPH_OPEN = '            <p>\n'
PARAGRAPH_CLOSE = '            </p>\n'
LINE_INDENT = '                '
LINE_BREAK = '<lb/>\n'

# hOCR marks structure with class attributes, so elements are matched by
# class on any tag name.
PARAGRAPH_XPATH = ".//*[@class='ocr_par']"
LINE_XPATH = ".//*[@class='ocr_line']"
WORD_XPATH = ".//*[@class='ocrx_word']"


def write_tei(output_file, input_files):
    """Write one TEI document built from *input_files* to *output_file*.

    Args:
        output_file: Object with a ``write(str)`` method that receives the
            TEI text.
        input_files: Iterable of hOCR sources, each a file name or a file
            object accepted by ``xml.etree.ElementTree.parse``.  The n-th
            source is emitted as page n (1-based).

    Raises:
        xml.etree.ElementTree.ParseError: If an input is not well-formed XML.
        OSError: If an input file name cannot be opened.
    """
    output_file.write(TEI_HEADER)
    for page_number, input_file in enumerate(input_files, start=1):
        tree = ET.parse(input_file)
        output_file.write(PAGE_BREAK % page_number)
        for para in tree.findall(PARAGRAPH_XPATH):
            output_file.write(PARAGRAPH_OPEN)
            for line in para.findall(LINE_XPATH):
                # Words without direct text (None) are skipped, so a line
                # consisting only of such words produces no output at all.
                words = [
                    escape(word.text.strip())
                    for word in line.findall(WORD_XPATH)
                    if word.text is not None
                ]
                if words:
                    output_file.write(
                        LINE_INDENT + ' '.join(words) + LINE_BREAK
                    )
            output_file.write(PARAGRAPH_CLOSE)
    output_file.write(TEI_FOOTER)


def _build_parser():
    """Return the command-line parser: one or more inputs, then the output."""
    parser = argparse.ArgumentParser(
        description='hocrtotei merges several hOCR files in order of their '
                    'occurrence on command line to one TEI result file.'
    )
    parser.add_argument(
        'i',
        metavar='hOCR-sourcefile',
        help='Input file in hOCR file format.',
        nargs='+'
    )
    parser.add_argument(
        'o',
        metavar='TEI-destfile',
        help='Output file.'
    )
    return parser


def main(argv=None):
    """Parse the command line and convert the hOCR inputs to a TEI file.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = _build_parser().parse_args(argv)
    # The context manager closes the file even if an input fails to parse.
    # UTF-8 is set explicitly so the bytes written match the encoding named
    # in the XML declaration regardless of the locale.
    with open(args.o, 'w', encoding='utf-8') as output_file:
        write_tei(output_file, args.i)


if __name__ == '__main__':
    main()