#!/usr/bin/env python3.5
# coding=utf-8
"""Convert a directory of hOCR pages into one minimal TEI document.

Usage: SCRIPT INPUT_DIR OUTPUT_FILE

Every ``*.hocr`` file in INPUT_DIR (processed in sorted filename order) becomes
a ``<pb/>`` page break followed by one ``<p>`` per ``ocr_par`` element.  Inside
a paragraph, the words of each ``ocr_line`` are written on one line terminated
by ``<lb/>``.
"""
import os
import sys
import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape

TEI_PROLOGUE = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n'
    ' <teiHeader>\n'
    ' <fileDesc>\n'
    ' <titleStmt/>\n'
    ' <publicationStmt/>\n'
    ' <sourceDesc/>\n'
    ' </fileDesc>\n'
    ' <encodingDesc/>\n'
    ' <profileDesc/>\n'
    ' </teiHeader>\n'
    ' <text>\n'
    ' <body>\n'
)
TEI_EPILOGUE = (
    ' </body>\n'
    ' </text>\n'
    '</TEI>'
)

# Text written before the first word of a line and between subsequent words.
LINE_PREFIX = " "
WORD_SEPARATOR = " "


def word_text(word):
    """Return the stripped text of an hOCR word element.

    The text is gathered from the element and all of its descendants, so a
    word wrapped in inline markup (``<strong>``, ``<em>``, ...) is not lost.
    """
    return "".join(word.itertext()).strip()


def write_page(out, tree, page_name):
    """Write the page break and all paragraphs of one parsed hOCR page.

    out       -- writable text file object receiving the TEI markup
    tree      -- parsed hOCR document (``xml.etree.ElementTree.ElementTree``)
    page_name -- page identifier used for the ``n`` and ``facs`` attributes
    """
    # The name ends up inside double-quoted attributes, so quotes must be
    # escaped in addition to the default '&', '<' and '>'.
    attr_name = escape(page_name, {'"': "&quot;"})
    out.write(' <pb n="%s" facs="%s.tif"/>\n' % (attr_name, attr_name))

    for para in tree.findall(".//*[@class='ocr_par']"):
        out.write(' <p>\n')
        for line in para.findall(".//*[@class='ocr_line']"):
            texts = (word_text(w)
                     for w in line.findall(".//*[@class='ocrx_word']"))
            # Blank words are dropped so they neither add stray separators
            # nor force an <lb/> for an otherwise empty line.
            words = [escape(text) for text in texts if text]
            if words:
                out.write(LINE_PREFIX + WORD_SEPARATOR.join(words) + '<lb/>\n')
        out.write(' </p>\n')


def hocr_to_tei(input_dir, output_path):
    """Convert every ``*.hocr`` file in *input_dir* into one TEI file.

    input_dir   -- directory that is scanned (non-recursively) for hOCR files
    output_path -- path of the TEI file to create or overwrite

    Raises OSError if the directory cannot be listed or a file cannot be
    opened, and xml.etree.ElementTree.ParseError if a page is not
    well-formed XML.
    """
    page_files = sorted(
        name for name in os.listdir(input_dir) if name.endswith(".hocr"))

    # The XML declaration promises UTF-8, so the encoding is set explicitly
    # rather than inherited from the locale.  The with-block guarantees the
    # file is closed even when parsing a page fails.
    with open(output_path, "w", encoding="utf-8") as out:
        out.write(TEI_PROLOGUE)
        for page_file in page_files:
            tree = ET.parse(os.path.join(input_dir, page_file))
            # The page identifier is everything before the first dot.
            write_page(out, tree, page_file.split(".")[0])
        out.write(TEI_EPILOGUE)


def main(argv=None):
    """Command-line entry point; return a process exit status.

    argv -- argument list without the program name; defaults to
            ``sys.argv[1:]``.  Arguments beyond the first two are ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print("usage: %s INPUT_DIR OUTPUT_FILE" % os.path.basename(sys.argv[0]),
              file=sys.stderr)
        return 2
    hocr_to_tei(args[0], args[1])
    return 0


if __name__ == "__main__":
    sys.exit(main())