#!/usr/bin/env python3.5
# coding=utf-8
"""Merge a directory of per-page ``.hocr`` files into one output file.

Usage: SCRIPT INPUT_DIR OUTPUT_FILE

Every file in INPUT_DIR whose name ends with ``.hocr`` is parsed as XML and
processed in ascending order of the first number in its file name, so
"page-10.hocr" comes after "page-2.hocr".  For each page a page-number line
is written, followed by one block per ``ocr_par`` element that holds one
output line per ``ocr_line`` element, built from the XML-escaped text of its
``ocrx_word`` elements.
"""

import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape
import os
import re
import sys

_HOCR_SUFFIX = ".hocr"
_FIRST_NUMBER = re.compile(r'\d+')

# NOTE(review): every template below holds only whitespace and newlines.  The
# word text is XML-escaped before being written, which suggests markup was
# meant to be emitted here -- confirm against the intended output format.
HEADER = (
    '\n' +
    '\n' +
    ' \n' +
    ' \n' +
    ' \n' +
    ' \n' +
    ' \n' +
    ' \n' +
    ' \n' +
    ' \n' +
    ' \n' +
    ' \n' +
    ' \n'
)
# The "%d" is required: formatting a template that has no conversion
# specifier raises TypeError ("not all arguments converted").
PAGE_START = ' %d\n'
PARAGRAPH_START = '\n\n\n'
PARAGRAPH_END = '\n\n\n'
FOOTER = (
    ' \n' +
    '\n\n' +
    '\n'
)


def _page_number(file_name):
    """Return the first integer in *file_name*, e.g. "page-1.hocr" -> 1.

    The ``.hocr`` suffix is dropped before searching.  Raises ValueError when
    the remaining name contains no digits, so the offending file is named in
    the error instead of failing on a ``None`` match.
    """
    stem = file_name
    if stem.endswith(_HOCR_SUFFIX):
        stem = stem[:-len(_HOCR_SUFFIX)]
    match = _FIRST_NUMBER.search(stem)
    if match is None:
        raise ValueError("no page number in file name: %r" % file_name)
    return int(match.group(0))


def _line_text(line):
    """Return the escaped words of one ``ocr_line``, each preceded by a space.

    Words whose ``text`` is None are skipped; the result is "" when the line
    contributes no words at all.
    """
    return "".join(
        " " + escape(word.text.strip())
        for word in line.findall(".//*[@class='ocrx_word']")
        if word.text is not None
    )


def convert_directory(input_dir, output_path):
    """Write the merged text of all ``.hocr`` files in *input_dir*.

    Pages are ordered numerically by ``_page_number``.  *output_path* is
    written as UTF-8 and is closed even if a page fails to parse.

    Raises ValueError for a ``.hocr`` file without a number in its name;
    XML parse errors and OS errors propagate unchanged.
    """
    # Sorting happens before the output is opened, so a badly named input
    # does not leave a truncated output file behind.
    page_files = sorted(
        (name for name in os.listdir(input_dir)
         if name.endswith(_HOCR_SUFFIX)),
        key=_page_number
    )

    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(HEADER)
        for page_file in page_files:
            tree = ET.parse(os.path.join(input_dir, page_file))
            output_file.write(PAGE_START % _page_number(page_file))
            for para in tree.findall(".//*[@class='ocr_par']"):
                output_file.write(PARAGRAPH_START)
                for line in para.findall(".//*[@class='ocr_line']"):
                    text = _line_text(line)
                    # Lines without any usable word produce no output line.
                    if text:
                        output_file.write(text + '\n')
                output_file.write(PARAGRAPH_END)
        output_file.write(FOOTER)


def main(argv):
    """Run the conversion for ``argv = [prog, INPUT_DIR, OUTPUT_FILE]``.

    Returns the process exit status: 2 when arguments are missing, else 0.
    Extra trailing arguments are ignored.
    """
    if len(argv) < 3:
        sys.stderr.write("usage: %s INPUT_DIR OUTPUT_FILE\n" % argv[0])
        return 2
    convert_directory(argv[1], argv[2])
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))