diff --git a/hocrtotei b/hocrtotei
index ee492d7..a6e4963 100755
--- a/hocrtotei
+++ b/hocrtotei
@@ -4,6 +4,7 @@
import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape
import os
+import re
import sys
input_files = filter(lambda x: x.endswith(".hocr"), sorted(os.listdir(sys.argv[1])))
@@ -25,7 +26,8 @@ output_file.write('\n' +
for input_file in input_files:
tree = ET.parse(os.path.join(sys.argv[1], input_file))
- output_file.write('
\n') for line in para.findall(".//*[@class='ocr_line']"): @@ -41,4 +43,4 @@ for input_file in input_files: output_file.write('