Update to Tesseract 5.0.0, Set version 0.1.0

2026-01-31 17:10:55 +00:00 · 2022-01-04 11:42:55 +01:00
parent a0760487ae
commit e1b78b6ba4
7 changed files with 574 additions and 338 deletions
--- a/35
+++ b/35
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3.7
+# coding=utf-8
+
+"""Combine the ocr_page elements of multiple hOCR files into one file."""
+
+from argparse import ArgumentParser
+from lxml import html
+
+
+parser = ArgumentParser(description='Combine multiple hOCR files.')
+parser.add_argument('file', help='Input file(s)', nargs='+')
+parser.add_argument('-o', '--output-file', help='Output file', required=True)
+args = parser.parse_args()
+
+
+for file in args.file:
+    files = []
+    if file.startswith('@'):
+        with open(file[1:], 'r') as f:
+            files += [x for x in f.read().split("\n") if x != '']
+    else:
+        files.append(file)
+if len(files) == 0:
+    exit(1)
+
+
+hocr = html.parse(files[0])
+hocr_body = hocr.find('body')
+for file in files[1:]:
+    for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
+        hocr_body.append(ocr_page)
+
+
+with open(args.output_file, 'wb') as f:
+    hocr.write(f, encoding='UTF-8', method='html')