From b81ad4cc6723c3e4e6480dd53f33186f7aedd9d3 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch <pjentsch@pjentsch-Laptop.local>
Date: Thu, 16 May 2019 14:21:01 +0200
Subject: [PATCH] Use argparse in hocrtotei

---
 hocrtotei | 37 ++++++++++++++++++-------------------
 ocr       | 11 +++++++++--
 2 files changed, 27 insertions(+), 21 deletions(-)
diff --git a/hocrtotei b/hocrtotei
index fde4b46..f623650 100755
--- a/hocrtotei
+++ b/hocrtotei
@@ -1,21 +1,23 @@
 #!/usr/bin/env python3.5
 # coding=utf-8
 
-import xml.etree.ElementTree as ET
 from xml.sax.saxutils import escape
-import os
-import re
-import sys
+import argparse
+import xml.etree.ElementTree as ET
 
-input_files = sorted(
-    filter(
-        lambda x: x.endswith(".hocr"),
-        os.listdir(sys.argv[1])
-    ),
-    key=lambda x: int(re.search(r'\d+', x).group(0))
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    'i',
+    help='The input files.',
+    nargs='*',
 )
-# "page-1.hocr" -> "1"
-output_file = open(sys.argv[2], "w")
+parser.add_argument(
+    'o',
+    help='The output file.',
+)
+args = parser.parse_args()
+
+output_file = open(args.o, "w", encoding="utf-8")  # must match the encoding declared in the XML prolog
 
 output_file.write(
       '<?xml version="1.0" encoding="UTF-8"?>\n'
@@ -32,23 +34,20 @@ output_file.write(
     + '    <text>\n'
     + '        <body>\n'
 )
-
-for input_file in input_files:
-    tree = ET.parse(os.path.join(sys.argv[1], input_file))
-    page_number = int(re.search(r'\d+', input_file.split(".")[0]).group(0))
-    output_file.write('            <pb n="%i"/>\n' % (page_number))
+for index, input_file in enumerate(args.i):
+    tree = ET.parse(input_file)
+    output_file.write('            <pb n="%i"/>\n' % (index + 1))
     for para in tree.findall(".//*[@class='ocr_par']"):
         output_file.write('            <p>\n')
         for line in para.findall(".//*[@class='ocr_line']"):
             first_word_in_line = True
             for word in line.findall(".//*[@class='ocrx_word']"):
                 if word.text is not None:
-                    output_file.write(("                " if first_word_in_line else " ") + escape(word.text.strip()))
+                    output_file.write(('                ' if first_word_in_line else ' ') + escape(word.text.strip()))
                     first_word_in_line = False
             if not first_word_in_line:
                 output_file.write('<lb/>\n')
         output_file.write('            </p>\n')
-
 output_file.write(
       '        </body>\n'
     + '    </text>\n'
diff --git a/ocr b/ocr
index 312ac22..8c4d30a 100755
--- a/ocr
+++ b/ocr
@@ -307,8 +307,15 @@ class OCRWorkflow(WorkflowRunner):
         '''
         hocr_to_tei_jobs = []
         for index, job in enumerate(self.jobs):
-            cmd = 'hocrtotei "%s" "%s"' % (
-                os.path.join(job['output_dir'], 'tmp'),
+            files = os.listdir(os.path.join(job['output_dir'], 'tmp'))
+            files = filter(lambda x: x.endswith('.hocr'), files)
+            files.sort(key=lambda x: int(re.search(r'\d+', x).group(0)))
+            files = map(
+                lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"',
+                files
+            )
+            cmd = 'hocrtotei %s "%s"' % (
+                ' '.join(files),
                 os.path.join(
                     job['output_dir'],
                     os.path.join(job['output_dir'], job['name'] + '.xml')