diff --git a/hocrtotei b/hocrtotei index fde4b46..f623650 100755 --- a/hocrtotei +++ b/hocrtotei @@ -1,21 +1,23 @@ #!/usr/bin/env python3.5 # coding=utf-8 -import xml.etree.ElementTree as ET from xml.sax.saxutils import escape -import os -import re -import sys +import argparse +import xml.etree.ElementTree as ET -input_files = sorted( - filter( - lambda x: x.endswith(".hocr"), - os.listdir(sys.argv[1]) - ), - key=lambda x: int(re.search(r'\d+', x).group(0)) +parser = argparse.ArgumentParser() +parser.add_argument( + 'i', + help='The input files.', + nargs='*', ) -# "page-1.hocr" -> "1" -output_file = open(sys.argv[2], "w") +parser.add_argument( + 'o', + help='The output file.', +) +args = parser.parse_args() + +output_file = open(args.o, "w") output_file.write( '\n' @@ -32,23 +34,20 @@ output_file.write( + ' \n' + ' \n' ) - -for input_file in input_files: - tree = ET.parse(os.path.join(sys.argv[1], input_file)) - page_number = int(re.search(r'\d+', input_file.split(".")[0]).group(0)) - output_file.write(' \n' % (page_number)) +for index, input_file in enumerate(args.i): + tree = ET.parse(input_file) + output_file.write(' \n' % (index + 1)) for para in tree.findall(".//*[@class='ocr_par']"): output_file.write('

\n') for line in para.findall(".//*[@class='ocr_line']"): first_word_in_line = True for word in line.findall(".//*[@class='ocrx_word']"): if word.text is not None: - output_file.write((" " if first_word_in_line else " ") + escape(word.text.strip())) + output_file.write((' ' if first_word_in_line else ' ') + escape(word.text.strip())) first_word_in_line = False if not first_word_in_line: output_file.write('\n') output_file.write('

\n') - output_file.write( ' \n' + '
\n' diff --git a/ocr b/ocr index 312ac22..8c4d30a 100755 --- a/ocr +++ b/ocr @@ -307,8 +307,15 @@ class OCRWorkflow(WorkflowRunner): ''' hocr_to_tei_jobs = [] for index, job in enumerate(self.jobs): - cmd = 'hocrtotei "%s" "%s"' % ( - os.path.join(job['output_dir'], 'tmp'), + files = os.listdir(os.path.join(job['output_dir'], 'tmp')) + files = filter(lambda x: x.endswith('.hocr'), files) + files.sort(key=lambda x: int(re.search(r'\d+', x).group(0))) + files = map( + lambda x: '"' + os.path.join(job['output_dir'], 'tmp', x) + '"', + files + ) + cmd = 'hocrtotei %s "%s"' % ( + ' '.join(files), os.path.join( job['output_dir'], os.path.join(job['output_dir'], job['name'] + '.xml')