#!/usr/bin/env python3.7
# coding=utf-8

'''
Combine multiple hOCR files.

The first input file is used as the base document; every
<div class="ocr_page"> found in the remaining input files is appended to the
base document's <body>, and the result is written to the output file.
'''

from argparse import ArgumentParser
import sys

from lxml import html


def _expand_input_files(arguments):
    '''
    Resolve the -i/--input-file values into a flat list of hOCR file paths.

    An argument that starts with '@' names a list file: the text after the
    '@' is opened and each non-empty line of it is taken as one path. Any
    other argument is taken as a path itself. The order of the arguments
    (and of the lines inside a list file) is preserved.
    '''
    # The list is created once, before the loop, so that paths from *all*
    # arguments accumulate instead of only those from the last argument.
    input_files = []
    for argument in arguments:
        if argument.startswith('@'):
            with open(argument[1:], 'r') as f:
                input_files.extend(
                    x for x in f.read().split("\n") if x != ''
                )
        else:
            input_files.append(argument)
    return input_files


def main():
    ''' Parse the command line, merge the hOCR files and write the result. '''
    parser = ArgumentParser(description='Combine multiple hOCR files.')
    parser.add_argument(
        '-i', '--input-file',
        help='Input file',
        nargs='+',
        required=True
    )
    parser.add_argument(
        '-o', '--output-file',
        help='Output file',
        required=True
    )
    args = parser.parse_args()

    input_files = _expand_input_files(args.input_file)
    if not input_files:
        # Only reachable when every argument was an '@' list file without
        # any non-empty line. A string argument makes sys.exit print it to
        # stderr and exit with status 1, the same status as before.
        sys.exit('No input files to combine.')

    # The first file supplies the document skeleton (head, metadata, body).
    hocr = html.parse(input_files[0])
    hocr_body = hocr.find('body')
    for input_file in input_files[1:]:
        ocr_pages = html.parse(input_file).findall('//div[@class="ocr_page"]')
        for ocr_page in ocr_pages:
            # Appending re-parents the page element into the base document.
            hocr_body.append(ocr_page)

    # Opened in binary mode because the tree is serialized as encoded bytes.
    with open(args.output_file, 'wb') as f:
        hocr.write(f, encoding='UTF-8', method='html')


if __name__ == '__main__':
    main()