#!/usr/bin/env python3.7
# coding=utf-8
"""Combine multiple hOCR files.

Usage: pass one or more hOCR files as positional arguments together with
``-o OUTPUT``.  An argument that starts with ``@`` names a text file that
lists one input path per line.  The first input is used as the base
document; every ``<div class="ocr_page">`` found in the remaining inputs is
appended to the base document's ``<body>``.
"""
from argparse import ArgumentParser
from typing import Iterable, List

from lxml import html


def _expand_inputs(names: Iterable[str]) -> List[str]:
    """Return the flat, ordered list of input paths named by *names*.

    A plain name is kept as is.  A name of the form ``@listfile`` is replaced
    by the non-empty lines of ``listfile``.
    """
    # The accumulator must live outside the loop: resetting it per argument
    # would keep only the paths contributed by the last argument.
    files = []
    for name in names:
        if name.startswith('@'):
            with open(name[1:], 'r') as list_file:
                for line in list_file:
                    path = line.rstrip('\n')
                    if path:
                        files.append(path)
        else:
            files.append(name)
    return files


def _combine(files: List[str], output_file: str) -> None:
    """Merge the OCR pages of ``files[1:]`` into ``files[0]`` and write it.

    The result is serialized as UTF-8 HTML to *output_file*.  Exits with
    status 1 if the first document has no ``<body>`` element to append to.
    """
    hocr = html.parse(files[0])
    hocr_body = hocr.find('body')
    if hocr_body is None:
        raise SystemExit('{}: no <body> element found'.format(files[0]))

    for name in files[1:]:
        # Relative path form: searches the whole document below the root,
        # which is what the absolute '//div' spelling resolved to.
        for ocr_page in html.parse(name).findall('.//div[@class="ocr_page"]'):
            hocr_body.append(ocr_page)

    with open(output_file, 'wb') as out:
        hocr.write(out, encoding='UTF-8', method='html')


def main() -> None:
    """Parse the command line and combine the requested hOCR files."""
    parser = ArgumentParser(description='Combine multiple hOCR files.')
    parser.add_argument('file', help='Input file(s)', nargs='+')
    parser.add_argument('-o', '--output-file', help='Output file',
                        required=True)
    args = parser.parse_args()

    files = _expand_inputs(args.file)
    if not files:
        # Reachable when only empty @list files were given.  A string
        # argument makes the interpreter print it to stderr and exit with
        # status 1.
        raise SystemExit('No input files given.')

    _combine(files, args.output_file)


if __name__ == '__main__':
    main()