ocr/hocr-combine

#!/usr/bin/env python3.7
# coding=utf-8

''' Combine multiple hOCR files. '''

from argparse import ArgumentParser
from lxml import html


# Command-line interface: -i takes one or more values, -o takes a single
# value; both options are mandatory.
parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument('-i', '--input-file', nargs='+', required=True,
                    help='Input file')
parser.add_argument('-o', '--output-file', required=True,
                    help='Output file')
args = parser.parse_args()


# Gather the input paths from ALL -i values. The list is created once,
# before the loop: creating it inside the loop would discard the paths
# collected from every argument except the last one.
input_files = []
for input_file in args.input_file:
    if input_file.startswith('@'):
        # '@<path>' names a list file holding one input path per line;
        # empty lines are skipped.
        with open(input_file[1:], 'r') as f:
            input_files += [x for x in f.read().split("\n") if x != '']
    else:
        input_files.append(input_file)
if not input_files:
    # Nothing to combine (e.g. only empty list files were given).
    # SystemExit with a string prints it to stderr and exits with status 1.
    raise SystemExit('No input files given.')


# The first input is parsed as the base document; its <body> receives the
# pages of all remaining inputs.
hocr = html.parse(input_files[0])
hocr_body = hocr.find('body')
for input_file in input_files[1:]:
    # Collect every <div class="ocr_page"> of this input, then move them
    # into the base document's body in document order.
    additional_pages = html.parse(input_file).findall(
        '//div[@class="ocr_page"]'
    )
    hocr_body.extend(additional_pages)


# Serialize the combined tree as UTF-8 encoded HTML. The file is opened in
# binary mode because the tree is written with an explicit encoding.
with open(args.output_file, 'wb') as output_stream:
    hocr.write(output_stream, encoding='UTF-8', method='html')
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00			`#!/usr/bin/env python3.7`
			`# coding=utf-8`

Codestyle enhancements 2022-01-27 13:40:23 +01:00			`''' Combine multiple hOCR files. '''`
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00
			`from argparse import ArgumentParser`
			`from lxml import html`


			`parser = ArgumentParser(description='Combine multiple hOCR files.')`
Codestyle enhancements 2022-01-27 13:40:23 +01:00			`parser.add_argument(`
			`'-i', '--input-file',`
			`help='Input file',`
			`nargs='+',`
			`required=True`
			`)`
			`parser.add_argument(`
			`'-o', '--output-file',`
			`help='Output file',`
			`required=True`
			`)`
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00			`args = parser.parse_args()`
Mark required arguments in scripts as required 2022-02-03 10:40:50 +01:00
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00
Codestyle enhancements 2022-01-27 13:40:23 +01:00			`for input_file in args.input_file:`
			`input_files = []`
			`if input_file.startswith('@'):`
			`with open(input_file[1:], 'r') as f:`
			`input_files += [x for x in f.read().split("\n") if x != '']`
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00			`else:`
Codestyle enhancements 2022-01-27 13:40:23 +01:00			`input_files.append(input_file)`
			`if len(input_files) == 0:`
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00			`exit(1)`


Codestyle enhancements 2022-01-27 13:40:23 +01:00			`hocr = html.parse(input_files[0])`
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00			`hocr_body = hocr.find('body')`
Codestyle enhancements 2022-01-27 13:40:23 +01:00			`for input_file in input_files[1:]:`
			`for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):`
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00			`hocr_body.append(ocr_page)`


			`with open(args.output_file, 'wb') as f:`
			`hocr.write(f, encoding='UTF-8', method='html')`