mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-25 16:14:19 +00:00
45 lines
1.0 KiB
Python
Executable File
45 lines
1.0 KiB
Python
Executable File
#!/usr/bin/env python3.7
|
|
# coding=utf-8
|
|
|
|
''' Combine multiple hOCR files. '''
|
|
|
|
from argparse import ArgumentParser
|
|
from lxml import html
|
|
|
|
|
|
# Command-line interface: one or more inputs and exactly one output, both
# mandatory. The parsed result is kept in the module-level name ``args``.
parser = ArgumentParser(description='Combine multiple hOCR files.')
# nargs='+' lets a single -i flag take several values.
parser.add_argument('-i', '--input-file', required=True, nargs='+',
                    help='Input file')
parser.add_argument('-o', '--output-file', required=True,
                    help='Output file')
args = parser.parse_args()
|
|
|
|
|
|
# Expand the -i arguments into one flat, ordered list of hOCR paths.
#
# An argument that starts with '@' names a list file: the text after the '@'
# is opened and every non-empty line of it is taken as one input path. Any
# other argument is used as an input path as-is.
#
# The list is created once, *before* the loop. Creating it inside the loop
# would discard everything collected from earlier arguments, so only the
# last -i value would ever be merged.
input_files = []
for input_file in args.input_file:
    if input_file.startswith('@'):
        with open(input_file[1:], 'r') as f:
            input_files += [x for x in f.read().split("\n") if x != '']
    else:
        input_files.append(input_file)

# Nothing to merge (e.g. only empty list files were given): stop with exit
# status 1. SystemExit is raised directly because the ``exit`` builtin is
# injected by the ``site`` module and is not guaranteed to exist; a string
# argument is printed to stderr and yields exit status 1.
if not input_files:
    raise SystemExit('error: no input files given')
|
|
|
|
|
|
# The first input file provides the document that everything is merged into.
hocr = html.parse(input_files[0])
hocr_body = hocr.find('body')

# From each remaining input, take every <div> whose class attribute is
# exactly "ocr_page" and add it, in order, to the end of the first
# document's <body>.
for extra_path in input_files[1:]:
    extra_document = html.parse(extra_path)
    hocr_body.extend(extra_document.findall('//div[@class="ocr_page"]'))
|
|
|
|
|
|
# Serialize the merged document to the path given with -o, as UTF-8 encoded
# HTML. The target is opened in binary mode and handed to write() together
# with an explicit encoding.
with open(args.output_file, 'wb') as output_stream:
    hocr.write(output_stream, encoding='UTF-8', method='html')
|