mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-01-13 03:40:35 +00:00
36 lines
900 B
Plaintext
36 lines
900 B
Plaintext
|
#!/usr/bin/env python3.7
|
||
|
# coding=utf-8
|
||
|
|
||
|
"""Combine multiple hOCR files."""
|
||
|
|
||
|
from argparse import ArgumentParser
|
||
|
from lxml import html
|
||
|
|
||
|
|
||
|
# Command-line interface: one or more positional inputs plus a mandatory
# output path given via -o/--output-file.
parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument(
    'file',
    nargs='+',
    help='Input file(s)',
)
parser.add_argument(
    '-o', '--output-file',
    required=True,
    help='Output file',
)
args = parser.parse_args()
|
||
|
|
||
|
|
||
|
# Build the list of hOCR files to combine.
#
# The list is created once, before the loop, so that the entries contributed
# by every command-line argument accumulate. (Creating it inside the loop
# discarded everything except the last argument's files.)
files = []
for file in args.file:
    if file.startswith('@'):
        # '@<path>' refers to a text file that lists one input path per
        # line; empty lines are skipped.
        with open(file[1:], 'r') as f:
            files += [x for x in f.read().split("\n") if x != '']
    else:
        files.append(file)
if not files:
    # Nothing to combine, e.g. only empty list files were given. Raising
    # SystemExit with a string prints it to stderr and exits with status 1,
    # without relying on the site-provided exit() helper.
    raise SystemExit('No input files to combine.')
|
||
|
|
||
|
|
||
|
# The first input is the base document; every div with class "ocr_page"
# found in the remaining inputs is appended to its <body>, in list order.
hocr = html.parse(files[0])
hocr_body = hocr.find('body')
for extra_path in files[1:]:
    extra_doc = html.parse(extra_path)
    for page_div in extra_doc.findall('//div[@class="ocr_page"]'):
        hocr_body.append(page_div)
|
||
|
|
||
|
|
||
|
# Serialize the combined tree as UTF-8 encoded HTML into the requested
# output file, which is opened in binary mode.
with open(args.output_file, mode='wb') as output_stream:
    hocr.write(output_stream, method='html', encoding='UTF-8')
|