mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2025-07-01 17:20:33 +00:00
Codestyle enhancements
This commit is contained in:
37
hocr-combine
37
hocr-combine
@ -1,33 +1,42 @@
|
||||
#!/usr/bin/env python3.7
|
||||
# coding=utf-8
|
||||
|
||||
""""Combine multiple hOCR files."""
|
||||
''' Combine multiple hOCR files. '''
|
||||
|
||||
from argparse import ArgumentParser
|
||||
from lxml import html
|
||||
|
||||
|
||||
parser = ArgumentParser(description='Combine multiple hOCR files.')
|
||||
parser.add_argument('file', help='Input file(s)', nargs='+')
|
||||
parser.add_argument('-o', '--output-file', help='Output file', required=True)
|
||||
parser.add_argument(
|
||||
'-i', '--input-file',
|
||||
help='Input file',
|
||||
nargs='+',
|
||||
required=True
|
||||
)
|
||||
parser.add_argument(
|
||||
'-o', '--output-file',
|
||||
help='Output file',
|
||||
required=True
|
||||
)
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
|
||||
for file in args.file:
|
||||
files = []
|
||||
if file.startswith('@'):
|
||||
with open(file[1:], 'r') as f:
|
||||
files += [x for x in f.read().split("\n") if x != '']
|
||||
for input_file in args.input_file:
|
||||
input_files = []
|
||||
if input_file.startswith('@'):
|
||||
with open(input_file[1:], 'r') as f:
|
||||
input_files += [x for x in f.read().split("\n") if x != '']
|
||||
else:
|
||||
files.append(file)
|
||||
if len(files) == 0:
|
||||
input_files.append(input_file)
|
||||
if len(input_files) == 0:
|
||||
exit(1)
|
||||
|
||||
|
||||
hocr = html.parse(files[0])
|
||||
hocr = html.parse(input_files[0])
|
||||
hocr_body = hocr.find('body')
|
||||
for file in files[1:]:
|
||||
for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
|
||||
for input_file in input_files[1:]:
|
||||
for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
|
||||
hocr_body.append(ocr_page)
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user