Codestyle enhancements

This commit is contained in:
Patrick Jentsch
2022-01-27 13:40:23 +01:00
parent aeab9b7802
commit 4518ca1c83
5 changed files with 187 additions and 82 deletions

View File

@@ -1,33 +1,42 @@
#!/usr/bin/env python3.7
# coding=utf-8
""""Combine multiple hOCR files."""
''' Combine multiple hOCR files. '''
from argparse import ArgumentParser
from lxml import html
parser = ArgumentParser(description='Combine multiple hOCR files.')
parser.add_argument('file', help='Input file(s)', nargs='+')
parser.add_argument('-o', '--output-file', help='Output file', required=True)
parser.add_argument(
'-i', '--input-file',
help='Input file',
nargs='+',
required=True
)
parser.add_argument(
'-o', '--output-file',
help='Output file',
required=True
)
args = parser.parse_args()
print(args)
for file in args.file:
files = []
if file.startswith('@'):
with open(file[1:], 'r') as f:
files += [x for x in f.read().split("\n") if x != '']
for input_file in args.input_file:
input_files = []
if input_file.startswith('@'):
with open(input_file[1:], 'r') as f:
input_files += [x for x in f.read().split("\n") if x != '']
else:
files.append(file)
if len(files) == 0:
input_files.append(input_file)
if len(input_files) == 0:
exit(1)
hocr = html.parse(files[0])
hocr = html.parse(input_files[0])
hocr_body = hocr.find('body')
for file in files[1:]:
for ocr_page in html.parse(file).findall('//div[@class="ocr_page"]'):
for input_file in input_files[1:]:
for ocr_page in html.parse(input_file).findall('//div[@class="ocr_page"]'):
hocr_body.append(ocr_page)