mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
synced 2024-12-26 18:44:18 +00:00
Use more descriptive argument names than i and o (now: input and output)
This commit is contained in:
parent
41f70da8eb
commit
e78f667438
@ -9,8 +9,8 @@ import re
|
|||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
|
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
|
||||||
parser.add_argument('i', metavar='Path to hOCR input file')
|
parser.add_argument('input', metavar='Path to hOCR input file')
|
||||||
parser.add_argument('o', metavar='Path to TEI output file')
|
parser.add_argument('output', metavar='Path to TEI output file')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
tei = ''
|
tei = ''
|
||||||
@ -31,7 +31,7 @@ tei += ' </teiHeader>\n'
|
|||||||
tei += ' <text>\n'
|
tei += ' <text>\n'
|
||||||
tei += ' <body>\n'
|
tei += ' <body>\n'
|
||||||
# Conversion start
|
# Conversion start
|
||||||
hocr = ET.parse(args.i)
|
hocr = ET.parse(args.input)
|
||||||
for page in hocr.findall('.//*[@class="ocr_page"]'):
|
for page in hocr.findall('.//*[@class="ocr_page"]'):
|
||||||
page_properties = page.attrib.get('title')
|
page_properties = page.attrib.get('title')
|
||||||
facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1)
|
facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1)
|
||||||
@ -53,5 +53,5 @@ tei += ' </body>\n'
|
|||||||
tei += ' </text>\n'
|
tei += ' </text>\n'
|
||||||
tei += '</TEI>\n'
|
tei += '</TEI>\n'
|
||||||
|
|
||||||
with open(args.o, 'w') as tei_file:
|
with open(args.output, 'w') as tei_file:
|
||||||
tei_file.write(tei)
|
tei_file.write(tei)
|
||||||
|
Loading…
Reference in New Issue
Block a user