Update the hocrtotei script

This commit is contained in:
Patrick Jentsch 2021-03-17 16:58:13 +01:00
parent 6db7f70446
commit 41f70da8eb
2 changed files with 38 additions and 35 deletions

View File

@ -1,54 +1,57 @@
#!/usr/bin/env python3.7
# coding=utf-8
"""Convert hOCR to TEI XML."""
from xml.sax.saxutils import escape
from argparse import ArgumentParser
import re
import xml.etree.ElementTree as ET


# Static TEI skeleton written before the converted pages.
TEI_HEAD = (
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
    ' <teiHeader>\n'
    ' <fileDesc>\n'
    ' <titleStmt>\n'
    ' <title></title>\n'
    ' </titleStmt>\n'
    ' <publicationStmt>\n'
    ' <p></p>\n'
    ' </publicationStmt>\n'
    ' <sourceDesc>\n'
    ' <p></p>\n'
    ' </sourceDesc>\n'
    ' </fileDesc>\n'
    ' </teiHeader>\n'
    ' <text>\n'
    ' <body>\n'
)

# Static TEI skeleton written after the converted pages.
TEI_TAIL = (
    ' </body>\n'
    ' </text>\n'
    '</TEI>\n'
)

# Properties read from the ``title`` attribute of an ``ocr_page`` element.
# Compiled once because they are applied to every page.
_IMAGE_RE = re.compile(r'image \"(.*?)\"')
_PAGENO_RE = re.compile(r'ppageno (\d+)')


def _page_break(page):
    """Return the ``<pb/>`` line for one ``ocr_page`` element.

    The ``facs`` and ``n`` attributes are only emitted when the matching
    property is present in the page's ``title`` attribute, so a page
    without ``image`` or ``ppageno`` no longer aborts the conversion.
    """
    # A missing title attribute is treated like an empty property list.
    page_properties = page.attrib.get('title') or ''
    attributes = []
    image = _IMAGE_RE.search(page_properties)
    if image is not None:
        # File names may contain '&' or '<'; escape them for attribute use.
        facsimile = escape(image.group(1), {'"': '&quot;'})
        attributes.append(' facs="{}"'.format(facsimile))
    page_number = _PAGENO_RE.search(page_properties)
    if page_number is not None:
        attributes.append(' n="{}"'.format(page_number.group(1)))
    return ' <pb{}/>\n'.format(''.join(attributes))


def _line_text(line):
    """Return the escaped, space separated words of one ``ocr_line``."""
    words = []
    for word in line.findall('.//*[@class="ocrx_word"]'):
        # itertext() also collects text wrapped in child markup such as
        # <strong> or <em>, for which ``word.text`` alone would be None.
        text = ''.join(word.itertext()).strip()
        if text:
            words.append(escape(text))
    return ' '.join(words)


def hocr_to_tei(hocr):
    """Convert a parsed hOCR document into a TEI XML string.

    Args:
        hocr: A parsed hOCR document, either an ``ElementTree`` or an
            ``Element``; only its ``findall`` method is used.

    Returns:
        The complete TEI document as a string. Every ``ocr_page`` becomes
        a ``<pb/>``, every ``ocr_par`` a ``<p>`` and every ``ocr_line`` a
        ``<lb/>`` followed by the words of that line.
    """
    # Collect the pieces in a list and join once instead of growing a
    # string with ``+=`` inside three nested loops.
    parts = [TEI_HEAD]
    for page in hocr.findall('.//*[@class="ocr_page"]'):
        parts.append(_page_break(page))
        for para in page.findall('.//*[@class="ocr_par"]'):
            parts.append(' <p>\n')
            for line in para.findall('.//*[@class="ocr_line"]'):
                parts.append(' <lb/>' + _line_text(line) + '\n')
            parts.append(' </p>\n')
    parts.append(TEI_TAIL)
    return ''.join(parts)


def main():
    """Parse the command line, convert the input and write the output."""
    parser = ArgumentParser(description='Convert hOCR to TEI XML.')
    # ``metavar`` is the short placeholder shown in the usage line; the
    # descriptive sentence belongs in ``help``.
    parser.add_argument('i', metavar='hOCR-sourcefile',
                        help='Path to hOCR input file')
    parser.add_argument('o', metavar='TEI-destfile',
                        help='Path to TEI output file')
    args = parser.parse_args()
    tei = hocr_to_tei(ET.parse(args.i))
    # The output has no XML declaration, so readers assume UTF-8; write it
    # as UTF-8 regardless of the platform's default encoding.
    with open(args.o, 'w', encoding='utf-8') as tei_file:
        tei_file.write(tei)


if __name__ == '__main__':
    main()

2
ocr
View File

@ -1,7 +1,7 @@
#!/usr/bin/env python2.7
# coding=utf-8
"""An OCR pipeline for PDF file processing."""
"""OCR pipeline for PDF file processing."""
__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
'Stephan Porada <porada@posteo.de>'