Update the hocrtotei script

This commit is contained in:
Patrick Jentsch 2021-03-17 16:58:13 +01:00
parent 6db7f70446
commit 41f70da8eb
2 changed files with 38 additions and 35 deletions

View File

@ -1,54 +1,57 @@
#!/usr/bin/env python3.7 #!/usr/bin/env python3.7
# coding=utf-8 # coding=utf-8
"""Merges hOCR files into a TEI file.""" """Convert hOCR to TEI XML."""
from xml.sax.saxutils import escape from xml.sax.saxutils import escape
from argparse import ArgumentParser from argparse import ArgumentParser
import re import re
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
parser = ArgumentParser(description='Merges hOCR files into a TEI file.') parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument('i', metavar='hOCR-sourcefile') parser.add_argument('i', metavar='Path to hOCR input file')
parser.add_argument('o', metavar='TEI-destfile') parser.add_argument('o', metavar='Path to TEI output file')
args = parser.parse_args() args = parser.parse_args()
output_file = open(args.o, 'w') tei = ''
output_file.write( tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
'<?xml version="1.0" encoding="UTF-8"?>\n' tei += ' <teiHeader>\n'
+ '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n' tei += ' <fileDesc>\n'
+ ' <teiHeader>\n' tei += ' <titleStmt>\n'
+ ' <fileDesc>\n' tei += ' <title></title>\n'
+ ' <titleStmt/>\n' tei += ' </titleStmt>\n'
+ ' <publicationStmt/>\n' tei += ' <publicationStmt>\n'
+ ' <sourceDesc/>\n' tei += ' <p></p>\n'
+ ' </fileDesc>\n' tei += ' </publicationStmt>\n'
+ ' <encodingDesc/>\n' tei += ' <sourceDesc>\n'
+ ' <profileDesc/>\n' tei += ' <p></p>\n'
+ ' </teiHeader>\n' tei += ' </sourceDesc>\n'
+ ' <text>\n' tei += ' </fileDesc>\n'
+ ' <body>\n' tei += ' </teiHeader>\n'
) tei += ' <text>\n'
tree = ET.parse(args.i) tei += ' <body>\n'
for page in tree.findall('.//*[@class="ocr_page"]'): # Conversion start
hocr = ET.parse(args.i)
for page in hocr.findall('.//*[@class="ocr_page"]'):
page_properties = page.attrib.get('title') page_properties = page.attrib.get('title')
facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1) facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1)
page_number = re.search(r'ppageno (\d+)', page_properties).group(1) page_number = re.search(r'ppageno (\d+)', page_properties).group(1)
output_file.write(' <pb facs="%s" n="%s"/>\n' % (facsimile, page_number)) # noqa tei += ' <pb facs="{}" n="{}"/>\n'.format(facsimile, page_number)
for para in page.findall('.//*[@class="ocr_par"]'): for para in page.findall('.//*[@class="ocr_par"]'):
output_file.write(' <p>\n') tei += ' <p>\n'
for line in para.findall('.//*[@class="ocr_line"]'): for line in para.findall('.//*[@class="ocr_line"]'):
output_file.write(' <lb/>') tei += ' <lb/>'
indent = '' indent = ''
for word in line.findall('.//*[@class="ocrx_word"]'): for word in line.findall('.//*[@class="ocrx_word"]'):
if word.text is not None: if word.text is not None:
output_file.write(indent + escape(word.text.strip())) tei += indent + escape(word.text.strip())
indent = ' ' indent = ' '
output_file.write('\n') tei += '\n'
output_file.write(' </p>\n') tei += ' </p>\n'
output_file.write( # Conversion end
' </body>\n' tei += ' </body>\n'
+ ' </text>\n' tei += ' </text>\n'
+ '</TEI>' tei += '</TEI>\n'
)
output_file.close() with open(args.o, 'w') as tei_file:
tei_file.write(tei)

2
ocr
View File

@ -1,7 +1,7 @@
#!/usr/bin/env python2.7 #!/usr/bin/env python2.7
# coding=utf-8 # coding=utf-8
"""An OCR pipeline for PDF file processing.""" """OCR pipeline for PDF file processing."""
__author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \ __author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \
'Stephan Porada <porada@posteo.de>' 'Stephan Porada <porada@posteo.de>'