#!/usr/bin/env python3.7
# coding=utf-8

'''
Convert hOCR to TEI XML.
'''

from argparse import ArgumentParser
from xml.sax.saxutils import escape
import re


# Static TEI scaffolding emitted before the converted pages.
TEI_HEAD = (
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
    ' <teiHeader>\n'
    ' <fileDesc>\n'
    ' <titleStmt>\n'
    ' <title></title>\n'
    ' </titleStmt>\n'
    ' <publicationStmt>\n'
    ' <p></p>\n'
    ' </publicationStmt>\n'
    ' <sourceDesc>\n'
    ' <p></p>\n'
    ' </sourceDesc>\n'
    ' </fileDesc>\n'
    ' </teiHeader>\n'
    ' <text>\n'
    ' <body>\n'
)

# Static TEI scaffolding emitted after the converted pages.
TEI_TAIL = (
    ' </body>\n'
    ' </text>\n'
    '</TEI>\n'
)


def page_break(title):
    '''
    Build the <pb/> line for one hOCR page.

    title is the value of the page's title attribute (or None when the
    attribute is missing). The facs attribute is taken from the quoted
    "image" property and the n attribute from the "ppageno" property.
    An attribute is left out when its property is not present, instead
    of aborting the whole conversion.
    '''
    title = title or ''
    attributes = ''

    image = re.search(r'image \"(.*?)\"', title)
    if image is not None:
        # File names may contain "&" or "<"; escape them so the
        # attribute value stays well-formed XML.
        facsimile = escape(image.group(1), {'"': '&quot;'})
        attributes += f' facs="{facsimile}"'

    page_number = re.search(r'ppageno (\d+)', title)
    if page_number is not None:
        attributes += f' n="{page_number.group(1)}"'

    return f' <pb{attributes}/>\n'


def line_text(ocr_line):
    '''
    Return the escaped words of one ocr_line element, separated by spaces.

    The text of a word is collected from the whole word element, so words
    whose text is wrapped in child elements (e.g. <strong> or <em>) are
    kept. Words without any text are skipped.
    '''
    words = (
        ''.join(ocrx_word.itertext())
        for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]')
    )
    return ' '.join(escape(word) for word in words if word)


def hocr_to_tei(hocr):
    '''
    Convert a parsed hOCR document to a TEI XML string.

    hocr is a parsed tree (or root element) offering findall(); every
    ocr_page becomes a <pb/>, every ocr_par a <p> and every ocr_line a
    line introduced by <lb/>.
    '''
    # Collect the pieces in a list and join once at the end.
    parts = [TEI_HEAD]
    for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
        parts.append(page_break(ocr_page.attrib.get('title')))
        for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
            parts.append(' <p>\n')
            for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
                parts.append(f' <lb/>{line_text(ocr_line)}\n')
            parts.append(' </p>\n')
    parts.append(TEI_TAIL)
    return ''.join(parts)


def main():
    '''
    Parse the command line, convert the input file and write the result.
    '''
    # Imported here so that the conversion helpers above can be imported
    # without lxml being installed.
    from lxml import html

    parser = ArgumentParser(description='Convert hOCR to TEI XML.')
    parser.add_argument(
        '-i',
        '--input-file',
        help='Input file',
        required=True
    )
    parser.add_argument(
        '-o',
        '--output-file',
        help='Output file',
        required=True
    )
    args = parser.parse_args()

    tei = hocr_to_tei(html.parse(args.input_file))

    # Write UTF-8 explicitly; the platform default encoding may differ.
    with open(args.output_file, 'w', encoding='utf-8') as f:
        f.write(tei)


if __name__ == '__main__':
    main()