ocr/hocr2tei

#!/usr/bin/env python3.7
# coding=utf-8

''' Convert hOCR to TEI XML. '''

from argparse import ArgumentParser
from lxml import html
from xml.sax.saxutils import escape
import re


# Command-line interface: the hOCR input path and the TEI output path are
# both mandatory.
parser = ArgumentParser(description='Convert hOCR to TEI XML.')
parser.add_argument(
    '-i', '--input-file',
    help='Input file',
    # Without this option ``args.input_file`` is None, which the hOCR parsing
    # step cannot read; let argparse report the missing option with a usage
    # message instead of failing later with a traceback.
    required=True
)
parser.add_argument(
    '-o', '--output-file',
    help='Output file',
    required=True
)
args = parser.parse_args()


# Patterns for the two properties read from an ocr_page ``title`` attribute:
# the quoted image path and the physical page number.
image_pattern = re.compile(r'image \"(.*?)\"')
page_number_pattern = re.compile(r'ppageno (\d+)')

# Collect the output fragments in a list and join them once at the end
# instead of growing one string with ``+=`` inside the nested loops.
tei_parts = [
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n',
    '  <teiHeader>\n',
    '    <fileDesc>\n',
    '      <titleStmt>\n',
    '        <title></title>\n',
    '      </titleStmt>\n',
    '      <publicationStmt>\n',
    '        <p></p>\n',
    '      </publicationStmt>\n',
    '      <sourceDesc>\n',
    '        <p></p>\n',
    '      </sourceDesc>\n',
    '    </fileDesc>\n',
    '  </teiHeader>\n',
    '  <text>\n',
    '    <body>\n',
]
hocr = html.parse(args.input_file)
for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
    # A page without a title attribute is handled like an empty title.
    ocr_page_title_attrib = ocr_page.attrib.get('title') or ''
    # Build the <pb/> attributes from whatever the title provides; a missing
    # property is left out rather than crashing on ``None.group(1)``.
    pb_attributes = ''
    image_match = image_pattern.search(ocr_page_title_attrib)
    if image_match is not None:
        # The parser has already decoded entities, so the path must be
        # escaped again before it is placed inside an XML attribute.
        facsimile = escape(image_match.group(1), {'"': '&quot;'})
        pb_attributes += f' facs="{facsimile}"'
    page_number_match = page_number_pattern.search(ocr_page_title_attrib)
    if page_number_match is not None:
        # Digits only (see the pattern), so no escaping is needed here.
        pb_attributes += f' n="{page_number_match.group(1)}"'
    tei_parts.append(f'      <pb{pb_attributes}/>\n')
    for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
        tei_parts.append('      <p>\n')
        for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
            # One <lb/> per OCR line, followed by its words separated by
            # single spaces. Words whose element has no direct text are
            # skipped.
            words = [
                escape(ocrx_word.text)
                for ocrx_word
                in ocr_line.findall('.//span[@class="ocrx_word"]')
                if ocrx_word.text is not None
            ]
            tei_parts.append('        <lb/>' + ' '.join(words) + '\n')
        tei_parts.append('      </p>\n')
tei_parts.append('    </body>\n')
tei_parts.append('  </text>\n')
tei_parts.append('</TEI>\n')
tei = ''.join(tei_parts)


# Write the document explicitly as UTF-8: the generated TEI carries no XML
# declaration, so XML parsers will read it as UTF-8, whereas open() would
# otherwise fall back to the platform's locale encoding.
with open(args.output_file, 'w', encoding='utf-8') as f:
    f.write(tei)
Bump versions 2020-04-06 09:21:52 +02:00			`#!/usr/bin/env python3.7`
Initial commit 2018-10-09 14:43:23 +02:00			`# coding=utf-8`

Codestyle enhacements 2022-01-27 13:40:23 +01:00			`''' Convert hOCR to TEI XML. '''`
First work on version 1.0.0 2021-02-19 13:04:03 +01:00
Update OCR Pipeline 2020-04-03 17:35:30 +02:00			`from argparse import ArgumentParser`
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00			`from lxml import html`
			`from xml.sax.saxutils import escape`
Cleanup and make use of globbing for input files for binarization and ocr 2021-03-15 12:45:05 +01:00			`import re`
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00
Initial commit 2018-10-09 14:43:23 +02:00
Update the hocrtotei script 2021-03-17 16:58:13 +01:00			`parser = ArgumentParser(description='Convert hOCR to TEI XML.')`
Codestyle enhacements 2022-01-27 13:40:23 +01:00			`parser.add_argument(`
			`'-i', '--input-file',`
			`help='Input file'`
			`)`
			`parser.add_argument(`
			`'-o', '--output-file',`
			`help='Output file',`
			`required=True`
			`)`
Use argparse in hocrtotei 2019-05-16 14:21:01 +02:00			`args = parser.parse_args()`

Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00
Update the hocrtotei script 2021-03-17 16:58:13 +01:00			`tei = ''`
			`tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'`
			`tei += ' <teiHeader>\n'`
			`tei += ' <fileDesc>\n'`
			`tei += ' <titleStmt>\n'`
			`tei += ' <title></title>\n'`
			`tei += ' </titleStmt>\n'`
			`tei += ' <publicationStmt>\n'`
			`tei += ' <p></p>\n'`
			`tei += ' </publicationStmt>\n'`
			`tei += ' <sourceDesc>\n'`
			`tei += ' <p></p>\n'`
			`tei += ' </sourceDesc>\n'`
			`tei += ' </fileDesc>\n'`
			`tei += ' </teiHeader>\n'`
			`tei += ' <text>\n'`
			`tei += ' <body>\n'`
Codestyle enhacements 2022-01-27 13:40:23 +01:00			`hocr = html.parse(args.input_file)`
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00			`for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):`
			`ocr_page_title_attrib = ocr_page.attrib.get('title')`
			`facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1)`
			`page_number = re.search(r'ppageno (\d+)', ocr_page_title_attrib).group(1)`
			`tei += f' <pb facs="{facsimile}" n="{page_number}"/>\n'`
			`for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):`
Update the hocrtotei script 2021-03-17 16:58:13 +01:00			`tei += ' <p>\n'`
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00			`for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):`
Update the hocrtotei script 2021-03-17 16:58:13 +01:00			`tei += ' <lb/>'`
Codestyle enhacements 2022-01-27 13:40:23 +01:00			`is_first_word_in_line = True`
Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00			`for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'):`
			`if ocrx_word.text is not None:`
Codestyle enhacements 2022-01-27 13:40:23 +01:00			`if not is_first_word_in_line:`
			`tei += ' '`
			`tei += escape(ocrx_word.text)`
			`is_first_word_in_line = False`
Update the hocrtotei script 2021-03-17 16:58:13 +01:00			`tei += '\n'`
			`tei += ' </p>\n'`
			`tei += ' </body>\n'`
			`tei += ' </text>\n'`
			`tei += '</TEI>\n'`

Update to Tesseract 5.0.0, Set version 0.1.0 2022-01-04 11:42:55 +01:00
			`with open(args.output_file, 'w') as f:`
			`f.write(tei)`