mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 21:13:16 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			69 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			69 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env python3.7
# coding=utf-8

''' Convert hOCR to TEI XML. '''

from argparse import ArgumentParser
from xml.sax.saxutils import escape
import re

from lxml import html


# Properties looked up in the "title" attribute of an hOCR page element.
IMAGE_PATTERN = re.compile(r'image \"(.*?)\"')
PAGE_NUMBER_PATTERN = re.compile(r'ppageno (\d+)')

# Static TEI skeleton emitted before the converted pages.
TEI_HEAD = (
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
    '  <teiHeader>\n'
    '    <fileDesc>\n'
    '      <titleStmt>\n'
    '        <title></title>\n'
    '      </titleStmt>\n'
    '      <publicationStmt>\n'
    '        <p></p>\n'
    '      </publicationStmt>\n'
    '      <sourceDesc>\n'
    '        <p></p>\n'
    '      </sourceDesc>\n'
    '    </fileDesc>\n'
    '  </teiHeader>\n'
    '  <text>\n'
    '    <body>\n'
)

# Static TEI skeleton emitted after the converted pages.
TEI_TAIL = (
    '    </body>\n'
    '  </text>\n'
    '</TEI>\n'
)


def parse_args(argv=None):
    ''' Parse the command line.

    argv -- list of argument strings; None makes argparse read sys.argv.

    Returns the argparse namespace with "input_file" and "output_file".
    '''
    parser = ArgumentParser(description='Convert hOCR to TEI XML.')
    parser.add_argument(
        '-i', '--input-file',
        help='Input file',
        required=True
    )
    parser.add_argument(
        '-o', '--output-file',
        help='Output file',
        required=True
    )
    return parser.parse_args(argv)


def page_break(ocr_page):
    ''' Build the <pb/> line for one hOCR page element.

    The "facs" attribute is taken from the image property and the "n"
    attribute from the ppageno property of the element's title. A page
    title is not guaranteed to carry both, so an attribute is simply left
    out when its property (or the whole title) is missing instead of
    aborting the conversion.
    '''
    title = ocr_page.attrib.get('title') or ''
    attributes = ''
    image = IMAGE_PATTERN.search(title)
    if image is not None:
        # The file name ends up inside a double-quoted XML attribute, so
        # markup characters have to be escaped.
        facsimile = escape(image.group(1), {'"': '&quot;'})
        attributes += f' facs="{facsimile}"'
    page_number = PAGE_NUMBER_PATTERN.search(title)
    if page_number is not None:
        attributes += f' n="{page_number.group(1)}"'
    return f'      <pb{attributes}/>\n'


def line_text(ocr_line):
    ''' Return the escaped words of one hOCR line, separated by spaces.

    Only the direct text of each word element is used. Words without
    direct text (empty, or with their text only inside child elements)
    are skipped.
    '''
    words = (
        ocrx_word.text
        for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]')
    )
    return ' '.join(escape(word) for word in words if word is not None)


def hocr_to_tei(hocr):
    ''' Convert a parsed hOCR document to a TEI XML string.

    hocr -- parsed document offering findall(), e.g. the result of
            lxml.html.parse().

    Every page becomes a <pb/>, every paragraph a <p> and every line a
    <lb/> followed by the text of that line.
    '''
    # Collect the pieces and join once instead of growing one big string.
    parts = [TEI_HEAD]
    for ocr_page in hocr.findall('.//div[@class="ocr_page"]'):
        parts.append(page_break(ocr_page))
        for ocr_par in ocr_page.findall('.//p[@class="ocr_par"]'):
            parts.append('      <p>\n')
            for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'):
                parts.append(f'        <lb/>{line_text(ocr_line)}\n')
            parts.append('      </p>\n')
    parts.append(TEI_TAIL)
    return ''.join(parts)


def main(argv=None):
    ''' Read the hOCR input file and write the TEI output file. '''
    args = parse_args(argv)
    tei = hocr_to_tei(html.parse(args.input_file))
    # The document has no XML declaration, so it must be UTF-8 no matter
    # which encoding the current locale defaults to.
    with open(args.output_file, 'w', encoding='utf-8') as f:
        f.write(tei)


if __name__ == '__main__':
    main()