mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 00:32:55 +00:00 
			
		
		
		
	Update the hocrtotei script
This commit is contained in:
		
							
								
								
									
										71
									
								
								hocrtotei
									
									
									
									
									
								
							
							
						
						
									
										71
									
								
								hocrtotei
									
									
									
									
									
								
							| @@ -1,54 +1,57 @@ | ||||
| #!/usr/bin/env python3.7 | ||||
| # coding=utf-8 | ||||
|  | ||||
| """"Merges hOCR files into a TEI file.""" | ||||
| """"Convert hOCR to TEI XML.""" | ||||
|  | ||||
| from xml.sax.saxutils import escape | ||||
| from argparse import ArgumentParser | ||||
| import re | ||||
| import xml.etree.ElementTree as ET | ||||
|  | ||||
| parser = ArgumentParser(description='Merges hOCR files into a TEI file.') | ||||
| parser.add_argument('i', metavar='hOCR-sourcefile') | ||||
| parser.add_argument('o', metavar='TEI-destfile') | ||||
| parser = ArgumentParser(description='Convert hOCR to TEI XML.') | ||||
| parser.add_argument('i', metavar='Path to hOCR input file') | ||||
| parser.add_argument('o', metavar='Path to TEI output file') | ||||
| args = parser.parse_args() | ||||
|  | ||||
| output_file = open(args.o, 'w') | ||||
| output_file.write( | ||||
|       '<?xml version="1.0" encoding="UTF-8"?>\n' | ||||
|       + '<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="dtabf">\n' | ||||
|       + '    <teiHeader>\n' | ||||
|       + '        <fileDesc>\n' | ||||
|       + '            <titleStmt/>\n' | ||||
|       + '            <publicationStmt/>\n' | ||||
|       + '            <sourceDesc/>\n' | ||||
|       + '        </fileDesc>\n' | ||||
|       + '        <encodingDesc/>\n' | ||||
|       + '        <profileDesc/>\n' | ||||
|       + '    </teiHeader>\n' | ||||
|       + '    <text>\n' | ||||
|       + '        <body>\n' | ||||
| ) | ||||
| tree = ET.parse(args.i) | ||||
| for page in tree.findall('.//*[@class="ocr_page"]'): | ||||
| tei = '' | ||||
| tei += '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n' | ||||
| tei += '  <teiHeader>\n' | ||||
| tei += '    <fileDesc>\n' | ||||
| tei += '      <titleStmt>\n' | ||||
| tei += '        <title></title>\n' | ||||
| tei += '      </titleStmt>\n' | ||||
| tei += '      <publicationStmt>\n' | ||||
| tei += '        <p></p>\n' | ||||
| tei += '      </publicationStmt>\n' | ||||
| tei += '      <sourceDesc>\n' | ||||
| tei += '        <p></p>\n' | ||||
| tei += '      </sourceDesc>\n' | ||||
| tei += '    </fileDesc>\n' | ||||
| tei += '  </teiHeader>\n' | ||||
| tei += '  <text>\n' | ||||
| tei += '    <body>\n' | ||||
| # Conversion start | ||||
| hocr = ET.parse(args.i) | ||||
| for page in hocr.findall('.//*[@class="ocr_page"]'): | ||||
|     page_properties = page.attrib.get('title') | ||||
|     facsimile = re.search(r'image \"(.*?)\"', page_properties).group(1) | ||||
|     page_number = re.search(r'ppageno (\d+)', page_properties).group(1) | ||||
|     output_file.write('            <pb facs="%s" n="%s"/>\n' % (facsimile, page_number))  # noqa | ||||
|     tei += '      <pb facs="{}" n="{}"/>\n'.format(facsimile, page_number) | ||||
|     for para in page.findall('.//*[@class="ocr_par"]'): | ||||
|         output_file.write('            <p>\n') | ||||
|         tei += '      <p>\n' | ||||
|         for line in para.findall('.//*[@class="ocr_line"]'): | ||||
|             output_file.write('                <lb/>') | ||||
|             tei += '        <lb/>' | ||||
|             indent = '' | ||||
|             for word in line.findall('.//*[@class="ocrx_word"]'): | ||||
|                 if word.text is not None: | ||||
|                     output_file.write(indent + escape(word.text.strip())) | ||||
|                     tei += indent + escape(word.text.strip()) | ||||
|                     indent = ' ' | ||||
|             output_file.write('\n') | ||||
|         output_file.write('            </p>\n') | ||||
| output_file.write( | ||||
|       '        </body>\n' | ||||
|       + '    </text>\n' | ||||
|       + '</TEI>' | ||||
| ) | ||||
| output_file.close() | ||||
|             tei += '\n' | ||||
|         tei += '      </p>\n' | ||||
| # Conversion end | ||||
| tei += '    </body>\n' | ||||
| tei += '  </text>\n' | ||||
| tei += '</TEI>\n' | ||||
|  | ||||
| with open(args.o, 'w') as tei_file: | ||||
|     tei_file.write(tei) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user