mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 21:03:16 +00:00 
			
		
		
		
	Code style enhancements
This commit is contained in:
		
							
								
								
									
										23
									
								
								hocr2tei
									
									
									
									
									
								
							
							
						
						
									
										23
									
								
								hocr2tei
									
									
									
									
									
								
							| @@ -1,7 +1,7 @@ | ||||
| #!/usr/bin/env python3.7 | ||||
| # coding=utf-8 | ||||
|  | ||||
| """"Convert hOCR to TEI XML.""" | ||||
| ''' Convert hOCR to TEI XML. ''' | ||||
|  | ||||
| from argparse import ArgumentParser | ||||
| from lxml import html | ||||
| @@ -10,8 +10,15 @@ import re | ||||
|  | ||||
|  | ||||
| parser = ArgumentParser(description='Convert hOCR to TEI XML.') | ||||
| parser.add_argument('file', help='Input file') | ||||
| parser.add_argument('-o', '--output-file', help='Output file', required=True) | ||||
| parser.add_argument( | ||||
|     '-i', '--input-file', | ||||
|     help='Input file' | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-o', '--output-file', | ||||
|     help='Output file', | ||||
|     required=True | ||||
| ) | ||||
| args = parser.parse_args() | ||||
|  | ||||
|  | ||||
| @@ -32,7 +39,7 @@ tei += '    </fileDesc>\n' | ||||
| tei += '  </teiHeader>\n' | ||||
| tei += '  <text>\n' | ||||
| tei += '    <body>\n' | ||||
| hocr = html.parse(args.file) | ||||
| hocr = html.parse(args.input_file) | ||||
| for ocr_page in hocr.findall('.//div[@class="ocr_page"]'): | ||||
|     ocr_page_title_attrib = ocr_page.attrib.get('title') | ||||
|     facsimile = re.search(r'image \"(.*?)\"', ocr_page_title_attrib).group(1) | ||||
| @@ -42,11 +49,13 @@ for ocr_page in hocr.findall('.//div[@class="ocr_page"]'): | ||||
|         tei += '      <p>\n' | ||||
|         for ocr_line in ocr_par.findall('.//span[@class="ocr_line"]'): | ||||
|             tei += '        <lb/>' | ||||
|             indent = '' | ||||
|             is_first_word_in_line = True | ||||
|             for ocrx_word in ocr_line.findall('.//span[@class="ocrx_word"]'): | ||||
|                 if ocrx_word.text is not None: | ||||
|                     tei += indent + escape(ocrx_word.text) | ||||
|                     indent = ' ' | ||||
|                     if not is_first_word_in_line: | ||||
|                         tei += ' ' | ||||
|                     tei += escape(ocrx_word.text) | ||||
|                     is_first_word_in_line = False | ||||
|             tei += '\n' | ||||
|         tei += '      </p>\n' | ||||
| tei += '    </body>\n' | ||||
|   | ||||
		Reference in New Issue
	
	Block a user