mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr.git
				synced 2025-10-31 14:12:45 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			45 lines
		
	
	
		
			1.0 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			45 lines
		
	
	
		
			1.0 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env python3.7
# coding=utf-8

''' Combine multiple hOCR files. '''

import sys
from argparse import ArgumentParser


def parse_args(argv=None):
    ''' Parse the command line of the script.

    argv: list of argument strings without the program name. None (the
          default) lets argparse read sys.argv[1:].

    Returns the argparse namespace with the attributes input_file (list of
    str with at least one entry) and output_file (str).
    '''
    parser = ArgumentParser(description='Combine multiple hOCR files.')
    parser.add_argument(
        '-i', '--input-file',
        help='Input file',
        nargs='+',
        required=True
    )
    parser.add_argument(
        '-o', '--output-file',
        help='Output file',
        required=True
    )
    return parser.parse_args(argv)


def expand_input_files(arguments):
    ''' Turn the values given to -i/--input-file into a flat list of paths.

    arguments: iterable of str. A value starting with '@' names a text file
               that lists one path per line; blank lines in it are skipped.
               Every other value is taken as a path itself.

    Returns a list of str in command line order. It is empty when every
    value was an '@' list file without any entries.

    Raises OSError when an '@' list file cannot be opened.
    '''
    # The list has to be created once, before the loop. Creating it per
    # argument would drop everything except the last argument's paths.
    input_files = []
    for argument in arguments:
        if argument.startswith('@'):
            with open(argument[1:], 'r') as f:
                # splitlines() also copes with CRLF list files, which would
                # otherwise leave a trailing '\r' on every path.
                input_files.extend(
                    line for line in f.read().splitlines() if line
                )
        else:
            input_files.append(argument)
    return input_files


def combine_hocr(input_files, output_file):
    ''' Merge the pages of several hOCR files into one document.

    input_files: non-empty list of hOCR file paths. The first file serves as
                 the base document; from every further file the div elements
                 whose class attribute is exactly "ocr_page" are appended to
                 the body of the base document.
    output_file: path the combined document is written to (UTF-8, serialized
                 with the html method).
    '''
    # Imported here instead of at module level so that --help, argument
    # errors and the list file expansion work without lxml being loaded.
    from lxml import html

    hocr = html.parse(input_files[0])
    hocr_body = hocr.find('body')
    for input_file in input_files[1:]:
        pages = html.parse(input_file).findall('//div[@class="ocr_page"]')
        for ocr_page in pages:
            # Appending moves the element out of its source tree.
            hocr_body.append(ocr_page)

    with open(output_file, 'wb') as f:
        hocr.write(f, encoding='UTF-8', method='html')


def main(argv=None):
    ''' Run the script and return its exit status.

    argv: see parse_args.

    Returns 0 on success and 1 when no input file is left after expanding
    the '@' list files.
    '''
    args = parse_args(argv)
    input_files = expand_input_files(args.input_file)
    if not input_files:
        print('No input files given.', file=sys.stderr)
        return 1
    combine_hocr(input_files, args.output_file)
    return 0


if __name__ == '__main__':
    sys.exit(main())