mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
				synced 2025-10-31 20:33:12 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			46 lines
		
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			46 lines
		
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env python3.7
# coding=utf-8

"""Convert plain text and JSON stand off to a CWB vrt file.

The script reads a JSON stand-off data file and the plain text file it
refers to, checks that the text file's MD5 digest matches the digest stored
in the stand-off metadata, and writes the result of
``StandOffData.to_vrt(text)`` to the output file.
"""

from argparse import ArgumentParser
import hashlib
import json

from stand_off_data import StandOffData


def _parse_args():
    """Build the command line parser and return the parsed arguments."""
    parser = ArgumentParser(
        description='Convert plain text and JSON stand off to a CWB vrt file'
    )
    parser.add_argument(
        '-s', '--stand-off-data-file',
        help='JSON stand off data input file',
        required=True
    )
    parser.add_argument(
        '-t', '--text-file',
        help='Plain text input file',
        required=True
    )
    parser.add_argument(
        '-o', '--output-file',
        help='Output file',
        required=True
    )
    return parser.parse_args()


def _md5_hexdigest(path):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is read in binary mode and hashed chunk by chunk, so the whole
    file never has to be held in memory at once.
    """
    md5 = hashlib.md5()
    # Read a multiple of the hash's internal block size per iteration.
    chunk_size = 128 * md5.block_size
    with open(path, 'rb') as binary_file:
        for chunk in iter(lambda: binary_file.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()


def main():
    """Run the conversion described in the module docstring.

    Raises:
        Exception: if the MD5 digest of the text file differs from the one
            recorded under ``meta['file']['md5']`` in the stand-off data.
    """
    args = _parse_args()

    # JSON is read as UTF-8 explicitly rather than relying on the locale.
    with open(args.stand_off_data_file, encoding='utf-8') as stand_off_data_file:  # noqa
        stand_off_data = StandOffData(json.load(stand_off_data_file))

    file_meta = stand_off_data.meta['file']

    # Refuse to combine annotations with a text file they were not made for.
    if _md5_hexdigest(args.text_file) != file_meta['md5']:
        raise Exception('md5 not equal')

    # Decode the text with the encoding recorded in the stand-off metadata.
    with open(args.text_file, encoding=file_meta['encoding']) as text_file:
        text = text_file.read()

    # Write UTF-8 explicitly: the locale default may be unable to encode
    # characters that were decoded from the input text.
    with open(args.output_file, 'w', encoding='utf-8') as vrt_file:
        vrt_file.write(stand_off_data.to_vrt(text))


if __name__ == '__main__':
    main()