mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
				synced 2025-10-31 01:12:47 +00:00 
			
		
		
		
	Update nlp software metadata representation
This commit is contained in:
		
							
								
								
									
										30
									
								
								spacy-nlp
									
									
									
									
									
								
							
							
						
						
									
										30
									
								
								spacy-nlp
									
									
									
									
									
								
							| @@ -66,23 +66,20 @@ nlp = spacy.load(SPACY_MODELS[args.language]) | ||||
| # See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html | ||||
| output_file_original_filename = args.o | ||||
| output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt') | ||||
| xml_head = '''<?xml version="1.0" encoding="UTF-8"?>\n\ | ||||
| <corpus>\n\ | ||||
| <text>\n\ | ||||
| <metadata\n\ | ||||
|     spacyVersion="{spacy_version}" | ||||
|     spacyModel="{spacy_model}" | ||||
|     spacyModelVersion="{spacy_model_version}" | ||||
|     md5HashOfInput="{md5_hash}">\n'''.format(md5_hash=md5_hash, | ||||
|                                              spacy_version=spacy.__version__, | ||||
|                                              spacy_model=SPACY_MODELS[args.language], | ||||
|                                              spacy_model_version=nlp.meta['version']) | ||||
| common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n' | ||||
|               + '<corpus>\n' | ||||
|               + '<text>\n' | ||||
|               + '<nlp name="spaCy"\n' | ||||
|               + '     version="{}"\n'.format(spacy.__version__) | ||||
|               + '     model="{}"\n'.format(SPACY_MODELS[args.language]) | ||||
|               + '     model_version="{}"\n'.format(nlp.meta['version']) | ||||
|               + '     md5_hash_of_input="{}" />'.format(md5_hash)) | ||||
|  | ||||
| with open(output_file_original_filename, 'w+') as output_file_original, \ | ||||
|      open(output_file_stand_off_filename, 'w+') as output_file_stand_off: | ||||
|  | ||||
|     output_file_original.write(xml_head) | ||||
|     output_file_stand_off.write(xml_head) | ||||
|     output_file_original.write(common_xml) | ||||
|     output_file_stand_off.write(common_xml) | ||||
|     text_offset = 0 | ||||
|     for text_chunk in text_chunks: | ||||
|         doc = nlp(text_chunk) | ||||
| @@ -91,7 +88,8 @@ with open(output_file_original_filename, 'w+') as output_file_original, \ | ||||
|             output_file_stand_off.write('<s>\n') | ||||
|             space_flag = False | ||||
|             # Skip whitespace tokens | ||||
|             sent_no_space = [token for token in sent if not token.text.isspace()] | ||||
|             sent_no_space = [token for token in sent | ||||
|                              if not token.text.isspace()] | ||||
|             # No space variant for cwb original .vrt file input. | ||||
|             for token in sent_no_space: | ||||
|                 output_file_original.write('{}'.format(escape(token.text)) | ||||
| @@ -112,5 +110,5 @@ with open(output_file_original_filename, 'w+') as output_file_original, \ | ||||
|             output_file_original.write('</s>\n') | ||||
|             output_file_stand_off.write('</s>\n') | ||||
|         text_offset = token_end + 1 | ||||
|     output_file_original.write('</metadata>\n</text>\n</corpus>') | ||||
|     output_file_stand_off.write('</metadata>\n</text>\n</corpus>') | ||||
|     output_file_original.write('</text>\n</corpus>') | ||||
|     output_file_stand_off.write('</text>\n</corpus>') | ||||
|   | ||||
		Reference in New Issue
	
	Block a user