Update nlp software metadata representation

This commit is contained in:
Patrick Jentsch 2020-06-10 13:14:34 +02:00
parent 91708308bc
commit fe7ab93513

View File

@ -66,23 +66,20 @@ nlp = spacy.load(SPACY_MODELS[args.language])
# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html # See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
output_file_original_filename = args.o output_file_original_filename = args.o
output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt') output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt')
xml_head = '''<?xml version="1.0" encoding="UTF-8"?>\n\ common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
<corpus>\n\ + '<corpus>\n'
<text>\n\ + '<text>\n'
<metadata\n\ + '<nlp name="spaCy"\n'
spacyVersion="{spacy_version}" + ' version="{}"\n'.format(spacy.__version__)
spacyModel="{spacy_model}" + ' model="{}"\n'.format(SPACY_MODELS[args.language])
spacyModelVersion="{spacy_model_version}" + ' model_version="{}"\n'.format(nlp.meta['version'])
md5HashOfInput="{md5_hash}">\n'''.format(md5_hash=md5_hash, + ' md5_hash_of_input="{}" />'.format(md5_hash))
spacy_version=spacy.__version__,
spacy_model=SPACY_MODELS[args.language],
spacy_model_version=nlp.meta['version'])
with open(output_file_original_filename, 'w+') as output_file_original, \ with open(output_file_original_filename, 'w+') as output_file_original, \
open(output_file_stand_off_filename, 'w+') as output_file_stand_off: open(output_file_stand_off_filename, 'w+') as output_file_stand_off:
output_file_original.write(xml_head) output_file_original.write(common_xml)
output_file_stand_off.write(xml_head) output_file_stand_off.write(common_xml)
text_offset = 0 text_offset = 0
for text_chunk in text_chunks: for text_chunk in text_chunks:
doc = nlp(text_chunk) doc = nlp(text_chunk)
@ -91,7 +88,8 @@ with open(output_file_original_filename, 'w+') as output_file_original, \
output_file_stand_off.write('<s>\n') output_file_stand_off.write('<s>\n')
space_flag = False space_flag = False
# Skip whitespace tokens # Skip whitespace tokens
sent_no_space = [token for token in sent if not token.text.isspace()] sent_no_space = [token for token in sent
if not token.text.isspace()]
# No space variant for cwb original .vrt file input. # No space variant for cwb original .vrt file input.
for token in sent_no_space: for token in sent_no_space:
output_file_original.write('{}'.format(escape(token.text)) output_file_original.write('{}'.format(escape(token.text))
@ -112,5 +110,5 @@ with open(output_file_original_filename, 'w+') as output_file_original, \
output_file_original.write('</s>\n') output_file_original.write('</s>\n')
output_file_stand_off.write('</s>\n') output_file_stand_off.write('</s>\n')
text_offset = token_end + 1 text_offset = token_end + 1
output_file_original.write('</metadata>\n</text>\n</corpus>') output_file_original.write('</text>\n</corpus>')
output_file_stand_off.write('</metadata>\n</text>\n</corpus>') output_file_stand_off.write('</text>\n</corpus>')