Update nlp software metadata representation

2026-08-02 04:33:33 +00:00 · 2020-06-10 13:14:34 +02:00
parent 91708308bc
commit fe7ab93513
1 changed files with 14 additions and 16 deletions
@@ -66,23 +66,20 @@ nlp = spacy.load(SPACY_MODELS[args.language])
 # See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
 output_file_original_filename = args.o
 output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt')
-xml_head = '''<?xml version="1.0" encoding="UTF-8"?>\n\
-<corpus>\n\
-<text>\n\
-<metadata\n\
-    spacyVersion="{spacy_version}"
-    spacyModel="{spacy_model}"
-    spacyModelVersion="{spacy_model_version}"
-    md5HashOfInput="{md5_hash}">\n'''.format(md5_hash=md5_hash,
-                                             spacy_version=spacy.__version__,
-                                             spacy_model=SPACY_MODELS[args.language],
-                                             spacy_model_version=nlp.meta['version'])
+common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
+              + '<corpus>\n'
+              + '<text>\n'
+              + '<nlp name="spaCy"\n'
+              + '     version="{}"\n'.format(spacy.__version__)
+              + '     model="{}"\n'.format(SPACY_MODELS[args.language])
+              + '     model_version="{}"\n'.format(nlp.meta['version'])
+              + '     md5_hash_of_input="{}" />'.format(md5_hash))

 with open(output_file_original_filename, 'w+') as output_file_original, \
     open(output_file_stand_off_filename, 'w+') as output_file_stand_off:

-    output_file_original.write(xml_head)
-    output_file_stand_off.write(xml_head)
+    output_file_original.write(common_xml)
+    output_file_stand_off.write(common_xml)
    text_offset = 0
    for text_chunk in text_chunks:
        doc = nlp(text_chunk)
@@ -91,7 +88,8 @@ with open(output_file_original_filename, 'w+') as output_file_original, \
            output_file_stand_off.write('<s>\n')
            space_flag = False
            # Skip whitespace tokens
-            sent_no_space = [token for token in sent if not token.text.isspace()]
+            sent_no_space = [token for token in sent
+                             if not token.text.isspace()]
            # No space variant for cwb original .vrt file input.
            for token in sent_no_space:
                output_file_original.write('{}'.format(escape(token.text))
@@ -112,5 +110,5 @@ with open(output_file_original_filename, 'w+') as output_file_original, \
            output_file_original.write('</s>\n')
            output_file_stand_off.write('</s>\n')
        text_offset = token_end + 1
-    output_file_original.write('</metadata>\n</text>\n</corpus>')
-    output_file_stand_off.write('</metadata>\n</text>\n</corpus>')
+    output_file_original.write('</text>\n</corpus>')
+    output_file_stand_off.write('</text>\n</corpus>')