mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
synced 2025-01-14 00:04:04 +00:00
Update nlp software metadata representation
This commit is contained in:
parent
91708308bc
commit
fe7ab93513
30
spacy-nlp
30
spacy-nlp
@ -66,23 +66,20 @@ nlp = spacy.load(SPACY_MODELS[args.language])
|
|||||||
# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
|
# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
|
||||||
output_file_original_filename = args.o
|
output_file_original_filename = args.o
|
||||||
output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt')
|
output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt')
|
||||||
xml_head = '''<?xml version="1.0" encoding="UTF-8"?>\n\
|
common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
|
||||||
<corpus>\n\
|
+ '<corpus>\n'
|
||||||
<text>\n\
|
+ '<text>\n'
|
||||||
<metadata\n\
|
+ '<nlp name="spaCy"\n'
|
||||||
spacyVersion="{spacy_version}"
|
+ ' version="{}"\n'.format(spacy.__version__)
|
||||||
spacyModel="{spacy_model}"
|
+ ' model="{}"\n'.format(SPACY_MODELS[args.language])
|
||||||
spacyModelVersion="{spacy_model_version}"
|
+ ' model_version="{}"\n'.format(nlp.meta['version'])
|
||||||
md5HashOfInput="{md5_hash}">\n'''.format(md5_hash=md5_hash,
|
+ ' md5_hash_of_input="{}" />'.format(md5_hash))
|
||||||
spacy_version=spacy.__version__,
|
|
||||||
spacy_model=SPACY_MODELS[args.language],
|
|
||||||
spacy_model_version=nlp.meta['version'])
|
|
||||||
|
|
||||||
with open(output_file_original_filename, 'w+') as output_file_original, \
|
with open(output_file_original_filename, 'w+') as output_file_original, \
|
||||||
open(output_file_stand_off_filename, 'w+') as output_file_stand_off:
|
open(output_file_stand_off_filename, 'w+') as output_file_stand_off:
|
||||||
|
|
||||||
output_file_original.write(xml_head)
|
output_file_original.write(common_xml)
|
||||||
output_file_stand_off.write(xml_head)
|
output_file_stand_off.write(common_xml)
|
||||||
text_offset = 0
|
text_offset = 0
|
||||||
for text_chunk in text_chunks:
|
for text_chunk in text_chunks:
|
||||||
doc = nlp(text_chunk)
|
doc = nlp(text_chunk)
|
||||||
@ -91,7 +88,8 @@ with open(output_file_original_filename, 'w+') as output_file_original, \
|
|||||||
output_file_stand_off.write('<s>\n')
|
output_file_stand_off.write('<s>\n')
|
||||||
space_flag = False
|
space_flag = False
|
||||||
# Skip whitespace tokens
|
# Skip whitespace tokens
|
||||||
sent_no_space = [token for token in sent if not token.text.isspace()]
|
sent_no_space = [token for token in sent
|
||||||
|
if not token.text.isspace()]
|
||||||
# No space variant for cwb original .vrt file input.
|
# No space variant for cwb original .vrt file input.
|
||||||
for token in sent_no_space:
|
for token in sent_no_space:
|
||||||
output_file_original.write('{}'.format(escape(token.text))
|
output_file_original.write('{}'.format(escape(token.text))
|
||||||
@ -112,5 +110,5 @@ with open(output_file_original_filename, 'w+') as output_file_original, \
|
|||||||
output_file_original.write('</s>\n')
|
output_file_original.write('</s>\n')
|
||||||
output_file_stand_off.write('</s>\n')
|
output_file_stand_off.write('</s>\n')
|
||||||
text_offset = token_end + 1
|
text_offset = token_end + 1
|
||||||
output_file_original.write('</metadata>\n</text>\n</corpus>')
|
output_file_original.write('</text>\n</corpus>')
|
||||||
output_file_stand_off.write('</metadata>\n</text>\n</corpus>')
|
output_file_stand_off.write('</text>\n</corpus>')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user