Remove the `id` XML attribute from the `<text>` element in the output file

This commit is contained in:
Patrick Jentsch 2020-01-27 15:59:32 +01:00
parent b0a402b3ac
commit 5f20f9be40

View File

@ -40,11 +40,9 @@ with open(args.i) as input_file:
# Create and open the output file # Create and open the output file
output_file = open(args.o, 'w+') output_file = open(args.o, 'w+')
output_file.write( output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n'
'<?xml version="1.0" encoding="UTF-8"?>\n' '<corpus>\n'
'<corpus>\n' '<text>\n')
'<text id="{}">\n'.format(os.path.basename(args.i).rsplit(".", 1)[0])
)
for text in texts: for text in texts:
# Run spacy nlp over the text (partial string if above 1 million chars) # Run spacy nlp over the text (partial string if above 1 million chars)
doc = nlp(text) doc = nlp(text)
@ -66,9 +64,7 @@ for text in texts:
) )
) )
output_file.write('</s>\n') output_file.write('</s>\n')
output_file.write( output_file.write('</text>\n'
'</text>\n' '</corpus>')
'</corpus>'
)
output_file.close() output_file.close()