Remove the `id` XML attribute from the `<text>` element in the output file

This commit is contained in:
Patrick Jentsch 2020-01-27 15:59:32 +01:00
parent b0a402b3ac
commit 5f20f9be40

View File

@ -40,11 +40,9 @@ with open(args.i) as input_file:
# Create and open the output file
output_file = open(args.o, 'w+')
output_file.write(
'<?xml version="1.0" encoding="UTF-8"?>\n'
output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n'
'<corpus>\n'
'<text id="{}">\n'.format(os.path.basename(args.i).rsplit(".", 1)[0])
)
'<text>\n')
for text in texts:
# Run spacy nlp over the text (partial string if above 1 million chars)
doc = nlp(text)
@ -66,9 +64,7 @@ for text in texts:
)
)
output_file.write('</s>\n')
output_file.write(
'</text>\n'
'</corpus>'
)
output_file.write('</text>\n'
'</corpus>')
output_file.close()