diff --git a/spacy_nlp b/spacy_nlp index e5d086d..0844ad1 100755 --- a/spacy_nlp +++ b/spacy_nlp @@ -1,6 +1,7 @@ #!/usr/bin/env python3.5 # coding=utf-8 +from xml.sax.saxutils import escape import argparse import os import spacy @@ -56,8 +57,8 @@ for text in texts: # text, lemma, simple_pos, pos, ner output_file.write( '{}\t{}\t{}\t{}\t{}\n'.format( - token.text, - token.lemma_, + escape(token.text), + escape(token.lemma_), token.pos_, token.tag_, token.ent_type_ if token.ent_type_ != '' else 'NULL'