Add nlp.max_length option.

This commit is contained in:
Stephan Porada 2019-03-05 15:01:57 +01:00
parent 2a0662bccc
commit e578f5a8ff


@@ -32,6 +32,10 @@ SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm",
# Set the language model for spacy
nlp = spacy.load(SPACY_MODELS[args.lang])
# Set the maximum character length for input documents. According to the spaCy
# documentation, texts longer than 1,000,000 characters (the default limit) can
# cause memory allocation errors. We are testing a limit of 10 million for now.
nlp.max_length = 10000000 # 10 Million character limit
# Read text from the input file
with open(args.input) as input_file:
@@ -56,4 +60,4 @@ for sent in doc.sents:
+ (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
output_file.write('</s>\n')
output_file.write('</text>\n</corpus>')
output_file.close()
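
For context, a minimal sketch of how the raised limit fits into the document-loading step. The explicit length check and the literal file name are illustrative additions and not part of this commit:

# Sketch only: the guard clause and "input.txt" are assumptions for illustration.
import spacy

nlp = spacy.load("de_core_news_sm")
nlp.max_length = 10000000  # raise the default 1,000,000-character limit

with open("input.txt") as input_file:
    text = input_file.read()

# Fail early with a clear message instead of letting spaCy raise an error
# inside nlp(text) when the input still exceeds the raised limit.
if len(text) > nlp.max_length:
    raise ValueError(
        "Input has %d characters, exceeding nlp.max_length (%d)."
        % (len(text), nlp.max_length)
    )

doc = nlp(text)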