From 887e814020297c228a987e893e5941c24fa82d4a Mon Sep 17 00:00:00 2001 From: Stephan Porada Date: Wed, 20 May 2020 15:01:52 +0200 Subject: [PATCH] Fix --- spacy-nlp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/spacy-nlp b/spacy-nlp index 245d8b6..d451dae 100755 --- a/spacy-nlp +++ b/spacy-nlp @@ -52,7 +52,7 @@ with open(args.i, encoding=encoding) as input_file: text = input_file.read() # spaCys NLP is limited to strings with maximum 1 million characters at # once. So we split it into suitable chunks. - text_chunks = textwrap.wrap(text, 1000, break_long_words=False) + text_chunks = textwrap.wrap(text, 1000000, break_long_words=False) # the text variable potentially occupies a lot of system memory and is no # longer needed... del text @@ -66,7 +66,6 @@ nlp = spacy.load(SPACY_MODELS[args.language]) # See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html output_file_original_filename = args.o output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt') -output_file_tokens_filename = args.o.replace('.vrt', '.tokens.txt') xml_head = '''\n\ \n\ \n\ @@ -78,12 +77,10 @@ xml_head = '''\n\ spacy_model=SPACY_MODELS[args.language]) with open(output_file_original_filename, 'w+') as output_file_original, \ - open(output_file_stand_off_filename, 'w+') as output_file_stand_off, \ - open(output_file_tokens_filename, 'w+') as output_file_tokens: + open(output_file_stand_off_filename, 'w+') as output_file_stand_off: output_file_original.write(xml_head) output_file_stand_off.write(xml_head) - output_file_tokens.write(xml_head) text_offset = 0 for text_chunk in text_chunks: doc = nlp(text_chunk) @@ -110,10 +107,8 @@ with open(output_file_original_filename, 'w+') as output_file_original, \ + '\t{}'.format(token.pos_) + '\t{}'.format(token.tag_) + '\t{}\n'.format(token.ent_type_ or 'NULL')) - output_file_tokens.write('{}\n'.format(escape(token.text))) output_file_original.write('\n') output_file_stand_off.write('\n') text_offset = token_end + 1 output_file_original.write('\n\n') output_file_stand_off.write('\n\n') - output_file_tokens.write('\n\n')