diff --git a/spacy-nlp b/spacy-nlp index 8b7a39c..6849a2e 100755 --- a/spacy-nlp +++ b/spacy-nlp @@ -66,23 +66,20 @@ nlp = spacy.load(SPACY_MODELS[args.language]) # See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html output_file_original_filename = args.o output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt') -xml_head = '''\n\ -\n\ -\n\ -\n'''.format(md5_hash=md5_hash, - spacy_version=spacy.__version__, - spacy_model=SPACY_MODELS[args.language], - spacy_model_version=nlp.meta['version']) +common_xml = ('\n' + + '\n' + + '\n' + + ''.format(md5_hash)) with open(output_file_original_filename, 'w+') as output_file_original, \ open(output_file_stand_off_filename, 'w+') as output_file_stand_off: - output_file_original.write(xml_head) - output_file_stand_off.write(xml_head) + output_file_original.write(common_xml) + output_file_stand_off.write(common_xml) text_offset = 0 for text_chunk in text_chunks: doc = nlp(text_chunk) @@ -91,7 +88,8 @@ with open(output_file_original_filename, 'w+') as output_file_original, \ output_file_stand_off.write('\n') space_flag = False # Skip whitespace tokens - sent_no_space = [token for token in sent if not token.text.isspace()] + sent_no_space = [token for token in sent + if not token.text.isspace()] # No space variant for cwb original .vrt file input. for token in sent_no_space: output_file_original.write('{}'.format(escape(token.text)) @@ -112,5 +110,5 @@ with open(output_file_original_filename, 'w+') as output_file_original, \ output_file_original.write('\n') output_file_stand_off.write('\n') text_offset = token_end + 1 - output_file_original.write('\n\n') - output_file_stand_off.write('\n\n') + output_file_original.write('\n') + output_file_stand_off.write('\n')