Add linewrap function and test.py for fun.

Stephan Porada 2019-03-06 14:17:03 +01:00
parent ff1e0a51c4
commit d582d9771a
2 changed files with 55 additions and 16 deletions


@@ -5,6 +5,7 @@
 import argparse
 import os
 import spacy
+import textwrap
 
 parser = argparse.ArgumentParser(description="Tag a .txt file with spaCy and \
@@ -29,31 +30,33 @@ SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm",
"es": "es_core_news_sm", "fr": "fr_core_news_sm",
"pt": "pt_core_news_sm"}
# Set the language model for spacy
nlp = spacy.load(SPACY_MODELS[args.lang])
# Read text from the input file
# Read text from the input file and if neccessary split it into parts with a
# length of less than 1 million characters.
with open(args.input) as input_file:
text = input_file.read()
texts = textwrap.wrap(text, 1000000, break_long_words=False)
# Run spacy nlp over the text
doc = nlp(text)
# Create and open the output file
output_file = open(args.output, "w+")
output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="' + args.input.rsplit(".", 1)[0] + '">\n')
for sent in doc.sents:
output_file.write('<s>\n')
for token in sent:
# Skip whitespace tokens like "\n" or "\t"
if token.text.isspace():
continue
# Write all information in .vrt style to the output file
# text, lemma, simple_pos, pos, ner
output_file.write(token.text + "\t" + token.lemma_ + "\t"
+ token.pos_ + "\t" + token.tag_ + "\t"
+ (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
output_file.write('</s>\n')
for text in texts:
# Run spacy nlp over the text (partial string if above 1 million chars)
doc = nlp(text)
for sent in doc.sents:
output_file.write('<s>\n')
for token in sent:
# Skip whitespace tokens like "\n" or "\t"
if token.text.isspace():
continue
# Write all information in .vrt style to the output file
# text, lemma, simple_pos, pos, ner
output_file.write(token.text + "\t" + token.lemma_ + "\t"
+ token.pos_ + "\t" + token.tag_ + "\t"
+ (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
output_file.write('</s>\n')
output_file.write('</text>\n</corpus>')
output_file.close()
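
For context (not part of the commit): spaCy rejects inputs longer than nlp.max_length, which defaults to 1,000,000 characters, hence the pre-splitting with textwrap.wrap above. Below is a minimal sketch of that chunking, using a hypothetical tiny LIMIT so the effect is visible; note the caveat that wrap() re-flows whitespace, so the chunks are not verbatim slices of the input.

import textwrap

# Hypothetical tiny limit so the chunking is visible; the commit uses
# 1000000, which matches spaCy's default nlp.max_length.
LIMIT = 40

text = "First sentence here.\nSecond sentence there. Third one follows."
chunks = textwrap.wrap(text, LIMIT, break_long_words=False)
print(chunks)
# Expected: ['First sentence here. Second sentence', 'there. Third one follows.']
# No chunk exceeds LIMIT and no word is cut, but wrap() replaced the "\n"
# with a space: the chunks are a re-flowed copy of the text, not exact slices.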

test.py (new file, 36 additions)

@@ -0,0 +1,36 @@
+import textwrap
+
+
+def limit_text(text, character_limit):
+    """
+    This function checks if a string is below the given character limit
+    (e.g. 1000000 = 1 million characters). If it is below that limit, the
+    text will be processed as is. If it is above the limit, the text will
+    be split into parts below the limit, each cut at a space close to the
+    limit. Returns a list of strings, each below the character limit.
+    """
+    str_list = []
+    if len(text) > character_limit:
+        # Find a space within the 10 characters before the limit; raises
+        # ValueError if there is none in that window
+        cut_off = text.index(" ", character_limit - 10, character_limit)
+        tmp_strings = [text[:cut_off]]
+        tmp_strings.append(text[cut_off:])
+        for string in tmp_strings:
+            if len(string) < character_limit:
+                str_list.append(string)
+            else:
+                # The remainder may still be too long: split it recursively
+                for part in limit_text(string, character_limit):
+                    str_list.append(part)
+    else:
+        str_list.append(text)
+    return str_list
+
+
+def main():
+    text = "If true, TextWrapper attempts to detect sentence endings and ensure that sentences are always separated by exactly two spaces. This is generally desired for text in a monospaced font. However, the sentence detection algorithm is imperfect:"
+    texts = limit_text(text, 50)
+    lines = textwrap.wrap(text, 50, break_long_words=False)
+    print("Own version:", texts)
+    print("Lib:", lines)
+
+
+if __name__ == '__main__':
+    main()
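
A short usage note (a sketch, assuming test.py above is on the import path): the two approaches differ in how they treat whitespace. limit_text() returns verbatim slices of the input, so joining the parts reproduces it exactly, while textwrap.wrap() drops the whitespace at each break point.

import textwrap

from test import limit_text  # hypothetical import of the file above

text = "spaCy is a library for advanced Natural Language Processing"
parts = limit_text(text, 30)
print(parts)
# Expected: ['spaCy is a library for', ' advanced Natural Language', ' Processing']
# Every part after the first keeps the space at its cut point, so
# "".join(parts) == text holds exactly.
print(textwrap.wrap(text, 30, break_long_words=False))
# Expected: ['spaCy is a library for', 'advanced Natural Language', 'Processing']
# Same cut points, but wrap() drops the whitespace at each break.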