nlp/test.py

37 lines
1.4 KiB
Python
Raw Normal View History

import textwrap
def limit_text(text, character_limit):
    """
    Split ``text`` into consecutive parts of at most ``character_limit``
    characters.

    Text that already fits is returned unchanged as a single-element list.
    Longer text is cut at the last space that still keeps the part within
    the limit, so parts are as long as possible. When a stretch contains no
    usable space, it is cut hard at the limit. The space at a cut position
    starts the following part, so ``"".join(result) == text`` always holds.

    Args:
        text: The string to split.
        character_limit: Maximum number of characters per part.

    Returns:
        A list of strings, each at most ``character_limit`` characters long.

    Raises:
        ValueError: If ``text`` has to be split but ``character_limit`` is
            smaller than 1.
    """
    total = len(text)
    if total <= character_limit:
        return [text]
    if character_limit < 1:
        raise ValueError("character_limit must be at least 1")

    parts = []
    start = 0
    # Iterate instead of recursing so that many small parts cannot exhaust
    # the interpreter's recursion limit.
    while total - start > character_limit:
        hard_cut = start + character_limit
        # Search from start + 1 so a part that begins with the space left
        # over from the previous cut still makes progress. hard_cut itself
        # is included because cutting there yields a part of exactly the
        # limit.
        cut = text.rfind(" ", start + 1, hard_cut + 1)
        if cut == -1:
            # No space available in this window: fall back to a hard cut.
            cut = hard_cut
        parts.append(text[start:cut])
        start = cut
    # The remainder fits within the limit (and is never empty here).
    parts.append(text[start:])
    return parts
def main():
    """Print the sample text split by limit_text and by textwrap.wrap."""
    width = 50
    sample = (
        "If true, TextWrapper attempts to detect sentence endings and "
        "ensure that sentences are always separated by exactly two spaces. "
        "This is generally desired for text in a monospaced font. "
        "However, the sentence detection algorithm is imperfect:"
    )
    own_parts = limit_text(sample, width)
    wrapped = textwrap.wrap(sample, width, break_long_words=False)
    print("Own version:", own_parts)
    print("Lib:", wrapped)
# Run the comparison demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()