Add linewrap function and test.py for fun.

2025-12-15 18:30:49 +00:00 · 2019-03-06 14:17:03 +01:00
parent ff1e0a51c4
commit d582d9771a
2 changed files with 55 additions and 16 deletions
--- a/35
+++ b/35
@@ -5,6 +5,7 @@
 import argparse
 import os
 import spacy
 import textwrap
 parser = argparse.ArgumentParser(description="Tag a .txt file with spaCy and \
@@ -29,31 +30,33 @@ SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm",
                "es": "es_core_news_sm", "fr": "fr_core_news_sm",
                "pt": "pt_core_news_sm"}
 # Set the language model for spacy
 nlp = spacy.load(SPACY_MODELS[args.lang])
-# Read text from the input file
+# Read text from the input file and, if necessary, split it into parts with a
 # length of less than 1 million characters.
 with open(args.input) as input_file:
    text = input_file.read()
    texts = textwrap.wrap(text, 1000000, break_long_words=False)
 # Run spacy nlp over the text
 doc = nlp(text)
 # Create and open the output file
 output_file = open(args.output, "w+")
 output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="' + args.input.rsplit(".", 1)[0] + '">\n')
-for sent in doc.sents:
+for text in texts:
-    output_file.write('<s>\n')
+    # Run spacy nlp over the text (partial string if above 1 million chars)
-    for token in sent:
+    doc = nlp(text)
-        # Skip whitespace tokens like "\n" or "\t"
+    for sent in doc.sents:
-        if token.text.isspace():
+        output_file.write('<s>\n')
-            continue
+        for token in sent:
-        # Write all information in .vrt style to the output file
+            # Skip whitespace tokens like "\n" or "\t"
-        # text, lemma, simple_pos, pos, ner
+            if token.text.isspace():
-        output_file.write(token.text + "\t" + token.lemma_ + "\t"
+                continue
-                          + token.pos_ + "\t" + token.tag_ + "\t"
+            # Write all information in .vrt style to the output file
-                          + (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
+            # text, lemma, simple_pos, pos, ner
-    output_file.write('</s>\n')
+            output_file.write(token.text + "\t" + token.lemma_ + "\t"
                              + token.pos_ + "\t" + token.tag_ + "\t"
                              + (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
        output_file.write('</s>\n')
 output_file.write('</text>\n</corpus>')
 output_file.close()
--- a/test.py
+++ b/test.py
@@ -0,0 +1,36 @@
 import textwrap
 def limit_text(text, character_limit):
     """
     Split text into consecutive parts of at most character_limit characters.
     A text that already fits is returned unchanged as a one-element list.
     Otherwise each cut is placed at the first space found within the last ten
     characters before the limit. If that window holds no space, the last
     earlier space is used, and a stretch without any usable space is cut
     exactly at the limit. The space at a cut stays at the start of the
     following part, so "".join(result) always equals text.
     Returns a list of strings, each at most character_limit characters long.
     Raises ValueError if text needs splitting but character_limit is below 1.
     """
     if len(text) <= character_limit:
         return [text]
     if character_limit < 1:
         raise ValueError("character_limit must be at least 1")
     # Never cut at index 0: that would emit an empty part and make no progress.
     window_start = max(1, character_limit - 10)
     str_list = []
     remainder = text
     # Iterate instead of recursing so that many parts cannot exhaust the stack.
     while len(remainder) > character_limit:
         cut_off = remainder.find(" ", window_start, character_limit)
         if cut_off == -1:
             # No space close to the limit: fall back to the last earlier one.
             cut_off = remainder.rfind(" ", 1, window_start)
         if cut_off == -1:
             # No space at all: hard cut so the limit is still respected.
             cut_off = character_limit
         str_list.append(remainder[:cut_off])
         remainder = remainder[cut_off:]
     # The rest fits, including a rest of exactly character_limit characters.
     str_list.append(remainder)
     return str_list
 def main():
     """Split one sample sentence with limit_text and textwrap.wrap and print both results."""
     sample = "If true, TextWrapper attempts to detect sentence endings and ensure that sentences are always separated by exactly two spaces. This is generally desired for text in a monospaced font. However, the sentence detection algorithm is imperfect:"
     width = 50
     # Hand-written splitter first, then the standard-library one, same width.
     own_parts = limit_text(sample, width)
     lib_lines = textwrap.wrap(sample, width, break_long_words=False)
     print("Own version:", own_parts)
     print("Lib:", lib_lines)
 # Run the comparison only when this file is executed as a script.
 if __name__ == '__main__':
     main()