Fix problems caused by wrong textwrap.wrap usage

This commit is contained in:
Patrick Jentsch
2021-04-30 09:44:35 +02:00
parent f7b7da2b1f
commit bd5d8ddedb
2 changed files with 63 additions and 33 deletions

View File

@ -27,24 +27,28 @@ args = parser.parse_args()
# If requested: Check the encoding of the text contents from the input file
# Else: Use utf-8
with open(args.input, "rb") as input_file:
with open(args.input, "rb") as text_file:
if args.check_encoding:
encoding = chardet.detect(input_file.read())['encoding']
encoding = chardet.detect(text_file.read())['encoding']
else:
encoding = 'utf-8'
text_md5 = hashlib.md5()
for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
text_md5.update(chunk)
# Load the text contents from the input file
with open(args.input, encoding=encoding) as input_file:
text = input_file.read()
# spaCys NLP is limited to strings with maximum 1 million characters at
with open(args.input, encoding=encoding) as text_file:
# spaCy NLP is limited to strings of at most 1 million characters at
# a time, so we split the text into suitably sized chunks.
text_chunks = textwrap.wrap(text, 1000000, break_long_words=False)
# the text variable potentially occupies a lot of system memory and is no
# longer needed...
del text
text_chunks = textwrap.wrap(
text_file.read(),
1000000,
break_long_words=False,
break_on_hyphens=False,
drop_whitespace=False,
expand_tabs=False,
replace_whitespace=False
)
model = spacy_models[args.language]
nlp = spacy.load(model)
@ -59,6 +63,7 @@ meta = {
}
},
'file': {
'encoding': encoding,
'md5': text_md5.hexdigest(),
'name': os.path.basename(args.input)
}
@ -127,7 +132,8 @@ tags = {
annotations = []
chunk_offset = 0
for text_chunk in text_chunks:
while text_chunks:
text_chunk = text_chunks.pop(0)
doc = nlp(text_chunk)
for token in doc:
if token.is_space:
@ -158,6 +164,7 @@ for text_chunk in text_chunks:
annotation['properties']['ner'] = token.ent_type_
annotations.append(annotation)
chunk_offset += len(text_chunk)
text_chunk = None
with open(args.output, 'w') as output_file:
json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},