Fix problems caused by wrong textwrap.wrap usage

2025-07-01 19:20:33 +00:00 · 2021-04-30 09:44:35 +02:00
parent f7b7da2b1f
commit bd5d8ddedb
2 changed files with 63 additions and 33 deletions
--- a/29
+++ b/29
@ -27,24 +27,28 @@ args = parser.parse_args()
 # If requested: Check the encoding of the text contents from the input file
 # Else: Use utf-8
-with open(args.input, "rb") as input_file:
+with open(args.input, "rb") as text_file:
    if args.check_encoding:
-        encoding = chardet.detect(input_file.read())['encoding']
+        encoding = chardet.detect(text_file.read())['encoding']
    else:
        encoding = 'utf-8'
    text_md5 = hashlib.md5()
-    for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
+    for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
        text_md5.update(chunk)
 # Load the text contents from the input file
-with open(args.input, encoding=encoding) as input_file:
+with open(args.input, encoding=encoding) as text_file:
-    text = input_file.read()
+    # spaCy NLP is limited to strings with maximum 1 million characters at
    # spaCy's NLP is limited to strings with a maximum of 1 million characters at
    # once. So we split it into suitable chunks.
-    text_chunks = textwrap.wrap(text, 1000000, break_long_words=False)
+    text_chunks = textwrap.wrap(
-    # the text variable potentially occupies a lot of system memory and is no
+        text_file.read(),
-    # longer needed...
+        1000000,
-    del text
+        break_long_words=False,
        break_on_hyphens=False,
        drop_whitespace=False,
        expand_tabs=False,
        replace_whitespace=False
    )
 model = spacy_models[args.language]
 nlp = spacy.load(model)
@ -59,6 +63,7 @@ meta = {
        }
    },
    'file': {
        'encoding': encoding,
        'md5': text_md5.hexdigest(),
        'name': os.path.basename(args.input)
    }
@ -127,7 +132,8 @@ tags = {
 annotations = []
 chunk_offset = 0
-for text_chunk in text_chunks:
+while text_chunks:
    text_chunk = text_chunks.pop(0)
    doc = nlp(text_chunk)
    for token in doc:
        if token.is_space:
@ -158,6 +164,7 @@ for text_chunk in text_chunks:
            annotation['properties']['ner'] = token.ent_type_
        annotations.append(annotation)
    chunk_offset += len(text_chunk)
    text_chunk = None
 with open(args.output, 'w') as output_file:
    json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},
--- a/45
+++ b/45
@ -3,19 +3,13 @@
 from argparse import ArgumentParser
 from xml.sax.saxutils import escape
 import hashlib
 import json
 # Parse the given arguments
 parser = ArgumentParser(description='Create annotations for a given txt file')
 parser.add_argument('input', metavar='Path to txt input file')
 parser.add_argument('annotations', metavar='Path to JSON annotation file')
 parser.add_argument('output', metavar='Path to vrt output file')
 args = parser.parse_args()
-with open(args.input) as text_file, \
+# Two global resources - not very elegant, but it works for now
-     open(args.annotations) as data_file:
+stand_off_data = None
-    text = text_file.read()
+text = None
    stand_off_data = json.load(data_file)
 def meta_to_string():
@ -26,7 +20,8 @@ def meta_to_string():
        stand_off_data['meta']['generator']['arguments']['check_encoding'],
        stand_off_data['meta']['generator']['arguments']['language']
    )
-    string += '<file name="{}" md5="{}"/>\n'.format(
+    string += '<file encoding="{}" name="{}" md5="{}"/>\n'.format(
        stand_off_data['meta']['file']['encoding'],
        stand_off_data['meta']['file']['name'],
        stand_off_data['meta']['file']['md5']
    )
@ -93,6 +88,30 @@ def annotations_to_string(end=float('inf')):
    return string
 def main():
    global stand_off_data
    global text
    # Parse the given arguments
    parser = ArgumentParser(description='Create a vrt from JSON and txt')
    parser.add_argument('text', metavar='Path to txt file')
    parser.add_argument('stand_off_data', metavar='Path to JSON file')
    parser.add_argument('output', metavar='Path to vrt output file')
    args = parser.parse_args()
    with open(args.stand_off_data) as stand_of_data_file:
        stand_off_data = json.load(stand_of_data_file)
    with open(args.text, "rb") as text_file:
        text_md5 = hashlib.md5()
        for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa
            text_md5.update(chunk)
        if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']:
            raise Exception('md5 not equal')
    with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file:  # noqa
        text = text_file.read()
    vrt = ''
    vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
    vrt += '<corpus>\n'
@ -105,3 +124,7 @@ vrt += '</corpus>'
    with open(args.output, 'w') as vrt_file:
        vrt_file.write(vrt)
 if __name__ == '__main__':
    main()