Fix problems caused by wrong textwrap.wrap usage

2026-08-02 04:33:33 +00:00 · 2021-04-30 09:44:35 +02:00
parent f7b7da2b1f
commit bd5d8ddedb
2 changed files with 63 additions and 33 deletions
@@ -27,24 +27,28 @@ args = parser.parse_args()

 # If requested: Check the encoding of the text contents from the input file
 # Else: Use utf-8
-with open(args.input, "rb") as input_file:
+with open(args.input, "rb") as text_file:
    if args.check_encoding:
-        encoding = chardet.detect(input_file.read())['encoding']
+        encoding = chardet.detect(text_file.read())['encoding']
    else:
        encoding = 'utf-8'
    text_md5 = hashlib.md5()
-    for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
+    for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
        text_md5.update(chunk)

 # Load the text contents from the input file
-with open(args.input, encoding=encoding) as input_file:
-    text = input_file.read()
-    # spaCys NLP is limited to strings with maximum 1 million characters at
+with open(args.input, encoding=encoding) as text_file:
+    # spaCy NLP is limited to strings with maximum 1 million characters at
    # once. So we split it into suitable chunks.
-    text_chunks = textwrap.wrap(text, 1000000, break_long_words=False)
-    # the text variable potentially occupies a lot of system memory and is no
-    # longer needed...
-    del text
+    text_chunks = textwrap.wrap(
+        text_file.read(),
+        1000000,
+        break_long_words=False,
+        break_on_hyphens=False,
+        drop_whitespace=False,
+        expand_tabs=False,
+        replace_whitespace=False
+    )

 model = spacy_models[args.language]
 nlp = spacy.load(model)
@@ -59,6 +63,7 @@ meta = {
        }
    },
    'file': {
+        'encoding': encoding,
        'md5': text_md5.hexdigest(),
        'name': os.path.basename(args.input)
    }
@@ -127,7 +132,8 @@ tags = {
 annotations = []

 chunk_offset = 0
-for text_chunk in text_chunks:
+while text_chunks:
+    text_chunk = text_chunks.pop(0)
    doc = nlp(text_chunk)
    for token in doc:
        if token.is_space:
@@ -158,6 +164,7 @@ for text_chunk in text_chunks:
            annotation['properties']['ner'] = token.ent_type_
        annotations.append(annotation)
    chunk_offset += len(text_chunk)
+    text_chunk = None

 with open(args.output, 'w') as output_file:
    json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},
@@ -3,19 +3,13 @@

 from argparse import ArgumentParser
 from xml.sax.saxutils import escape
+import hashlib
 import json

-# Parse the given arguments
-parser = ArgumentParser(description='Create annotations for a given txt file')
-parser.add_argument('input', metavar='Path to txt input file')
-parser.add_argument('annotations', metavar='Path to JSON annotation file')
-parser.add_argument('output', metavar='Path to vrt output file')
-args = parser.parse_args()

-with open(args.input) as text_file, \
-     open(args.annotations) as data_file:
-    text = text_file.read()
-    stand_off_data = json.load(data_file)
+# Two global resources - Not very elegant but it works for now
+stand_off_data = None
+text = None


 def meta_to_string():
@@ -26,7 +20,8 @@ def meta_to_string():
        stand_off_data['meta']['generator']['arguments']['check_encoding'],
        stand_off_data['meta']['generator']['arguments']['language']
    )
-    string += '<file name="{}" md5="{}"/>\n'.format(
+    string += '<file encoding="{}" name="{}" md5="{}"/>\n'.format(
+        stand_off_data['meta']['file']['encoding'],
        stand_off_data['meta']['file']['name'],
        stand_off_data['meta']['file']['md5']
    )
@@ -93,6 +88,30 @@ def annotations_to_string(end=float('inf')):
    return string


+def main():
+    global stand_off_data
+    global text
+
+    # Parse the given arguments
+    parser = ArgumentParser(description='Create a vrt from JSON and txt')
+    parser.add_argument('text', metavar='Path to txt file')
+    parser.add_argument('stand_off_data', metavar='Path to JSON file')
+    parser.add_argument('output', metavar='Path to vrt output file')
+    args = parser.parse_args()
+
+    with open(args.stand_off_data) as stand_of_data_file:
+        stand_off_data = json.load(stand_of_data_file)
+
+    with open(args.text, "rb") as text_file:
+        text_md5 = hashlib.md5()
+        for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa
+            text_md5.update(chunk)
+        if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']:
+            raise Exception('md5 not equal')
+
+    with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file:  # noqa
+        text = text_file.read()
+
    vrt = ''
    vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
    vrt += '<corpus>\n'
@@ -105,3 +124,7 @@ vrt += '</corpus>'

    with open(args.output, 'w') as vrt_file:
        vrt_file.write(vrt)
+
+
+if __name__ == '__main__':
+    main()