From bd5d8ddedbf51a51e1a91aa7974f3b97f29864d8 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Fri, 30 Apr 2021 09:44:35 +0200
Subject: [PATCH] Fix problems caused by wrong textwrap.wrap usage

---
 spacy-nlp   | 29 ++++++++++++++---------
 vrt-creator | 67 +++++++++++++++++++++++++++++++++++------------------
 2 files changed, 63 insertions(+), 33 deletions(-)

diff --git a/spacy-nlp b/spacy-nlp
index 1950a6d..af114e6 100755
--- a/spacy-nlp
+++ b/spacy-nlp
@@ -27,24 +27,28 @@ args = parser.parse_args()
 # If requested: Check the encoding of the text contents from the input file
 # Else: Use utf-8
-with open(args.input, "rb") as input_file:
+with open(args.input, "rb") as text_file:
     if args.check_encoding:
-        encoding = chardet.detect(input_file.read())['encoding']
+        encoding = chardet.detect(text_file.read())['encoding']
     else:
         encoding = 'utf-8'
 
     text_md5 = hashlib.md5()
-    for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
+    for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
         text_md5.update(chunk)
 
 # Load the text contents from the input file
-with open(args.input, encoding=encoding) as input_file:
-    text = input_file.read()
-    # spaCys NLP is limited to strings with maximum 1 million characters at
+with open(args.input, encoding=encoding) as text_file:
+    # spaCy NLP is limited to strings with maximum 1 million characters at
     # once. So we split it into suitable chunks.
-    text_chunks = textwrap.wrap(text, 1000000, break_long_words=False)
-    # the text variable potentially occupies a lot of system memory and is no
-    # longer needed...
-    del text
+    text_chunks = textwrap.wrap(
+        text_file.read(),
+        1000000,
+        break_long_words=False,
+        break_on_hyphens=False,
+        drop_whitespace=False,
+        expand_tabs=False,
+        replace_whitespace=False
+    )
 
 model = spacy_models[args.language]
 nlp = spacy.load(model)
@@ -59,6 +63,7 @@ meta = {
         }
     },
     'file': {
+        'encoding': encoding,
         'md5': text_md5.hexdigest(),
         'name': os.path.basename(args.input)
     }
@@ -127,7 +132,8 @@ tags = {
 annotations = []
 
 chunk_offset = 0
-for text_chunk in text_chunks:
+while text_chunks:
+    text_chunk = text_chunks.pop(0)
     doc = nlp(text_chunk)
     for token in doc:
         if token.is_space:
@@ -158,6 +164,7 @@ for text_chunk in text_chunks:
             annotation['properties']['ner'] = token.ent_type_
         annotations.append(annotation)
     chunk_offset += len(text_chunk)
+    text_chunk = None
 
 with open(args.output, 'w') as output_file:
     json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},
diff --git a/vrt-creator b/vrt-creator
index 48902f1..88ab455 100755
--- a/vrt-creator
+++ b/vrt-creator
@@ -3,19 +3,13 @@
 from argparse import ArgumentParser
 from xml.sax.saxutils import escape
+import hashlib
 import json
 
-# Parse the given arguments
-parser = ArgumentParser(description='Create annotations for a given txt file')
-parser.add_argument('input', metavar='Path to txt input file')
-parser.add_argument('annotations', metavar='Path to JSON annotation file')
-parser.add_argument('output', metavar='Path to vrt output file')
-args = parser.parse_args()
-
-with open(args.input) as text_file, \
-        open(args.annotations) as data_file:
-    text = text_file.read()
-    stand_off_data = json.load(data_file)
+# Two global resources - Not very elegant but it works for now
+stand_off_data = None
+text = None
 
 
 def meta_to_string():
@@ -26,7 +20,8 @@ def meta_to_string():
         stand_off_data['meta']['generator']['arguments']['check_encoding'],
         stand_off_data['meta']['generator']['arguments']['language']
    )
-    string += '<file name="{}" md5="{}"/>\n'.format(
+    string += '<file encoding="{}" name="{}" md5="{}"/>\n'.format(
+        stand_off_data['meta']['file']['encoding'],
         stand_off_data['meta']['file']['name'],
         stand_off_data['meta']['file']['md5']
     )
@@ -93,15 +88,43 @@ def annotations_to_string(end=float('inf')):
     return string
 
 
-vrt = ''
-vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
-vrt += '<corpus>\n'
-vrt += '<text>\n'
-vrt += meta_to_string()
-vrt += tags_to_string()
-vrt += annotations_to_string()
-vrt += '</text>\n'
-vrt += '</corpus>'
+def main():
+    global stand_off_data
+    global text
 
-with open(args.output, 'w') as vrt_file:
-    vrt_file.write(vrt)
+    # Parse the given arguments
+    parser = ArgumentParser(description='Create a vrt from JSON and txt')
+    parser.add_argument('text', metavar='Path to txt file')
+    parser.add_argument('stand_off_data', metavar='Path to JSON file')
+    parser.add_argument('output', metavar='Path to vrt output file')
+    args = parser.parse_args()
+
+    with open(args.stand_off_data) as stand_of_data_file:
+        stand_off_data = json.load(stand_of_data_file)
+
+    with open(args.text, "rb") as text_file:
+        text_md5 = hashlib.md5()
+        for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa
+            text_md5.update(chunk)
+    if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']:
+        raise Exception('md5 not equal')
+
+    with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file:  # noqa
+        text = text_file.read()
+
+    vrt = ''
+    vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
+    vrt += '<corpus>\n'
+    vrt += '<text>\n'
+    vrt += meta_to_string()
+    vrt += tags_to_string()
+    vrt += annotations_to_string()
+    vrt += '</text>\n'
+    vrt += '</corpus>'
+
+    with open(args.output, 'w') as vrt_file:
+        vrt_file.write(vrt)
+
+
+if __name__ == '__main__':
+    main()
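
Why the added keyword arguments matter: by default textwrap.wrap() does not just split a string, it normalizes it. expand_tabs and replace_whitespace turn tabs and newlines into spaces, drop_whitespace discards whitespace at line boundaries, and break_on_hyphens allows splits inside hyphenated words. The old call therefore produced chunks that no longer concatenated back to the original text, so every character offset derived from chunk_offset was shifted. A minimal sketch of the difference (illustrative only, not part of the commit; the sample string is made up):

import textwrap

sample = 'one\ttwo  three\nfour five-six ' * 3

# Old call: the default normalization mutates the text, so the chunks
# do not reassemble into the input and all offsets drift.
old_chunks = textwrap.wrap(sample, 10, break_long_words=False)
print(''.join(old_chunks) == sample)  # False

# Fixed call: with every normalization step disabled, the chunks are an
# exact partition of the input, so per-chunk offsets stay valid.
new_chunks = textwrap.wrap(
    sample, 10,
    break_long_words=False,
    break_on_hyphens=False,
    drop_whitespace=False,
    expand_tabs=False,
    replace_whitespace=False
)
print(''.join(new_chunks) == sample)  # True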
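
The md5 check added to vrt-creator mirrors the hashing loop in spacy-nlp: both stream the file in fixed-size blocks, so even very large inputs never have to be held in memory just to be hashed, and a JSON file that does not belong to the given txt file is rejected instead of silently producing misaligned VRT. The shared pattern, factored into a helper purely for illustration (the helper name is mine, not the patch's):

import hashlib

def file_md5(path):
    # Read in multiples of the digest block size; 128 * 64 = 8 KiB per read.
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(128 * md5.block_size), b''):
            md5.update(chunk)
    return md5.hexdigest()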
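
The chunk_offset bookkeeping in spacy-nlp is what the lossless chunking protects: a token's position is only known relative to its chunk, and adding the combined length of all earlier chunks turns it into an offset into the original file. Schematically (simplified; tokenize below is a hypothetical stand-in for the spaCy pass, and the real annotation dicts carry more fields):

import re

def tokenize(chunk):
    # Hypothetical stand-in for the spaCy pass: yields (start, end, text).
    for match in re.finditer(r'\S+', chunk):
        yield match.start(), match.end(), match.group()

text_chunks = ['Lorem ipsum ', 'dolor sit amet']
annotations = []
chunk_offset = 0
for text_chunk in text_chunks:
    for start, end, token in tokenize(text_chunk):
        annotations.append({
            'start': chunk_offset + start,  # global offset into the full text
            'end': chunk_offset + end,
            'token': token
        })
    chunk_offset += len(text_chunk)  # correct only because no characters were dropped

assert annotations[2]['start'] == 12  # 'dolor' starts right after the first chunk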