From bd5d8ddedbf51a51e1a91aa7974f3b97f29864d8 Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Fri, 30 Apr 2021 09:44:35 +0200
Subject: [PATCH] Fix problems caused by wrong textwrap.wrap usage

---
 spacy-nlp   | 29 ++++++++++++++---------
 vrt-creator | 67 +++++++++++++++++++++++++++++++++++------------------
 2 files changed, 63 insertions(+), 33 deletions(-)

diff --git a/spacy-nlp b/spacy-nlp
index 1950a6d..af114e6 100755
--- a/spacy-nlp
+++ b/spacy-nlp
@@ -27,24 +27,28 @@ args = parser.parse_args()
 # If requested: Check the encoding of the text contents from the input file
 # Else: Use utf-8
-with open(args.input, "rb") as input_file:
+with open(args.input, "rb") as text_file:
     if args.check_encoding:
-        encoding = chardet.detect(input_file.read())['encoding']
+        encoding = chardet.detect(text_file.read())['encoding']
     else:
         encoding = 'utf-8'
 
     text_md5 = hashlib.md5()
-    for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
+    for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
         text_md5.update(chunk)
 
 # Load the text contents from the input file
-with open(args.input, encoding=encoding) as input_file:
-    text = input_file.read()
-    # spaCys NLP is limited to strings with maximum 1 million characters at
+with open(args.input, encoding=encoding) as text_file:
+    # spaCy NLP is limited to strings with maximum 1 million characters at
     # once. So we split it into suitable chunks.
-    text_chunks = textwrap.wrap(text, 1000000, break_long_words=False)
-    # the text variable potentially occupies a lot of system memory and is no
-    # longer needed...
-    del text
+    text_chunks = textwrap.wrap(
+        text_file.read(),
+        1000000,
+        break_long_words=False,
+        break_on_hyphens=False,
+        drop_whitespace=False,
+        expand_tabs=False,
+        replace_whitespace=False
+    )
 
 model = spacy_models[args.language]
 nlp = spacy.load(model)
@@ -59,6 +63,7 @@ meta = {
         }
     },
     'file': {
+        'encoding': encoding,
         'md5': text_md5.hexdigest(),
         'name': os.path.basename(args.input)
     }
@@ -127,7 +132,8 @@ tags = {
 annotations = []
 
 chunk_offset = 0
-for text_chunk in text_chunks:
+while text_chunks:
+    text_chunk = text_chunks.pop(0)
     doc = nlp(text_chunk)
     for token in doc:
         if token.is_space:
@@ -158,6 +164,7 @@ for text_chunk in text_chunks:
             annotation['properties']['ner'] = token.ent_type_
         annotations.append(annotation)
     chunk_offset += len(text_chunk)
+    text_chunk = None
 
 with open(args.output, 'w') as output_file:
     json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},
diff --git a/vrt-creator b/vrt-creator
index 48902f1..88ab455 100755
--- a/vrt-creator
+++ b/vrt-creator
@@ -3,19 +3,13 @@
 from argparse import ArgumentParser
 from xml.sax.saxutils import escape
+import hashlib
 import json
 
-# Parse the given arguments
-parser = ArgumentParser(description='Create annotations for a given txt file')
-parser.add_argument('input', metavar='Path to txt input file')
-parser.add_argument('annotations', metavar='Path to JSON annotation file')
-parser.add_argument('output', metavar='Path to vrt output file')
-args = parser.parse_args()
-
-with open(args.input) as text_file, \
-        open(args.annotations) as data_file:
-    text = text_file.read()
-    stand_off_data = json.load(data_file)
+# Two global resources - Not very elegant but it works for now
+stand_off_data = None
+text = None
 
 
 def meta_to_string():
@@ -26,7 +20,8 @@ def meta_to_string():
         stand_off_data['meta']['generator']['arguments']['check_encoding'],
         stand_off_data['meta']['generator']['arguments']['language']
    )
-    string += '<file name="{}" md5="{}"/>\n'.format(
+    string += '<file encoding="{}" name="{}" md5="{}"/>\n'.format(
+        stand_off_data['meta']['file']['encoding'],
         stand_off_data['meta']['file']['name'],
         stand_off_data['meta']['file']['md5']
     )
@@ -93,15 +88,43 @@ def annotations_to_string(end=float('inf')):
     return string
 
 
-vrt = ''
-vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
-vrt += '<corpus>\n'
-vrt += '<text>\n'
-vrt += meta_to_string()
-vrt += tags_to_string()
-vrt += annotations_to_string()
-vrt += '</text>\n'
-vrt += '</corpus>'
+def main():
+    global stand_off_data
+    global text
 
-with open(args.output, 'w') as vrt_file:
-    vrt_file.write(vrt)
+    # Parse the given arguments
+    parser = ArgumentParser(description='Create a vrt from JSON and txt')
+    parser.add_argument('text', metavar='Path to txt file')
+    parser.add_argument('stand_off_data', metavar='Path to JSON file')
+    parser.add_argument('output', metavar='Path to vrt output file')
+    args = parser.parse_args()
+
+    with open(args.stand_off_data) as stand_of_data_file:
+        stand_off_data = json.load(stand_of_data_file)
+
+    with open(args.text, "rb") as text_file:
+        text_md5 = hashlib.md5()
+        for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa
+            text_md5.update(chunk)
+    if text_md5.hexdigest() != stand_off_data['meta']['file']['md5']:
+        raise Exception('md5 not equal')
+
+    with open(args.text, encoding=stand_off_data['meta']['file']['encoding']) as text_file:  # noqa
+        text = text_file.read()
+
+    vrt = ''
+    vrt += '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
+    vrt += '<corpus>\n'
+    vrt += '<text>\n'
+    vrt += meta_to_string()
+    vrt += tags_to_string()
+    vrt += annotations_to_string()
+    vrt += '</text>\n'
+    vrt += '</corpus>'
+
+    with open(args.output, 'w') as vrt_file:
+        vrt_file.write(vrt)
+
+
+if __name__ == '__main__':
+    main()
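
Why the added keyword arguments matter: by default textwrap.wrap() does not just split a string, it normalizes it. expand_tabs and replace_whitespace turn tabs and newlines into spaces, drop_whitespace discards whitespace at line boundaries, and break_on_hyphens allows splits inside hyphenated words. The old call therefore produced chunks that no longer concatenated back to the original text, so every character offset derived from chunk_offset was shifted. A minimal sketch of the difference (illustrative only, not part of the commit; the sample string is made up):

import textwrap

sample = 'one\ttwo  three\nfour five-six ' * 3

# Old call: the default normalization mutates the text, so the chunks
# do not reassemble into the input and all offsets drift.
old_chunks = textwrap.wrap(sample, 10, break_long_words=False)
print(''.join(old_chunks) == sample)  # False

# Fixed call: with every normalization step disabled, the chunks are an
# exact partition of the input, so per-chunk offsets stay valid.
new_chunks = textwrap.wrap(
    sample, 10,
    break_long_words=False,
    break_on_hyphens=False,
    drop_whitespace=False,
    expand_tabs=False,
    replace_whitespace=False
)
print(''.join(new_chunks) == sample)  # True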
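
The md5 check added to vrt-creator mirrors the hashing loop in spacy-nlp: both stream the file in fixed-size blocks, so even very large inputs never have to be held in memory just to be hashed, and a JSON file that does not belong to the given txt file is rejected instead of silently producing misaligned VRT. The shared pattern, factored into a helper purely for illustration (the helper name is mine, not the patch's):

import hashlib

def file_md5(path):
    # Read in multiples of the digest block size; 128 * 64 = 8 KiB per read.
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(128 * md5.block_size), b''):
            md5.update(chunk)
    return md5.hexdigest()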
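
The chunk_offset bookkeeping in spacy-nlp is what the lossless chunking protects: a token's position is only known relative to its chunk, and adding the combined length of all earlier chunks turns it into an offset into the original file. Schematically (simplified; tokenize below is a hypothetical stand-in for the spaCy pass, and the real annotation dicts carry more fields):

import re

def tokenize(chunk):
    # Hypothetical stand-in for the spaCy pass: yields (start, end, text).
    for match in re.finditer(r'\S+', chunk):
        yield match.start(), match.end(), match.group()

text_chunks = ['Lorem ipsum ', 'dolor sit amet']
annotations = []
chunk_offset = 0
for text_chunk in text_chunks:
    for start, end, token in tokenize(text_chunk):
        annotations.append({
            'start': chunk_offset + start,  # global offset into the full text
            'end': chunk_offset + end,
            'token': token
        })
    chunk_offset += len(text_chunk)  # correct only because no characters were dropped

assert annotations[2]['start'] == 12  # 'dolor' starts right after the first chunk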