diff --git a/nlp b/nlp
index 2fd79f5..031126f 100755
--- a/nlp
+++ b/nlp
@@ -148,9 +148,11 @@ def parse_args():
                         required=True)
     parser.add_argument('-l', '--language',
                         choices=SPACY_MODELS.keys(),
+                        help='Language of the input (2-character ISO 639-1 language code)',  # noqa
                         required=True)
     parser.add_argument('--check-encoding',
-                        action='store_true')
+                        action='store_true',
+                        help='Detect the encoding of the input file; otherwise assume UTF-8')  # noqa
     parser.add_argument('--log-dir',
                         help='Logging directory')
     parser.add_argument('--mem-mb',
diff --git a/spacy-nlp b/spacy-nlp
index af114e6..7dde3ac 100755
--- a/spacy-nlp
+++ b/spacy-nlp
@@ -16,28 +16,33 @@ spacy_models = {spacy.info(pipeline)['lang']: pipeline
 
 # Parse the given arguments
 parser = ArgumentParser(description='Create annotations for a given txt file')
-parser.add_argument('input', metavar='Path to txt input file')
-parser.add_argument('output', metavar='Path to JSON output file')
+parser.add_argument('input', help='Path to txt input file')
+parser.add_argument('output', help='Path to JSON output file')
 parser.add_argument('-l', '--language',
                     choices=spacy_models.keys(),
+                    help='Language of the input (2-character ISO 639-1 language code)',  # noqa
                     required=True)
-parser.add_argument('-c', '--check-encoding', action='store_true')
+parser.add_argument('-c', '--check-encoding',
+                    action='store_true',
+                    help='Detect the encoding of the input file; otherwise assume UTF-8')  # noqa
 args = parser.parse_args()
 
-# If requested: Check the encoding of the text contents from the input file
-# Else: Use utf-8
+# If requested: Detect the encoding of the text contents from the input file
+# Else: Use UTF-8
+if args.check_encoding:
+    with open(args.input, "rb") as text_file:
+        encoding = chardet.detect(text_file.read())['encoding']
+else:
+    encoding = 'utf-8'
+
 with open(args.input, "rb") as text_file:
-    if args.check_encoding:
-        encoding = chardet.detect(text_file.read())['encoding']
-    else:
-        encoding = 'utf-8'
     text_md5 = hashlib.md5()
     for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
         text_md5.update(chunk)
 
 # Load the text contents from the input file
 with open(args.input, encoding=encoding) as text_file:
-    # spaCy NLP is limited to strings with maximum 1 million characters at
+    # spaCy NLP is limited to strings with a maximum of 1 million characters at
     # once. So we split it into suitable chunks.
     text_chunks = textwrap.wrap(
         text_file.read(),
diff --git a/vrt-creator b/vrt-creator
index 88ab455..e998903 100755
--- a/vrt-creator
+++ b/vrt-creator
@@ -94,9 +94,9 @@ def main():
 
     # Parse the given arguments
     parser = ArgumentParser(description='Create a vrt from JSON and txt')
-    parser.add_argument('text', metavar='Path to txt file')
-    parser.add_argument('stand_off_data', metavar='Path to JSON file')
-    parser.add_argument('output', metavar='Path to vrt output file')
+    parser.add_argument('text', help='Path to txt file')
+    parser.add_argument('stand_off_data', help='Path to JSON file')
+    parser.add_argument('output', help='Path to vrt output file')
     args = parser.parse_args()
 
     with open(args.stand_off_data) as stand_of_data_file:
diff --git a/wrapper/nlp b/wrapper/nlp
index e41f17b..7e0a3a4 100755
--- a/wrapper/nlp
+++ b/wrapper/nlp
@@ -6,7 +6,7 @@ import os
 import subprocess
 import sys
 
-CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0'
+CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0b'
 CONTAINER_INPUT_DIR = '/input'
 CONTAINER_OUTPUT_DIR = '/output'
 CONTAINER_LOG_DIR = '/logs'
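
Note: the encoding handling introduced in spacy-nlp reduces to the pattern below. This is a minimal standalone sketch, not part of the patch; detect_encoding is a hypothetical helper name, and it assumes the chardet package is installed.

    import chardet

    def detect_encoding(path, check_encoding=False):
        # Detect the file's encoding with chardet only when requested;
        # otherwise fall back to UTF-8, as the patched script does.
        if not check_encoding:
            return 'utf-8'
        with open(path, 'rb') as f:
            return chardet.detect(f.read())['encoding']

    # Usage: read a file whose encoding may be unknown.
    encoding = detect_encoding('input.txt', check_encoding=True)
    with open('input.txt', encoding=encoding) as f:
        text = f.read()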