mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
				synced 2025-11-04 10:52:43 +00:00 
			
		
		
		
	Update file handling so that the MD5 checksum of the input file is computed correctly
This commit is contained in:
		
							
								
								
									
										4
									
								
								nlp
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								nlp
									
									
									
									
									
								
							@@ -148,9 +148,11 @@ def parse_args():
 | 
			
		||||
                        required=True)
 | 
			
		||||
    parser.add_argument('-l', '--language',
 | 
			
		||||
                        choices=SPACY_MODELS.keys(),
 | 
			
		||||
                        help='Language of the input (2-character ISO 639-1 language codes)',  # noqa
 | 
			
		||||
                        required=True)
 | 
			
		||||
    parser.add_argument('--check-encoding',
 | 
			
		||||
                        action='store_true')
 | 
			
		||||
                        action='store_true',
 | 
			
		||||
                        help='Check encoding of the input file, UTF-8 is used instead')  # noqa
 | 
			
		||||
    parser.add_argument('--log-dir',
 | 
			
		||||
                        help='Logging directory')
 | 
			
		||||
    parser.add_argument('--mem-mb',
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										17
									
								
								spacy-nlp
									
									
									
									
									
								
							
							
						
						
									
										17
									
								
								spacy-nlp
									
									
									
									
									
								
							@@ -16,29 +16,32 @@ spacy_models = {spacy.info(pipeline)['lang']: pipeline
 | 
			
		||||
 | 
			
		||||
# Parse the given arguments
 | 
			
		||||
parser = ArgumentParser(description='Create annotations for a given txt file')
 | 
			
		||||
parser.add_argument('input', metavar='Path to txt input file')
 | 
			
		||||
parser.add_argument('output', metavar='Path to JSON output file')
 | 
			
		||||
parser.add_argument('input', help='Path to txt input file')
 | 
			
		||||
parser.add_argument('output', help='Path to JSON output file')
 | 
			
		||||
parser.add_argument('-l', '--language',
 | 
			
		||||
                    choices=spacy_models.keys(),
 | 
			
		||||
                    help='Language of the input (2-character ISO 639-1 language codes)',  # noqa
 | 
			
		||||
                    required=True)
 | 
			
		||||
parser.add_argument('-c', '--check-encoding', action='store_true')
 | 
			
		||||
parser.add_argument('-c', '--check-encoding',
 | 
			
		||||
                    action='store_true',
 | 
			
		||||
                    help='Check encoding of the input file, UTF-8 is used instead')  # noqa
 | 
			
		||||
args = parser.parse_args()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# If requested: Check the encoding of the text contents from the input file
 | 
			
		||||
# Else: Use utf-8
 | 
			
		||||
if args.check_encoding:
 | 
			
		||||
    with open(args.input, "rb") as text_file:
 | 
			
		||||
        if args.check_encoding:
 | 
			
		||||
            encoding = chardet.detect(text_file.read())['encoding']
 | 
			
		||||
        else:
 | 
			
		||||
            encoding = 'utf-8'
 | 
			
		||||
 | 
			
		||||
with open(args.input, "rb") as text_file:
 | 
			
		||||
    text_md5 = hashlib.md5()
 | 
			
		||||
    for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
 | 
			
		||||
        text_md5.update(chunk)
 | 
			
		||||
 | 
			
		||||
# Load the text contents from the input file
 | 
			
		||||
with open(args.input, encoding=encoding) as text_file:
 | 
			
		||||
    # spaCy NLP is limited to strings with maximum 1 million characters at
 | 
			
		||||
    # spaCy NLP is limited to strings with a maximum of 1 million characters at
 | 
			
		||||
    # once. So we split it into suitable chunks.
 | 
			
		||||
    text_chunks = textwrap.wrap(
 | 
			
		||||
        text_file.read(),
 | 
			
		||||
 
 | 
			
		||||
@@ -94,9 +94,9 @@ def main():
 | 
			
		||||
 | 
			
		||||
    # Parse the given arguments
 | 
			
		||||
    parser = ArgumentParser(description='Create a vrt from JSON and txt')
 | 
			
		||||
    parser.add_argument('text', metavar='Path to txt file')
 | 
			
		||||
    parser.add_argument('stand_off_data', metavar='Path to JSON file')
 | 
			
		||||
    parser.add_argument('output', metavar='Path to vrt output file')
 | 
			
		||||
    parser.add_argument('text', help='Path to txt file')
 | 
			
		||||
    parser.add_argument('stand_off_data', help='Path to JSON file')
 | 
			
		||||
    parser.add_argument('output', help='Path to vrt output file')
 | 
			
		||||
    args = parser.parse_args()
 | 
			
		||||
 | 
			
		||||
    with open(args.stand_off_data) as stand_of_data_file:
 | 
			
		||||
 
 | 
			
		||||
@@ -6,7 +6,7 @@ import os
 | 
			
		||||
import subprocess
 | 
			
		||||
import sys
 | 
			
		||||
 | 
			
		||||
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0'
 | 
			
		||||
CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0b'
 | 
			
		||||
CONTAINER_INPUT_DIR = '/input'
 | 
			
		||||
CONTAINER_OUTPUT_DIR = '/output'
 | 
			
		||||
CONTAINER_LOG_DIR = '/logs'
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user