mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
				synced 2025-10-31 13:02:44 +00:00 
			
		
		
		
	Update file handling. The MD5 checksum is now computed correctly.
This commit is contained in:
		
							
								
								
									
										4
									
								
								nlp
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								nlp
									
									
									
									
									
								
							| @@ -148,9 +148,11 @@ def parse_args(): | ||||
|                         required=True) | ||||
|     parser.add_argument('-l', '--language', | ||||
|                         choices=SPACY_MODELS.keys(), | ||||
|                         help='Language of the input (2-character ISO 639-1 language codes)',  # noqa | ||||
|                         required=True) | ||||
|     parser.add_argument('--check-encoding', | ||||
|                         action='store_true') | ||||
|                         action='store_true', | ||||
|                         help='Check encoding of the input file, UTF-8 is used instead')  # noqa | ||||
|     parser.add_argument('--log-dir', | ||||
|                         help='Logging directory') | ||||
|     parser.add_argument('--mem-mb', | ||||
|   | ||||
							
								
								
									
										23
									
								
								spacy-nlp
									
									
									
									
									
								
							
							
						
						
									
										23
									
								
								spacy-nlp
									
									
									
									
									
								
							| @@ -16,29 +16,32 @@ spacy_models = {spacy.info(pipeline)['lang']: pipeline | ||||
|  | ||||
| # Parse the given arguments | ||||
| parser = ArgumentParser(description='Create annotations for a given txt file') | ||||
| parser.add_argument('input', metavar='Path to txt input file') | ||||
| parser.add_argument('output', metavar='Path to JSON output file') | ||||
| parser.add_argument('input', help='Path to txt input file') | ||||
| parser.add_argument('output', help='Path to JSON output file') | ||||
| parser.add_argument('-l', '--language', | ||||
|                     choices=spacy_models.keys(), | ||||
|                     help='Language of the input (2-character ISO 639-1 language codes)',  # noqa | ||||
|                     required=True) | ||||
| parser.add_argument('-c', '--check-encoding', action='store_true') | ||||
| parser.add_argument('-c', '--check-encoding', | ||||
|                     action='store_true', | ||||
|                     help='Check encoding of the input file, UTF-8 is used instead')  # noqa | ||||
| args = parser.parse_args() | ||||
|  | ||||
| if args.check_encoding: | ||||
|     with open(args.input, "rb") as text_file: | ||||
|         if args.check_encoding: | ||||
|             encoding = chardet.detect(text_file.read())['encoding'] | ||||
|         else: | ||||
|             encoding = 'utf-8' | ||||
|  | ||||
| # If requested: Check the encoding of the text contents from the input file | ||||
| # Else: Use utf-8 | ||||
| with open(args.input, "rb") as text_file: | ||||
|     if args.check_encoding: | ||||
|         encoding = chardet.detect(text_file.read())['encoding'] | ||||
|     else: | ||||
|         encoding = 'utf-8' | ||||
|     text_md5 = hashlib.md5() | ||||
|     for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): | ||||
|         text_md5.update(chunk) | ||||
|  | ||||
| # Load the text contents from the input file | ||||
| with open(args.input, encoding=encoding) as text_file: | ||||
|     # spaCy NLP is limited to strings with maximum 1 million characters at | ||||
|     # spaCy NLP is limited to strings with a maximum of 1 million characters at | ||||
|     # once. So we split it into suitable chunks. | ||||
|     text_chunks = textwrap.wrap( | ||||
|         text_file.read(), | ||||
|   | ||||
| @@ -94,9 +94,9 @@ def main(): | ||||
|  | ||||
|     # Parse the given arguments | ||||
|     parser = ArgumentParser(description='Create a vrt from JSON and txt') | ||||
|     parser.add_argument('text', metavar='Path to txt file') | ||||
|     parser.add_argument('stand_off_data', metavar='Path to JSON file') | ||||
|     parser.add_argument('output', metavar='Path to vrt output file') | ||||
|     parser.add_argument('text', help='Path to txt file') | ||||
|     parser.add_argument('stand_off_data', help='Path to JSON file') | ||||
|     parser.add_argument('output', help='Path to vrt output file') | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     with open(args.stand_off_data) as stand_of_data_file: | ||||
|   | ||||
| @@ -6,7 +6,7 @@ import os | ||||
| import subprocess | ||||
| import sys | ||||
|  | ||||
| CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0' | ||||
| CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0b' | ||||
| CONTAINER_INPUT_DIR = '/input' | ||||
| CONTAINER_OUTPUT_DIR = '/output' | ||||
| CONTAINER_LOG_DIR = '/logs' | ||||
|   | ||||
		Reference in New Issue
	
	Block a user