mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
				synced 2025-10-31 12:52:47 +00:00 
			
		
		
		
	Add function to check the encoding of input text files.
This commit is contained in:
		
							
								
								
									
										15
									
								
								nlp
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								nlp
									
									
									
									
									
								
							| @@ -41,6 +41,15 @@ def parse_arguments(): | ||||
|                         dest='zip', | ||||
|                         help='package result files in zip bundles', | ||||
|                         required=False) | ||||
|     parser.add_argument('--check-encoding', | ||||
|                         action='store_true', | ||||
|                         default=False, | ||||
|                         dest="check_encoding", | ||||
|                         help='''if used the nlp process will know hat the encoding of  | ||||
|                         the input files is unkown and thus != utf-8. The process will | ||||
|                         try to determine the encoding of the input files and use this. | ||||
|                         encoding.''' | ||||
|                         ) | ||||
|     return parser.parse_args() | ||||
|  | ||||
|  | ||||
| @@ -51,6 +60,7 @@ class NLPWorkflow(WorkflowRunner): | ||||
|         self.n_cores = args.n_cores | ||||
|         self.output_dir = args.output_dir | ||||
|         self.zip = args.zip | ||||
|         self.check_encoding | ||||
|  | ||||
|     def workflow(self): | ||||
|         if len(self.jobs) == 0: | ||||
| @@ -82,10 +92,11 @@ class NLPWorkflow(WorkflowRunner): | ||||
|             max(1, int(self.n_cores / len(self.jobs))) | ||||
|         ) | ||||
|         for index, job in enumerate(self.jobs): | ||||
|             cmd = 'spacy_nlp -l "{}" "{}" "{}"'.format( | ||||
|             cmd = 'spacy_nlp -l "{}" "{}" "{}" "{}"'.format( | ||||
|                 self.lang, | ||||
|                 job['path'], | ||||
|                 os.path.join(job['output_dir'], job['name'] + '.vrt') | ||||
|                 os.path.join(job['output_dir'], job['name'] + '.vrt', | ||||
|                 if self.check_encoding "--check-encoding" else "") | ||||
|             ) | ||||
|             nlp_jobs.append( | ||||
|                 self.addTask( | ||||
|   | ||||
							
								
								
									
										14
									
								
								spacy_nlp
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								spacy_nlp
									
									
									
									
									
								
							| @@ -17,6 +17,11 @@ parser.add_argument('-l', | ||||
|                     dest='lang', | ||||
|                     required=True) | ||||
| parser.add_argument('o', metavar='vrt-destfile') | ||||
| parser.add_argument('--check-encoding', | ||||
|                     default=False, | ||||
|                     action='store_true', | ||||
|                     dest='check_encoding' | ||||
|                     ) | ||||
| args = parser.parse_args() | ||||
|  | ||||
| SPACY_MODELS = {'de': 'de_core_news_sm', | ||||
| @@ -31,9 +36,16 @@ SPACY_MODELS = {'de': 'de_core_news_sm', | ||||
| # Set the language model for spacy | ||||
| nlp = spacy.load(SPACY_MODELS[args.lang]) | ||||
|  | ||||
| # Try to determine the encoding of the text in the input file | ||||
| if args.check_encoding: | ||||
|     with open(args.i, "rb") as input_file: | ||||
|         bytes = input_file.read() | ||||
|         encoding = chardet.detect(bytes)['encoding'] | ||||
| else: | ||||
|     encoding='utf-8' | ||||
| # Read text from the input file and if necessary split it into parts with a | ||||
| # length of less than 1 million characters. | ||||
| with open(args.i) as input_file: | ||||
| with open(args.i, encoding=encoding) as input_file: | ||||
|     text = input_file.read() | ||||
|     texts = textwrap.wrap(text, 1000000, break_long_words=False) | ||||
|     text = None | ||||
|   | ||||
		Reference in New Issue
	
	Block a user