Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git

Commit: Codestyle

nlp | 60 changed lines
--- a/nlp
+++ b/nlp
@@ -18,33 +18,23 @@ from pyflow import WorkflowRunner
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        description='Performs NLP of documents utilizing spaCy. The results are served as verticalized text files.'
-    )
-
-    parser.add_argument(
-        '-i',
-        dest='input_dir',
-        required=True
+        description=('Performs NLP of documents utilizing spaCy. The results '
+                     'are served as verticalized text files.')
     )
+    parser.add_argument('-i', dest='input_dir', required=True)
     parser.add_argument(
         '-l',
         choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'],
         dest='lang',
         required=True
     )
-    parser.add_argument(
-        '-o',
-        dest='output_dir',
-        required=True
-    )
-    parser.add_argument(
-        '--nCores',
-        default=min(4, multiprocessing.cpu_count()),
-        dest='n_cores',
-        help='total number of cores available',
-        required=False,
-        type=int
-    )
+    parser.add_argument('-o', dest='output_dir', required=True)
+    parser.add_argument('--nCores',
+                        default=min(4, multiprocessing.cpu_count()),
+                        dest='n_cores',
+                        help='total number of cores available',
+                        required=False,
+                        type=int)
     return parser.parse_args()
 
 
@@ -65,11 +55,11 @@ class NLPWorkflow(WorkflowRunner):
         '''
         create_output_directories_jobs = []
         for index, job in enumerate(self.jobs):
-            cmd = 'mkdir -p "%s"' % (job['output_dir'])
+            cmd = 'mkdir -p "{}"'.format(job['output_dir'])
             create_output_directories_jobs.append(
                 self.addTask(
                     command=cmd,
-                    label='create_output_directories_job_-_%i' % (index)
+                    label='create_output_directories_job_-_{}'.format(index)
                 )
             )
 
@@ -84,7 +74,7 @@ class NLPWorkflow(WorkflowRunner):
             max(1, int(self.n_cores / len(self.jobs)))
         )
         for index, job in enumerate(self.jobs):
-            cmd = 'spacy_nlp -l "%s" "%s" "%s"' % (
+            cmd = 'spacy_nlp -l "{}" "{}" "{}"'.format(
                 self.lang,
                 job['path'],
                 os.path.join(job['output_dir'], job['name'] + '.vrt')
@@ -92,8 +82,10 @@ class NLPWorkflow(WorkflowRunner):
             nlp_jobs.append(
                 self.addTask(
                     command=cmd,
-                    dependencies='create_output_directories_job_-_%i' % (index),
-                    label='nlp_job_-_%i' % (index),
+                    dependencies='create_output_directories_job_-_{}'.format(
+                        index
+                    ),
+                    label='nlp_job_-_{}'.format(index),
                     nCores=nlp_job_n_cores
                 )
             )
@@ -104,19 +96,13 @@ def analyze_jobs(input_dir, output_dir):
 
     for file in os.listdir(input_dir):
         if os.path.isdir(os.path.join(input_dir, file)):
-            jobs += analyze_jobs(
-                os.path.join(input_dir, file),
-                os.path.join(output_dir, file),
-            )
+            jobs += analyze_jobs(os.path.join(input_dir, file),
+                                 os.path.join(output_dir, file))
         elif file.endswith('.txt'):
-            jobs.append(
-                {
-                    'filename': file,
-                    'name': file.rsplit('.', 1)[0],
-                    'output_dir': os.path.join(output_dir, file),
-                    'path': os.path.join(input_dir, file)
-                }
-            )
+            jobs.append({'filename': file,
+                         'name': file.rsplit('.', 1)[0],
+                         'output_dir': os.path.join(output_dir, file),
+                         'path': os.path.join(input_dir, file)})
 
     return jobs
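The nlp changes are pure style: %-interpolation becomes str.format and the argparse calls are compacted, with no change in behavior. As a quick sanity check, here is a minimal, self-contained sketch showing that the old and new formatting produce identical strings; output_dir and index are hypothetical stand-ins for job['output_dir'] and the enumerate() index used in NLPWorkflow:

# Equivalence check for the %-to-str.format conversions in this commit.
# The values below are made up for illustration only.
output_dir = 'corpus_out/doc1.txt'
index = 0

assert 'mkdir -p "%s"' % (output_dir) == 'mkdir -p "{}"'.format(output_dir)
assert ('create_output_directories_job_-_%i' % (index)
        == 'create_output_directories_job_-_{}'.format(index))
assert ('spacy_nlp -l "%s" "%s" "%s"' % ('de', 'in.txt', 'out.vrt')
        == 'spacy_nlp -l "{}" "{}" "{}"'.format('de', 'in.txt', 'out.vrt'))

Judging from the argparse definitions above, the workflow would be invoked along the lines of: nlp -i <input_dir> -l de -o <output_dir> --nCores 4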
spacy_nlp | 57 changed lines
--- a/spacy_nlp
+++ b/spacy_nlp
@@ -7,29 +7,25 @@ import spacy
 import textwrap
 
 parser = argparse.ArgumentParser(
-    description='Tag a text file with spaCy and save it as a verticalized text file.'
-)
-parser.add_argument(
-    'i',
-    metavar='txt-sourcefile',
-)
-parser.add_argument(
-    '-l',
-    choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'],
-    dest='lang',
-    required=True
-)
-parser.add_argument(
-    'o',
-    metavar='vrt-destfile',
+    description=('Tag a text file with spaCy and save it as a verticalized '
+                 'text file.')
 )
+parser.add_argument('i', metavar='txt-sourcefile')
+parser.add_argument('-l',
+                    choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'],
+                    dest='lang',
+                    required=True)
+parser.add_argument('o', metavar='vrt-destfile')
 args = parser.parse_args()
 
-SPACY_MODELS = {
-    'de': 'de_core_news_sm', 'el': 'el_core_news_sm', 'en': 'en_core_web_sm',
-    'es': 'es_core_news_sm', 'fr': 'fr_core_news_sm', 'it': 'it_core_news_sm',
-    'nl': 'nl_core_news_sm', 'pt': 'pt_core_news_sm'
-}
+SPACY_MODELS = {'de': 'de_core_news_sm',
+                'el': 'el_core_news_sm',
+                'en': 'en_core_web_sm',
+                'es': 'es_core_news_sm',
+                'fr': 'fr_core_news_sm',
+                'it': 'it_core_news_sm',
+                'nl': 'nl_core_news_sm',
+                'pt': 'pt_core_news_sm'}
 
 # Set the language model for spacy
 nlp = spacy.load(SPACY_MODELS[args.lang])
@@ -45,9 +41,9 @@ with open(args.i) as input_file:
 output_file = open(args.o, 'w+')
 
 output_file.write(
-    '<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="%s">\n' % (
-        os.path.basename(args.i).rsplit(".", 1)[0]
-    )
+    '<?xml version="1.0" encoding="UTF-8"?>\n'
+    '<corpus>\n'
+    '<text id="{}">\n'.format(os.path.basename(args.i).rsplit(".", 1)[0])
 )
 for text in texts:
     # Run spacy nlp over the text (partial string if above 1 million chars)
@@ -61,11 +57,18 @@ for text in texts:
             # Write all information in .vrt style to the output file
             # text, lemma, simple_pos, pos, ner
             output_file.write(
-                token.text + '\t' + token.lemma_ + '\t'
-                + token.pos_ + '\t' + token.tag_ + '\t'
-                + (token.ent_type_ if token.ent_type_ != '' else 'NULL') + '\n'
+                '{}\t{}\t{}\t{}\t{}\n'.format(
+                    token.text,
+                    token.lemma_,
+                    token.pos_,
+                    token.tag_,
+                    token.ent_type_ if token.ent_type_ != '' else 'NULL'
+                )
             )
         output_file.write('</s>\n')
-output_file.write('</text>\n</corpus>')
+output_file.write(
+    '</text>\n'
+    '</corpus>'
+)
 
 output_file.close()
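Two details keep the rewritten spacy_nlp output byte-identical to the old version: Python concatenates adjacent string literals at compile time, so the split XML header equals the former single literal (with .format applied to the concatenated whole), and the token line is the same tab-joined sequence as before. A standalone sketch; the token values are hypothetical stand-ins for spaCy Token attributes, so no model is needed to run it:

# Adjacent string literals are joined before .format() is applied, so the
# split header equals the original one-line literal.
header = ('<?xml version="1.0" encoding="UTF-8"?>\n'
          '<corpus>\n'
          '<text id="{}">\n'.format('example'))
assert header == ('<?xml version="1.0" encoding="UTF-8"?>\n'
                  '<corpus>\n<text id="example">\n')

# The .vrt token line: text, lemma, simple_pos, pos, ner, tab-separated;
# an empty entity type is written as 'NULL'.
text, lemma, pos, tag, ent_type = 'Walls', 'wall', 'NOUN', 'NNS', ''
line = '{}\t{}\t{}\t{}\t{}\n'.format(
    text, lemma, pos, tag, ent_type if ent_type != '' else 'NULL')
assert line == 'Walls\twall\tNOUN\tNNS\tNULL\n'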