mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
synced 2025-10-31 17:32:45 +00:00

Commit: Update

Dockerfile (16 changed lines)
@@ -1,7 +1,8 @@
 FROM debian:stretch-slim
 
-MAINTAINER Patrick Jentsch <p.jentsch@uni-bielefeld.de>
+LABEL maintainer="inf_sfb1288@lists.uni-bielefeld.de"
 
+ENV DEBIAN_FRONTEND=noninteractive
 ENV LANG=C.UTF-8
 
 RUN apt-get update && \
@@ -9,22 +10,20 @@ RUN apt-get update && \
     build-essential \
     ca-certificates \
     python2.7 \
-    python3 \
+    python3.5 \
     python3-dev \
     python3-pip \
     python3-setuptools \
     wget
 
-WORKDIR /root
-
 # Install pyFlow
 ENV PYFLOW_VERSION 1.1.20
 RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
     tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
-    rm pyflow-"$PYFLOW_VERSION".tar.gz && \
     cd pyflow-"$PYFLOW_VERSION" && \
     python2.7 setup.py build install && \
-    cd ..
+    cd .. && \
+    rm -r pyflow-"$PYFLOW_VERSION".tar.gz pyflow-"$PYFLOW_VERSION"
 
 # Install spaCy
 RUN pip3 install wheel && pip3 install -U spacy && \
@@ -34,9 +33,8 @@ RUN pip3 install wheel && pip3 install -U spacy && \
     python3 -m spacy download fr && \
     python3 -m spacy download pt
 
-RUN mkdir files_for_nlp files_from_nlp
-
 COPY nlp /usr/local/bin
 COPY spacy_nlp /usr/local/bin
 
-CMD ["/bin/bash"]
+ENTRYPOINT ["nlp"]
+CMD ["--help"]
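With ENTRYPOINT ["nlp"] and CMD ["--help"], the image no longer drops into a shell by default: a bare "docker run" prints the pipeline's usage text, and any arguments after the image name are passed straight to the nlp wrapper. A minimal sketch of how the rebuilt image might be used; the image tag and the mount paths are placeholders, not part of the commit:

    # Build the image (the tag is an arbitrary example).
    docker build -t sfb1288inf/nlp .

    # No arguments: the ENTRYPOINT runs "nlp" with the default CMD "--help".
    docker run --rm sfb1288inf/nlp

    # Mount hypothetical input/output directories and run the pipeline on them.
    docker run --rm \
        -v "$PWD/corpus_txt:/input" -v "$PWD/corpus_vrt:/output" \
        sfb1288inf/nlp -i /input -o /output -l de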
							
								
								
									
nlp (154 changed lines)
@@ -18,84 +18,105 @@ from pyflow import WorkflowRunner
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        "Performs NLP of documents utilizing spaCy. \
-        Output is .vrt."
+        description='Performs NLP of documents utilizing spaCy. The results are served as verticalized text files.'
     )
 
-    parser.add_argument("-i",
-                        dest="inputDir",
-                        help="Input directory.",
-                        required=True)
-    parser.add_argument("-l",
-                        dest='lang',
-                        help="Language for NLP",
-                        required=True)
-    parser.add_argument("-o",
-                        dest="outputDir",
-                        help="Output directory.",
-                        required=True)
-    parser.add_argument("--nCores",
-                        default=min(4, multiprocessing.cpu_count()),
-                        dest="nCores",
-                        help="Total number of cores available.",
-                        required=False,
-                        type=int)
+    parser.add_argument(
+        '-i',
+        dest='input_dir',
+        required=True
+    )
+    parser.add_argument(
+        '-l',
+        choices=['de', 'en', 'es', 'fr', 'pt'],
+        dest='lang',
+        required=True
+    )
+    parser.add_argument(
+        '-o',
+        dest='output_dir',
+        required=True
+    )
+    parser.add_argument(
+        '--nCores',
+        default=min(4, multiprocessing.cpu_count()),
+        dest='n_cores',
+        help='total number of cores available',
+        required=False,
+        type=int
+    )
     return parser.parse_args()
 
 
 class NLPWorkflow(WorkflowRunner):
-    def __init__(self, jobs, lang, nCores):
-        self.jobs = jobs
-        self.lang = lang
-        self.nCores = nCores
-
+    def __init__(self, args):
+        self.jobs = analyze_jobs(args.input_dir, args.output_dir)
+        self.lang = args.lang
+        self.n_cores = args.n_cores
 
     def workflow(self):
-        ###
-        # Task "mkdir_job": create output directories
-        # Dependencies: None
-        ###
-        mkdir_jobs = []
-        mkdir_job_number = 0
-        for job in self.jobs:
-            mkdir_job_number += 1
-            cmd = 'mkdir -p "%s"' % (
-                job["output_dir"]
-            )
-            mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))
+        if len(self.jobs) == 0:
+            return
 
-        ###
-        # Task "spacy_nlp_job": perform NLP
-        # Dependencies: mkdir_jobs
-        ###
-        self.waitForTasks()
+        '''
+        ' ##################################################
+        ' # Create output directories                      #
+        ' ##################################################
+        '''
+        create_output_directories_jobs = []
+        for index, job in enumerate(self.jobs):
+            cmd = 'mkdir -p "%s"' % (job['output_dir'])
+            create_output_directories_jobs.append(
+                self.addTask(
+                    command=cmd,
+                    label='create_output_directories_job_-_%i' % (index)
+                )
+            )
+
+        '''
+        ' ##################################################
+        ' # Natural language processing                    #
+        ' ##################################################
+        '''
         nlp_jobs = []
-        nlp_job_number = 0
-        for job in self.jobs:
-            nlp_job_number += 1
-            cmd = 'spacy_nlp -i "%s" -o "%s" -l "%s"' % (
-                job["path"],
-                os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".vrt"),
-                self.lang
+        nlp_job_n_cores = min(
+            self.n_cores,
+            max(1, int(self.n_cores / len(self.jobs)))
+        )
+        for index, job in enumerate(self.jobs):
+            cmd = 'spacy_nlp -l "%s" "%s" "%s"' % (
+                self.lang,
+                job['path'],
+                os.path.join(job['output_dir'], job['name'] + '.vrt')
             )
-            nlp_jobs.append(self.addTask(label="nlp_job_-_%i" % (nlp_job_number), command=cmd, dependencies=mkdir_jobs, nCores=min(4, self.nCores)))
+            nlp_jobs.append(
+                self.addTask(
+                    command=cmd,
+                    dependencies='create_output_directories_job_-_%i' % (index),
+                    label='nlp_job_-_%i' % (index),
+                    nCores=nlp_job_n_cores
+                )
+            )
 
 
-def analyze_jobs(inputDir, outputDir, level=1):
+def analyze_jobs(input_dir, output_dir):
     jobs = []
 
-    if level > 2:
-        return jobs
-
-    for file in os.listdir(inputDir):
-        if os.path.isdir(os.path.join(inputDir, file)):
+    for file in os.listdir(input_dir):
+        if os.path.isdir(os.path.join(input_dir, file)):
             jobs += analyze_jobs(
-                os.path.join(inputDir, file),
-                os.path.join(outputDir, file),
-                level + 1
+                os.path.join(input_dir, file),
+                os.path.join(output_dir, file),
             )
-        elif file.endswith(".txt"):
-            jobs.append({"path": os.path.join(inputDir, file), "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0])})
+        elif file.endswith('.txt'):
+            jobs.append(
+                {
+                    'filename': file,
+                    'name': file.rsplit('.', 1)[0],
+                    'output_dir': os.path.join(output_dir, file),
+                    'path': os.path.join(input_dir, file)
+                }
+            )
 
     return jobs
 
@@ -103,15 +124,12 @@ def analyze_jobs(inputDir, outputDir, level=1):
 def main():
     args = parse_arguments()
 
-    wflow = NLPWorkflow(
-        analyze_jobs(args.inputDir, args.outputDir),
-        args.lang,
-        args.nCores
-    )
+    wflow = NLPWorkflow(args)
 
-    retval = wflow.run(nCores=args.nCores)
+    retval = wflow.run(dataDirRoot=args.output_dir, nCores=args.n_cores)
+
     sys.exit(retval)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     main()
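In the rewritten script the workflow now builds its own job list from the arguments, the old two-level directory cap (level > 2) is gone so the input tree is walked to any depth, each spacy_nlp task gets roughly n_cores / len(jobs) cores (at least 1), and pyflow's bookkeeping is kept under the output directory via dataDirRoot. A hedged usage sketch with a hypothetical corpus layout:

    # corpus_txt/a.txt and corpus_txt/sub/b.txt are hypothetical inputs.
    nlp -i corpus_txt -o corpus_vrt -l de --nCores 4

    # Per the new analyze_jobs(), each job's output_dir is the output
    # directory joined with the full file name, so the results land at
    #   corpus_vrt/a.txt/a.vrt
    #   corpus_vrt/sub/b.txt/b.vrt
    # with pyflow's state directory also created under corpus_vrt
    # (the dataDirRoot passed to wflow.run).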
							
								
								
									
spacy_nlp (62 changed lines)
@@ -1,48 +1,53 @@
 #!/usr/bin/env python3
 # coding=utf-8
 
 
 import argparse
 import os
 import spacy
 import textwrap
 
 
-parser = argparse.ArgumentParser(description="Tag a .txt file with spaCy and \
-                                              save it in .vrt format")
-parser.add_argument("-i",
-                    dest="input",
-                    help="Input file.",
-                    required=True)
-parser.add_argument("-l",
-                    choices=["de", "en", "es", "fr", "pt"],
-                    dest="lang",
-                    help="Language for tagging",
-                    required=True)
-parser.add_argument("-o",
-                    dest="output",
-                    help="Output file.",
-                    required=True)
+parser = argparse.ArgumentParser(
+    description='Tag a text file with spaCy and save it as a verticalized text file.'
+)
+parser.add_argument(
+    'i',
+    metavar='txt-sourcefile',
+)
+parser.add_argument(
+    '-l',
+    choices=['de', 'en', 'es', 'fr', 'pt'],
+    dest='lang',
+    required=True
+)
+parser.add_argument(
+    'o',
+    metavar='vrt-destfile',
+)
 args = parser.parse_args()
 
 
-SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm",
-                "es": "es_core_news_sm", "fr": "fr_core_news_sm",
-                "pt": "pt_core_news_sm"}
+SPACY_MODELS = {
+    'de': 'de_core_news_sm', 'en': 'en_core_web_sm', 'es': 'es_core_news_sm',
+    'fr': 'fr_core_news_sm', 'pt': 'pt_core_news_sm'
+}
 
 # Set the language model for spacy
 nlp = spacy.load(SPACY_MODELS[args.lang])
 
-# Read text from the input file and if neccessary split it into parts with a
-# length of less than 1 million characters.
-with open(args.input) as input_file:
+with open(args.i) as input_file:
     text = input_file.read()
     texts = textwrap.wrap(text, 1000000, break_long_words=False)
     text = None
 
-# Create and open the output file
-output_file = open(args.output, "w+")
-output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="' + os.path.basename(args.input).rsplit(".", 1)[0] + '">\n')
+output_file = open(args.o, 'w+')
+
+output_file.write(
+    '<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="%s">\n' % (
+        os.path.basename(args.i).rsplit(".", 1)[0]
+    )
+)
 for text in texts:
     # Run spacy nlp over the text (partial string if above 1 million chars)
     doc = nlp(text)
@@ -54,9 +59,12 @@ for text in texts:
                 continue
             # Write all information in .vrt style to the output file
             # text, lemma, simple_pos, pos, ner
-            output_file.write(token.text + "\t" + token.lemma_ + "\t"
-                              + token.pos_ + "\t" + token.tag_ + "\t"
-                              + (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
+            output_file.write(
+                token.text + '\t' + token.lemma_ + '\t'
+                + token.pos_ + '\t' + token.tag_ + '\t'
+                + (token.ent_type_ if token.ent_type_ != '' else 'NULL') + '\n'
+            )
         output_file.write('</s>\n')
 output_file.write('</text>\n</corpus>')
+
 output_file.close()
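The two positionals replace the old -i/-o flags, while -l stays required and restricted to the five supported languages; this matches the cmd string the workflow script now builds. A sketch with a hypothetical input file, plus the rough shape of the resulting .vrt output; the sample tokens and their annotations are illustrative only, and the opening <s> tag is written in a part of the loop this hunk does not show:

    spacy_nlp -l en chapter1.txt chapter1.vrt

    # chapter1.vrt then starts roughly like this; each token line carries
    # text, lemma, simple_pos, pos and ner, separated by tabs:
    #
    #   <?xml version="1.0" encoding="UTF-8"?>
    #   <corpus>
    #   <text id="chapter1">
    #   <s>
    #   Call    call    VERB    VB    NULL
    #   me      -PRON-  PRON    PRP   NULL
    #   Ishmael ishmael PROPN   NNP   PERSON
    #   </s>
    #   </text>
    #   </corpus>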