mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
				synced 2025-10-31 04:02:45 +00:00 
			
		
		
		
	fix pipeline
This commit is contained in:
		| @@ -30,7 +30,7 @@ RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \ | ||||
|  | ||||
| ENV SPACY_VERSION=2.2.4 | ||||
| ENV SPACY_MODELS_VERSION=2.2.5 | ||||
| RUN pip3 install "spacy==${SPACY_VERSION}" \ | ||||
| RUN pip3 install setuptools wheel && pip3 install "spacy==${SPACY_VERSION}" \ | ||||
|  && python3 -m spacy download "de_core_news_sm-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "el_core_news_sm-${SPACY_MODELS_VERSION}" --direct \ | ||||
|  && python3 -m spacy download "en_core_web_sm-${SPACY_MODELS_VERSION}" --direct \ | ||||
|   | ||||
							
								
								
									
										68
									
								
								nlp
									
									
									
									
									
								
							
							
						
						
									
										68
									
								
								nlp
									
									
									
									
									
								
							| @@ -28,16 +28,21 @@ SPACY_MODELS = {'de': 'de_core_news_sm', | ||||
|  | ||||
| def parse_args(): | ||||
|     parser = ArgumentParser(description='NLP Pipeline utilizing spaCy.') | ||||
|     parser.add_argument('i') | ||||
|     parser.add_argument('o') | ||||
|     parser.add_argument('-i', '--input-directory', | ||||
|                         help='Input directory (only txt files get processed)', | ||||
|                         required=True) | ||||
|     parser.add_argument('-o', '--output-directory', | ||||
|                         help='Output directory', | ||||
|                         required=True) | ||||
|     parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(), | ||||
|                         required=True) | ||||
|     parser.add_argument('--check-encoding', action='store_true') | ||||
|     parser.add_argument('--log-dir') | ||||
|     parser.add_argument('--n-cores', | ||||
|                         default=min(4, multiprocessing.cpu_count()), | ||||
|                         help='total number of cores available', type=int) | ||||
|     parser.add_argument('--check-encoding', action='store_true') | ||||
|     parser.add_argument('--log-dir') | ||||
|     parser.add_argument('--zip') | ||||
|     parser.add_argument('--zip', | ||||
|                         help='Zips everything into one archive.') | ||||
|     return parser.parse_args() | ||||
|  | ||||
|  | ||||
| @@ -63,41 +68,47 @@ class NLPPipeline(WorkflowRunner): | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # mkdir_jobs                                     # | ||||
|         ' # setup output directory                         # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         mkdir_jobs = [] | ||||
|         setup_output_directory_jobs = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             cmd = 'mkdir' | ||||
|             cmd += ' -p' | ||||
|             cmd += ' "{}"'.format(job.output_dir) | ||||
|             lbl = 'mkdir_job_-_{}'.format(i) | ||||
|             mkdir_jobs.append(self.addTask(command=cmd, label=lbl)) | ||||
|             lbl = 'setup_output_directory_-_{}'.format(i) | ||||
|             setup_output_directory_jobs.append(self.addTask(command=cmd, | ||||
|                                                             label=lbl)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # spacy_nlp_jobs                                 # | ||||
|         ' # nlp                                            # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         spacy_nlp_jobs = [] | ||||
|         nlp_jobs = [] | ||||
|         n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs)))) | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             output_file = os.path.join(job.output_dir, | ||||
|                                        '{}.vrt'.format(job.name)) | ||||
|             cmd = 'spacy-nlp "{}" "{}"'.format(job.file, output_file) | ||||
|             cmd = 'spacy-nlp' | ||||
|             cmd += ' -i "{}"'.format(job.file) | ||||
|             cmd += ' -l "{}"'.format(self.lang) | ||||
|             cmd += ' --check-encoding' if self.check_encoding else '' | ||||
|             deps = 'mkdir_job_-_{}'.format(i) | ||||
|             lbl = 'spacy_nlp_job_-_{}'.format(i) | ||||
|             spacy_nlp_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                                label=lbl, nCores=n_cores)) | ||||
|             cmd += ' -o "{}"'.format(output_file) | ||||
|             if self.check_encoding: | ||||
|                 cmd += ' --check-encoding' | ||||
|             deps = 'setup_output_directory_-_{}'.format(i) | ||||
|             lbl = 'nlp_-_{}'.format(i) | ||||
|             nlp_jobs.append(self.addTask(command=cmd, | ||||
|                                          dependencies=deps, | ||||
|                                          label=lbl, | ||||
|                                          nCores=n_cores)) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # zip_jobs                                       # | ||||
|         ' # zip creation                                   # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         zip_jobs = [] | ||||
|         zip_creation_jobs = [] | ||||
|         if self.zip is not None: | ||||
|             # Remove .zip file extension if provided | ||||
|             if self.zip.endswith('.zip'): | ||||
| @@ -112,10 +123,11 @@ class NLPPipeline(WorkflowRunner): | ||||
|             cmd += ' -i "*.vrt"' | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             deps = spacy_nlp_jobs | ||||
|             lbl = 'zip_job' | ||||
|             zip_jobs.append(self.addTask(command=cmd, dependencies=deps, | ||||
|                                          label=lbl)) | ||||
|             deps = nlp_jobs | ||||
|             lbl = 'zip_creation' | ||||
|             zip_creation_jobs.append(self.addTask(command=cmd, | ||||
|                                                   dependencies=deps, | ||||
|                                                   label=lbl)) | ||||
|  | ||||
|  | ||||
| def collect_jobs(input_dir, output_dir): | ||||
| @@ -132,11 +144,13 @@ def collect_jobs(input_dir, output_dir): | ||||
|  | ||||
| def main(): | ||||
|     args = parse_args() | ||||
|     jobs = collect_jobs(args.i, args.o) | ||||
|     jobs = collect_jobs(args.input_directory, args.output_directory) | ||||
|     nlp_pipeline = NLPPipeline(args.check_encoding, jobs, args.language, | ||||
|                                args.n_cores, args.o, args.zip) | ||||
|     retval = nlp_pipeline.run(dataDirRoot=(args.log_dir or args.o), | ||||
|                               nCores=args.n_cores) | ||||
|                                args.n_cores, args.output_directory, args.zip) | ||||
|     retval = nlp_pipeline.run( | ||||
|         dataDirRoot=(args.log_dir or args.output_directory), | ||||
|         nCores=args.n_cores | ||||
|     ) | ||||
|     sys.exit(retval) | ||||
|  | ||||
|  | ||||
|   | ||||
							
								
								
									
										17
									
								
								spacy-nlp
									
									
									
									
									
								
							
							
						
						
									
										17
									
								
								spacy-nlp
									
									
									
									
									
								
							| @@ -22,9 +22,10 @@ SPACY_MODELS = {'de': 'de_core_news_sm', | ||||
| # Parse the given arguments | ||||
| parser = ArgumentParser(description=('Tag a text file with spaCy and save it ' | ||||
|                                      'as a verticalized text file.')) | ||||
| parser.add_argument('i', metavar='txt-sourcefile') | ||||
| parser.add_argument('o', metavar='vrt-destfile') | ||||
| parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(), | ||||
| parser.add_argument('-i', '--input', metavar='txt-sourcefile', required=True) | ||||
| parser.add_argument('-o', '--output', metavar='vrt-destfile', required=True) | ||||
| parser.add_argument('-l', '--language', | ||||
|                     choices=SPACY_MODELS.keys(), | ||||
|                     required=True) | ||||
| parser.add_argument('--check-encoding', action='store_true') | ||||
| args = parser.parse_args() | ||||
| @@ -33,7 +34,7 @@ args = parser.parse_args() | ||||
| # If requested: Check the encoding of the text contents from the input file | ||||
| # Else: Use utf-8 | ||||
| if args.check_encoding: | ||||
|     with open(args.i, "rb") as input_file: | ||||
|     with open(args.input, "rb") as input_file: | ||||
|         bytes = input_file.read() | ||||
|         encoding = chardet.detect(bytes)['encoding'] | ||||
| else: | ||||
| @@ -41,14 +42,14 @@ else: | ||||
|  | ||||
|  | ||||
| # Hash the file in chunks to avoid exhausting RAM with huge files. | ||||
| with open(args.i, 'rb') as input_file: | ||||
| with open(args.input, 'rb') as input_file: | ||||
|     md5_hash = hashlib.md5() | ||||
|     for chunk in iter(lambda: input_file.read(128 * md5_hash.block_size), b''): | ||||
|         md5_hash.update(chunk) | ||||
|     md5_hash = md5_hash.hexdigest() | ||||
|  | ||||
| # Load the text contents from the input file | ||||
| with open(args.i, encoding=encoding) as input_file: | ||||
| with open(args.input, encoding=encoding) as input_file: | ||||
|     text = input_file.read() | ||||
|     # spaCy's NLP is limited to strings of at most 1 million characters at | ||||
|     # once, so we split the text into suitable chunks. | ||||
| @@ -64,8 +65,8 @@ nlp = spacy.load(SPACY_MODELS[args.language]) | ||||
|  | ||||
| # Create the output file in verticalized text format | ||||
| # See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html | ||||
| output_file_original_filename = args.o | ||||
| output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt') | ||||
| output_file_original_filename = args.output | ||||
| output_file_stand_off_filename = args.output.replace('.vrt', '.stand-off.vrt') | ||||
| common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n' | ||||
|               + '<corpus>\n' | ||||
|               + '<text>\n' | ||||
|   | ||||
							
								
								
									
										16
									
								
								wrapper/nlp
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								wrapper/nlp
									
									
									
									
									
								
							| @@ -12,17 +12,21 @@ UID = str(os.getuid()) | ||||
| GID = str(os.getgid()) | ||||
|  | ||||
| parser = ArgumentParser(add_help=False) | ||||
| parser.add_argument('-i') | ||||
| parser.add_argument('-o') | ||||
| parser.add_argument('-i', '--input-directory') | ||||
| parser.add_argument('-o', '--output-directory') | ||||
| args, remaining_args = parser.parse_known_args() | ||||
|  | ||||
| cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] | ||||
| if args.o is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.o), CONTAINER_OUTPUT_DIR)] | ||||
| if args.output_directory is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory), | ||||
|                                  CONTAINER_OUTPUT_DIR)] | ||||
|     remaining_args.insert(0, CONTAINER_OUTPUT_DIR) | ||||
| if args.i is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.i), CONTAINER_INPUT_DIR)] | ||||
|     remaining_args.insert(0, '-o') | ||||
| if args.input_directory is not None: | ||||
|     cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory), | ||||
|                                  CONTAINER_INPUT_DIR)] | ||||
|     remaining_args.insert(0, CONTAINER_INPUT_DIR) | ||||
|     remaining_args.insert(0, '-i') | ||||
| cmd.append(CONTAINER_IMAGE) | ||||
| cmd += remaining_args | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user