mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
				synced 2025-11-04 03:12:43 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			136 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			136 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env python2.7
 | 
						|
# coding=utf-8
 | 
						|
 | 
						|
"""
 | 
						|
nlp
 | 
						|
 | 
						|
Usage:  For usage instructions run with option --help
 | 
						|
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
 | 
						|
"""
 | 
						|
 | 
						|
 | 
						|
import argparse
 | 
						|
import multiprocessing
 | 
						|
import os
 | 
						|
import sys
 | 
						|
from pyflow import WorkflowRunner
 | 
						|
 | 
						|
 | 
						|
def parse_arguments():
    """Parse the command line options of the NLP pipeline.

    Recognized options:
        -i        stored as ``input_dir`` (required)
        -l        stored as ``lang`` (required); one of
                  de, el, en, es, fr, it, nl, pt
        -o        stored as ``output_dir`` (required)
        --nCores  stored as ``n_cores`` (int); defaults to the smaller of 4
                  and the number of CPUs reported by multiprocessing

    Returns:
        The argparse namespace produced from ``sys.argv``.
    """
    supported_languages = ['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt']
    default_cores = min(4, multiprocessing.cpu_count())

    # (flag, add_argument keyword arguments); the tuple order is the order in
    # which the options are registered and therefore listed by --help.
    option_specs = (
        ('-i', {'dest': 'input_dir', 'required': True}),
        ('-l', {
            'choices': supported_languages,
            'dest': 'lang',
            'required': True,
        }),
        ('-o', {'dest': 'output_dir', 'required': True}),
        ('--nCores', {
            'default': default_cores,
            'dest': 'n_cores',
            'help': 'total number of cores available',
            'required': False,
            'type': int,
        }),
    )

    cli = argparse.ArgumentParser(
        description=(
            'Performs NLP of documents utilizing spaCy. '
            'The results are served as verticalized text files.'
        )
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
 | 
						|
 | 
						|
 | 
						|
class NLPWorkflow(WorkflowRunner):
    """pyflow workflow that runs ``spacy_nlp`` once per discovered job.

    For every job returned by ``analyze_jobs`` two tasks are registered: one
    that creates the job's output directory and one, depending on the first,
    that runs ``spacy_nlp`` and writes ``<name>.vrt`` into that directory.
    """

    def __init__(self, args):
        """Collect jobs and settings from the parsed command line.

        Args:
            args: object providing the attributes ``input_dir``,
                ``output_dir``, ``lang`` and ``n_cores``.
        """
        self.jobs = analyze_jobs(args.input_dir, args.output_dir)
        self.lang = args.lang
        self.n_cores = args.n_cores

    def workflow(self):
        """Register all tasks with pyflow; a no-op when there are no jobs."""
        if not self.jobs:
            return

        # Commands are passed as argument lists rather than shell strings so
        # that paths containing quotes, '$' or backticks reach the programs
        # unchanged instead of being interpreted by a shell.

        # ##################################################
        # # Create output directories                      #
        # ##################################################
        for index, job in enumerate(self.jobs):
            self.addTask(
                command=['mkdir', '-p', job['output_dir']],
                label='create_output_directories_job_-_%i' % (index)
            )

        # ##################################################
        # # Natural language processing                    #
        # ##################################################
        # Split the available cores evenly between the jobs, granting every
        # job at least one core but never more than the total.
        nlp_job_n_cores = min(
            self.n_cores,
            max(1, self.n_cores // len(self.jobs))
        )
        for index, job in enumerate(self.jobs):
            output_file = os.path.join(job['output_dir'], job['name'] + '.vrt')
            self.addTask(
                command=[
                    'spacy_nlp',
                    '-l',
                    self.lang,
                    job['path'],
                    output_file
                ],
                # The output directory must exist before spacy_nlp writes.
                dependencies='create_output_directories_job_-_%i' % (index),
                label='nlp_job_-_%i' % (index),
                nCores=nlp_job_n_cores
            )
 | 
						|
 | 
						|
 | 
						|
def analyze_jobs(input_dir, output_dir):
    """Recursively collect one job description per ``.txt`` file.

    Subdirectories of ``input_dir`` are descended into, mirroring their
    names below ``output_dir``. Entries that are neither directories nor
    end with ``.txt`` are ignored.

    Args:
        input_dir: directory to scan.
        output_dir: directory below which the per-file output directories
            are placed (nothing is created here, only paths are computed).

    Returns:
        A list of dicts with the keys
            'filename':   name of the text file, including the extension
            'name':       file name without its last extension
            'output_dir': ``output_dir`` joined with the full file name,
                          i.e. every input file gets its own directory
            'path':       path of the input file
        Entries of one directory are processed in sorted order.
    """
    jobs = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the job
    # list -- and the index-based task labels derived from it -- stable
    # between runs over the same input.
    for entry in sorted(os.listdir(input_dir)):
        entry_path = os.path.join(input_dir, entry)
        if os.path.isdir(entry_path):
            jobs.extend(
                analyze_jobs(entry_path, os.path.join(output_dir, entry))
            )
        elif entry.endswith('.txt'):
            jobs.append(
                {
                    'filename': entry,
                    'name': entry.rsplit('.', 1)[0],
                    'output_dir': os.path.join(output_dir, entry),
                    'path': entry_path
                }
            )

    return jobs
 | 
						|
 | 
						|
 | 
						|
def main():
    """Run the NLP workflow and exit with the value returned by its run.

    The workflow is run with the parsed ``output_dir`` as pyflow's
    ``dataDirRoot`` and the parsed ``n_cores`` as its ``nCores``.
    """
    cli_args = parse_arguments()
    nlp_workflow = NLPWorkflow(cli_args)
    exit_status = nlp_workflow.run(
        dataDirRoot=cli_args.output_dir,
        nCores=cli_args.n_cores
    )
    sys.exit(exit_status)
 | 
						|
 | 
						|
 | 
						|
# Only start the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()
 |