diff --git a/nlp b/nlp
index 203c922..79af9af 100755
--- a/nlp
+++ b/nlp
@@ -18,33 +18,23 @@ from pyflow import WorkflowRunner
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        description='Performs NLP of documents utilizing spaCy. The results are served as verticalized text files.'
-    )
-
-    parser.add_argument(
-        '-i',
-        dest='input_dir',
-        required=True
+        description=('Performs NLP of documents utilizing spaCy. The results '
+                     'are served as verticalized text files.')
     )
+    parser.add_argument('-i', dest='input_dir', required=True)
     parser.add_argument(
         '-l',
         choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'],
         dest='lang',
         required=True
     )
-    parser.add_argument(
-        '-o',
-        dest='output_dir',
-        required=True
-    )
-    parser.add_argument(
-        '--nCores',
-        default=min(4, multiprocessing.cpu_count()),
-        dest='n_cores',
-        help='total number of cores available',
-        required=False,
-        type=int
-    )
+    parser.add_argument('-o', dest='output_dir', required=True)
+    parser.add_argument('--nCores',
+                        default=min(4, multiprocessing.cpu_count()),
+                        dest='n_cores',
+                        help='total number of cores available',
+                        required=False,
+                        type=int)
 
     return parser.parse_args()
 
@@ -65,11 +55,11 @@ class NLPWorkflow(WorkflowRunner):
         '''
         create_output_directories_jobs = []
         for index, job in enumerate(self.jobs):
-            cmd = 'mkdir -p "%s"' % (job['output_dir'])
+            cmd = 'mkdir -p "{}"'.format(job['output_dir'])
             create_output_directories_jobs.append(
                 self.addTask(
                     command=cmd,
-                    label='create_output_directories_job_-_%i' % (index)
+                    label='create_output_directories_job_-_{}'.format(index)
                 )
             )
 
@@ -84,7 +74,7 @@ class NLPWorkflow(WorkflowRunner):
             max(1, int(self.n_cores / len(self.jobs)))
         )
         for index, job in enumerate(self.jobs):
-            cmd = 'spacy_nlp -l "%s" "%s" "%s"' % (
+            cmd = 'spacy_nlp -l "{}" "{}" "{}"'.format(
                 self.lang,
                 job['path'],
                 os.path.join(job['output_dir'], job['name'] + '.vrt')
@@ -92,8 +82,10 @@
             nlp_jobs.append(
                 self.addTask(
                     command=cmd,
-                    dependencies='create_output_directories_job_-_%i' % (index),
-                    label='nlp_job_-_%i' % (index),
+                    dependencies='create_output_directories_job_-_{}'.format(
+                        index
+                    ),
+                    label='nlp_job_-_{}'.format(index),
                     nCores=nlp_job_n_cores
                 )
             )
@@ -104,19 +96,13 @@ def analyze_jobs(input_dir, output_dir):
 
     for file in os.listdir(input_dir):
         if os.path.isdir(os.path.join(input_dir, file)):
-            jobs += analyze_jobs(
-                os.path.join(input_dir, file),
-                os.path.join(output_dir, file),
-            )
+            jobs += analyze_jobs(os.path.join(input_dir, file),
+                                 os.path.join(output_dir, file))
         elif file.endswith('.txt'):
-            jobs.append(
-                {
-                    'filename': file,
-                    'name': file.rsplit('.', 1)[0],
-                    'output_dir': os.path.join(output_dir, file),
-                    'path': os.path.join(input_dir, file)
-                }
-            )
+            jobs.append({'filename': file,
+                         'name': file.rsplit('.', 1)[0],
+                         'output_dir': os.path.join(output_dir, file),
+                         'path': os.path.join(input_dir, file)})
 
     return jobs
 
diff --git a/spacy_nlp b/spacy_nlp
index a20fdf5..57904d5 100755
--- a/spacy_nlp
+++ b/spacy_nlp
@@ -7,29 +7,25 @@ import spacy
 import textwrap
 
 parser = argparse.ArgumentParser(
-    description='Tag a text file with spaCy and save it as a verticalized text file.'
-)
-parser.add_argument(
-    'i',
-    metavar='txt-sourcefile',
-)
-parser.add_argument(
-    '-l',
-    choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'],
-    dest='lang',
-    required=True
-)
-parser.add_argument(
-    'o',
-    metavar='vrt-destfile',
+    description=('Tag a text file with spaCy and save it as a verticalized '
+                 'text file.')
 )
+parser.add_argument('i', metavar='txt-sourcefile')
+parser.add_argument('-l',
+                    choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'],
+                    dest='lang',
+                    required=True)
+parser.add_argument('o', metavar='vrt-destfile')
 args = parser.parse_args()
 
-SPACY_MODELS = {
-    'de': 'de_core_news_sm', 'el': 'el_core_news_sm', 'en': 'en_core_web_sm',
-    'es': 'es_core_news_sm', 'fr': 'fr_core_news_sm', 'it': 'it_core_news_sm',
-    'nl': 'nl_core_news_sm', 'pt': 'pt_core_news_sm'
-}
+SPACY_MODELS = {'de': 'de_core_news_sm',
+                'el': 'el_core_news_sm',
+                'en': 'en_core_web_sm',
+                'es': 'es_core_news_sm',
+                'fr': 'fr_core_news_sm',
+                'it': 'it_core_news_sm',
+                'nl': 'nl_core_news_sm',
+                'pt': 'pt_core_news_sm'}
 
 # Set the language model for spacy
 nlp = spacy.load(SPACY_MODELS[args.lang])
@@ -45,9 +41,9 @@ with open(args.i) as input_file:
 
 output_file = open(args.o, 'w+')
 output_file.write(
-    '<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="%s">\n' % (
-        os.path.basename(args.i).rsplit(".", 1)[0]
-    )
+    '<?xml version="1.0" encoding="UTF-8"?>\n'
+    '<corpus>\n'
+    '<text id="{}">\n'.format(os.path.basename(args.i).rsplit(".", 1)[0])
 )
 for text in texts:
     # Run spacy nlp over the text (partial string if above 1 million chars)
@@ -61,11 +57,18 @@ for text in texts:
             # Write all information in .vrt style to the output file
             # text, lemma, simple_pos, pos, ner
             output_file.write(
-                token.text + '\t' + token.lemma_ + '\t'
-                + token.pos_ + '\t' + token.tag_ + '\t'
-                + (token.ent_type_ if token.ent_type_ != '' else 'NULL') + '\n'
+                '{}\t{}\t{}\t{}\t{}\n'.format(
+                    token.text,
+                    token.lemma_,
+                    token.pos_,
+                    token.tag_,
+                    token.ent_type_ if token.ent_type_ != '' else 'NULL'
+                )
             )
         output_file.write('</s>\n')
 
-output_file.write('</text>\n</corpus>')
+output_file.write(
+    '</text>\n'
+    '</corpus>'
+)
 output_file.close()
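
For reference, a minimal sketch of the verticalized text (.vrt) file the rewritten spacy_nlp emits for a hypothetical input named example.txt. The frame tags follow the write calls in the patch; the <s> opener is assumed to be written by unchanged code outside these hunks, and the tab-separated token rows (text, lemma, simple_pos, pos, ner) are illustrative values, not real model output:

    <?xml version="1.0" encoding="UTF-8"?>
    <corpus>
    <text id="example">
    <s>
    This	this	PRON	DT	NULL
    works	work	VERB	VBZ	NULL
    .	.	PUNCT	.	NULL
    </s>
    </text>
    </corpus>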