#!/usr/bin/env python3
# coding=utf-8

import argparse
import os
import spacy
import textwrap

parser = argparse.ArgumentParser(
    description=('Tag a text file with spaCy and save it as a verticalized '
                 'text file.')
)
parser.add_argument(
    'i',
    metavar='txt-sourcefile',
)
parser.add_argument(
    '-l',
    choices=['de', 'en', 'es', 'fr', 'pt'],
    dest='lang',
    required=True
)
parser.add_argument(
    'o',
    metavar='vrt-destfile',
)
args = parser.parse_args()

SPACY_MODELS = {
    'de': 'de_core_news_sm',
    'en': 'en_core_web_sm',
    'es': 'es_core_news_sm',
    'fr': 'fr_core_news_sm',
    'pt': 'pt_core_news_sm'
}

# Load the spaCy language model for the requested language
nlp = spacy.load(SPACY_MODELS[args.lang])

# Read text from the input file and, if necessary, split it into parts of
# less than 1 million characters each (spaCy's default max_length limit).
with open(args.i) as input_file:
    text = input_file.read()
    texts = textwrap.wrap(text, 1000000, break_long_words=False)
# Release the full text; only the chunks in "texts" are needed from here on
text = None

# Create and open the output file, then write the .vrt header
output_file = open(args.o, 'w+')
output_file.write(
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<corpus>\n'
    '<text id="%s">\n' % (
        os.path.basename(args.i).rsplit(".", 1)[0]
    )
)
for text in texts:
    # Run the spaCy pipeline over the text (partial string if the input was
    # above 1 million characters)
    doc = nlp(text)
    for sent in doc.sents:
        output_file.write('<s>\n')
        for token in sent:
            # Skip whitespace tokens like "\n" or "\t"
            if token.text.isspace():
                continue
            # Write one token per line in .vrt style:
            # text, lemma, simple_pos, pos, ner
            output_file.write(
                token.text + '\t'
                + token.lemma_ + '\t'
                + token.pos_ + '\t'
                + token.tag_ + '\t'
                + (token.ent_type_ if token.ent_type_ != '' else 'NULL')
                + '\n'
            )
        output_file.write('</s>\n')
output_file.write('</text>\n</corpus>')
output_file.close()
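
# Example invocation (a sketch: the filename 'spacy_nlp.py' is hypothetical,
# and the matching spaCy model must be installed beforehand, e.g. via
# 'python -m spacy download en_core_web_sm'):
#
#     python spacy_nlp.py -l en input.txt output.vrt
#
# This tags input.txt with the small English model and writes the
# verticalized result to output.vrt.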