#!/usr/bin/env python3.5
# coding=utf-8

import argparse
import spacy
import textwrap

parser = argparse.ArgumentParser(
    description=('Tag a text file with spaCy and save it as a verticalized '
                 'text file.')
)
parser.add_argument('i', metavar='txt-sourcefile')
parser.add_argument('-l',
                    choices=['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt'],
                    dest='lang',
                    required=True)
parser.add_argument('o', metavar='vrt-destfile')
args = parser.parse_args()

SPACY_MODELS = {'de': 'de_core_news_sm',
                'el': 'el_core_news_sm',
                'en': 'en_core_web_sm',
                'es': 'es_core_news_sm',
                'fr': 'fr_core_news_sm',
                'it': 'it_core_news_sm',
                'nl': 'nl_core_news_sm',
                'pt': 'pt_core_news_sm'}

# Load the language model for spaCy
nlp = spacy.load(SPACY_MODELS[args.lang])

# Read text from the input file and, if necessary, split it into parts with a
# length of less than 1 million characters (spaCy's default max_length).
with open(args.i) as input_file:
    text = input_file.read()
    texts = textwrap.wrap(text, 1000000, break_long_words=False)
# Drop the reference to the full text so its memory can be reclaimed
text = None

# Create and open the output file
output_file = open(args.o, 'w')

# Open the structural VRT/XML markup around the tagged tokens
output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n'
                  '<corpus>\n'
                  '<text>\n')
for text in texts:
    # Run the spaCy pipeline over the text (partial string if above 1 million
    # characters)
    doc = nlp(text)
    for sent in doc.sents:
        output_file.write('<s>\n')
        for token in sent:
            # Skip whitespace tokens like "\n" or "\t"
            if token.text.isspace():
                continue
            # Write one token per line in .vrt style to the output file:
            # text, lemma, simple_pos, pos, ner
            output_file.write(
                '{}\t{}\t{}\t{}\t{}\n'.format(
                    token.text,
                    token.lemma_,
                    token.pos_,
                    token.tag_,
                    token.ent_type_ if token.ent_type_ != '' else 'NULL'
                )
            )
        output_file.write('</s>\n')
output_file.write('</text>\n'
                  '</corpus>')
output_file.close()
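
# Example invocation (a sketch: "tag.py", "input.txt" and "output.vrt" are
# placeholder names, and the chosen language model must be installed first,
# e.g. with "python -m spacy download en_core_web_sm"):
#
#     ./tag.py -l en input.txt output.vrt
#
# Illustrative excerpt of the resulting .vrt output for the English model
# (columns: text, lemma, simple POS, detailed POS, NER; the exact tag values
# depend on the model and its version):
#
#     <s>
#     Berlin	Berlin	PROPN	NNP	GPE
#     is	be	AUX	VBZ	NULL
#     big	big	ADJ	JJ	NULL
#     .	.	PUNCT	.	NULL
#     </s>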