#!/usr/bin/env python3.7
# coding=utf-8
"""Tag a plain-text file with spaCy and save it as a verticalized text file.

Command line: ``[--check-encoding] -l LANGUAGE txt-sourcefile vrt-destfile``

The input text is split into chunks, each chunk is processed by the spaCy
model selected via ``--language``, and every non-whitespace token is written
on its own line as five tab-separated columns: the XML-escaped token text,
the XML-escaped lemma, spaCy's ``pos_``, spaCy's ``tag_`` and the entity
type (``NULL`` when the token has none).
"""

from argparse import ArgumentParser
from xml.sax.saxutils import escape
import textwrap

import chardet
import spacy

# Maps the accepted --language codes to the spaCy model loaded for them.
SPACY_MODELS = {'de': 'de_core_news_sm',
                'el': 'el_core_news_sm',
                'en': 'en_core_web_sm',
                'es': 'es_core_news_sm',
                'fr': 'fr_core_news_sm',
                'it': 'it_core_news_sm',
                'nl': 'nl_core_news_sm',
                'pt': 'pt_core_news_sm'}

# spaCy's NLP is limited to strings with a maximum of 1 million characters
# at once, so the input is processed in chunks of at most this width.
MAX_CHUNK_LENGTH = 1000000

# Parse the given arguments
parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
                                     'as a verticalized text file.'))
parser.add_argument('i', metavar='txt-sourcefile')
parser.add_argument('o', metavar='vrt-destfile')
parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
                    required=True)
parser.add_argument('--check-encoding', action='store_true')
args = parser.parse_args()

# If requested: detect the encoding of the input file's contents.
# Else: use utf-8.
if args.check_encoding:
    with open(args.i, 'rb') as input_file:
        raw_bytes = input_file.read()
    # chardet reports None when it cannot make a guess; fall back to utf-8
    # explicitly instead of letting open() silently use the locale default.
    encoding = chardet.detect(raw_bytes)['encoding'] or 'utf-8'
    # The raw copy is only needed for detection; release it before the
    # decoded text is loaded.
    del raw_bytes
else:
    encoding = 'utf-8'

# Load the text contents from the input file
with open(args.i, encoding=encoding) as input_file:
    text = input_file.read()

# Split the text into chunks that are small enough for spaCy.
# break_long_words=False keeps words intact, so a single word longer than
# the limit still ends up in one (oversized) chunk.
text_chunks = textwrap.wrap(text, MAX_CHUNK_LENGTH, break_long_words=False)

# The text variable potentially occupies a lot of system memory and is no
# longer needed...
del text

# Set up the spaCy toolkit by loading the chosen language model
nlp = spacy.load(SPACY_MODELS[args.language])

# Create the output file in verticalized text format
# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
#
# The context manager guarantees the file is flushed and closed even when
# tagging or writing fails; the encoding is pinned so the result does not
# depend on the locale of the machine running the script.
#
# NOTE(review): the header, sentence and footer writes below emit only
# newlines. The referenced format usually carries structural XML tags at
# these positions -- confirm that nothing was lost from these literals.
with open(args.o, 'w', encoding='utf-8') as output_file:
    output_file.write('\n\n\n')
    for text_chunk in text_chunks:
        doc = nlp(text_chunk)
        for sent in doc.sents:
            output_file.write('\n')
            for token in sent:
                # Skip whitespace tokens
                if token.text.isspace():
                    continue
                columns = (escape(token.text),
                           escape(token.lemma_),
                           token.pos_,
                           token.tag_,
                           token.ent_type_ or 'NULL')
                output_file.write('\t'.join(columns) + '\n')
            output_file.write('\n')
    output_file.write('\n')