#!/usr/bin/env python3.7
# coding=utf-8

from argparse import ArgumentParser
from xml.sax.saxutils import escape
import chardet
import hashlib
import spacy
import textwrap

SPACY_MODELS = {'de': 'de_core_news_sm',
                'el': 'el_core_news_sm',
                'en': 'en_core_web_sm',
                'es': 'es_core_news_sm',
                'fr': 'fr_core_news_sm',
                'it': 'it_core_news_sm',
                'nl': 'nl_core_news_sm',
                'pt': 'pt_core_news_sm'}

# Parse the given arguments
parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
                                     'as a verticalized text file.'))
parser.add_argument('i', metavar='txt-sourcefile')
parser.add_argument('o', metavar='vrt-destfile')
parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
                    required=True)
parser.add_argument('--check-encoding', action='store_true')
args = parser.parse_args()

# If requested, detect the encoding of the input file with chardet;
# otherwise assume UTF-8.
if args.check_encoding:
    with open(args.i, 'rb') as input_file:
        input_bytes = input_file.read()
    encoding = chardet.detect(input_bytes)['encoding']
else:
    encoding = 'utf-8'

# Compute the MD5 hash of the input file in chunks, so that huge files do
# not have to be held in RAM all at once.
with open(args.i, 'rb') as input_file:
    md5_hash = hashlib.md5()
    for chunk in iter(lambda: input_file.read(128 * md5_hash.block_size),
                      b''):
        md5_hash.update(chunk)
    md5_hash = md5_hash.hexdigest()

# Load the text contents from the input file
with open(args.i, encoding=encoding) as input_file:
    text = input_file.read()

# spaCy's NLP pipeline is limited to strings of at most 1 million characters
# per call, so split the text into suitably sized chunks below that limit.
text_chunks = textwrap.wrap(text, 1000000, break_long_words=False)

# The text variable potentially occupies a lot of system memory and is no
# longer needed...
del text

# Set up the spaCy toolkit by loading the chosen language model
nlp = spacy.load(SPACY_MODELS[args.language])

# Create the output files in verticalized text format
# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
output_file_original_filename = args.o
output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt')
output_file_tokens_filename = args.o.replace('.vrt', '.tokens.txt')

xml_head = '''<?xml version="1.0" encoding="UTF-8"?>\n\
<corpus>\n\
<text md5="{md5_hash}" spacy-version="{spacy_version}" spacy-model="{spacy_model}">\n\
'''.format(md5_hash=md5_hash,
           spacy_version=spacy.__version__,
           spacy_model=SPACY_MODELS[args.language])

with open(output_file_original_filename, 'w+') as output_file_original, \
        open(output_file_stand_off_filename, 'w+') as output_file_stand_off, \
        open(output_file_tokens_filename, 'w+') as output_file_tokens:
    output_file_original.write(xml_head)
    output_file_stand_off.write(xml_head)
    output_file_tokens.write(xml_head)
    text_offset = 0
    for text_chunk in text_chunks:
        doc = nlp(text_chunk)
        for sent in doc.sents:
            output_file_original.write('<s>\n')
            output_file_stand_off.write('<s>\n')
            # Skip pure whitespace tokens
            sent_no_space = [token for token in sent
                             if not token.text.isspace()]
            # No-space variant for CWB original .vrt file input.
            for token in sent_no_space:
                output_file_original.write(
                    '{}\t{}\t{}\t{}\t{}\n'.format(escape(token.text),
                                                  escape(token.lemma_),
                                                  token.pos_,
                                                  token.tag_,
                                                  token.ent_type_ or 'NULL'))
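            # For illustration only, a sentence block written above looks
            # roughly like this (the tokens and tag values are made up;
            # fields are tab-separated):
            #
            #     <s>
            #     Example	example	NOUN	NN	NULL
            #     sentence	sentence	NOUN	NN	NULL
            #     .	.	PUNCT	.	NULL
            #     </s>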
            # Stand-off variant that keeps the whitespace tokens and records
            # character offsets relative to the whole text.
            for token in sent:
                token_start = token.idx + text_offset
                token_end = token.idx + len(token.text) + text_offset
                output_file_stand_off.write(
                    '{}:{}\t{}\t{}\t{}\t{}\n'.format(token_start,
                                                     token_end,
                                                     escape(token.lemma_),
                                                     token.pos_,
                                                     token.tag_,
                                                     token.ent_type_ or 'NULL'))
                output_file_tokens.write('{}\n'.format(escape(token.text)))
            output_file_original.write('</s>\n')
            output_file_stand_off.write('</s>\n')
        # Carry the offset over to the next chunk, plus one character for
        # the whitespace that textwrap removed between the chunks.
        text_offset = token_end + 1
    output_file_original.write('</text>\n</corpus>\n')
    output_file_stand_off.write('</text>\n</corpus>\n')
    output_file_tokens.write('</text>\n</corpus>\n')
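
# Usage sketch (the script filename below is illustrative; the spaCy model
# for the chosen language has to be installed beforehand, e.g. with
# `python -m spacy download en_core_web_sm`):
#
#     python3 spacy-nlp.py --language en --check-encoding input.txt output.vrt
#
# With output.vrt as the destination, this writes output.vrt,
# output.stand-off.vrt and output.tokens.txt.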