#!/usr/bin/env python3.7
# coding=utf-8

from argparse import ArgumentParser
from xml.sax.saxutils import escape
import chardet
import hashlib
import os
import spacy
import textwrap

SPACY_MODELS = {'de': 'de_core_news_lg',
                'el': 'el_core_news_lg',
                'en': 'en_core_web_lg',
                'es': 'es_core_news_lg',
                'fr': 'fr_core_news_lg',
                'it': 'it_core_news_lg',
                'nl': 'nl_core_news_lg',
                'pt': 'pt_core_news_lg'}
SPACY_MODELS_VERSION = os.environ.get('SPACY_MODELS_VERSION')
SPACY_VERSION = os.environ.get('SPACY_VERSION')

# Parse the given arguments
parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
                                     'as a verticalized text file.'))
parser.add_argument('-i', '--input', metavar='txt-sourcefile', required=True)
parser.add_argument('-o', '--output', metavar='vrt-destfile', required=True)
parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(), required=True)  # noqa
parser.add_argument('--check-encoding', action='store_true')
args = parser.parse_args()

# If requested: detect the encoding of the text contents of the input file
# Else: assume utf-8
if args.check_encoding:
    with open(args.input, 'rb') as input_file:
        raw_bytes = input_file.read()
    encoding = chardet.detect(raw_bytes)['encoding']
else:
    encoding = 'utf-8'

# Compute the md5 checksum of the source file, hashing in chunks to avoid
# loading huge files into RAM all at once.
with open(args.input, 'rb') as input_file:
    source_md5 = hashlib.md5()
    for chunk in iter(lambda: input_file.read(128 * source_md5.block_size), b''):
        source_md5.update(chunk)
    source_md5 = source_md5.hexdigest()

# Load the text contents from the input file
with open(args.input, encoding=encoding) as input_file:
    text = input_file.read()

# spaCy's NLP pipeline is limited to strings of at most 1 million characters
# at once, so split the text into suitably sized chunks.
text_chunks = textwrap.wrap(text, 1000000, break_long_words=False)

# The text variable potentially occupies a lot of system memory and is no
# longer needed...
del text

# Set up the spaCy toolkit by loading the chosen language model
model = SPACY_MODELS[args.language]
nlp = spacy.load(model)

# Create the output files in verticalized text format
# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
output_file_original_filename = args.output
output_file_stand_off_filename = args.output.replace('.vrt', '.stand-off.vrt')
# Shared XML header: declaration, opening structural tags and the md5
# checksum of the source file
common_xml = ('<?xml version="1.0" encoding="UTF-8"?>\n'
              + '<corpus>\n'
              + '<text>\n'
              + '<!-- source md5: {} -->\n'.format(source_md5))

with open(output_file_original_filename, 'w+') as output_file_original, \
     open(output_file_stand_off_filename, 'w+') as output_file_stand_off:
    output_file_original.write(common_xml)
    output_file_stand_off.write(common_xml)
    text_offset = 0
    for text_chunk in text_chunks:
        doc = nlp(text_chunk)
        for sent in doc.sents:
            output_file_original.write('<s>\n')
            output_file_stand_off.write('<s>\n')
            # Skip whitespace tokens
            sent_no_space = [token for token in sent
                             if not token.text.isspace()]
            # No-space variant for the original CWB .vrt file input.
            for token in sent_no_space:
                output_file_original.write(
                    '{}'.format(escape(token.text))
                    + '\t{}'.format(escape(token.lemma_))
                    + '\t{}'.format(token.pos_)
                    + '\t{}'.format(token.tag_)
                    + '\t{}\n'.format(token.ent_type_ or 'NULL'))
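            # Note on the offsets written below: token.idx is relative to the
            # current text chunk, so text_offset (the accumulated length of
            # all previous chunks) is added to approximate positions in the
            # full source text. Since textwrap.wrap() drops the whitespace it
            # splits on, these offsets can drift slightly near chunk
            # boundaries.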
            # Stand-off variant with spaces.
            for token in sent:
                token_start = token.idx + text_offset
                token_end = token.idx + len(token.text) + text_offset
                output_file_stand_off.write(
                    '{}:{}'.format(token_start, token_end)
                    + '\t{}'.format(escape(token.lemma_))
                    + '\t{}'.format(token.pos_)
                    + '\t{}'.format(token.tag_)
                    + '\t{}\n'.format(token.ent_type_ or 'NULL'))
            output_file_original.write('</s>\n')
            output_file_stand_off.write('</s>\n')
        # Advance the offset by the length of the processed chunk plus the
        # one whitespace character that textwrap dropped at the chunk boundary
        text_offset = token_end + 1
    output_file_original.write('</text>\n</corpus>\n')
    output_file_stand_off.write('</text>\n</corpus>\n')
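# Example invocation (assuming this script is saved as spacy-nlp.py and the
# required spaCy model, e.g. de_core_news_lg, is installed):
#
#     python3.7 spacy-nlp.py -i input.txt -o output.vrt -l de --check-encoding
#
# This produces output.vrt (token text) and output.stand-off.vrt (character
# offsets) side by side.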