#!/usr/bin/env python
# -*- coding: utf-8 -*-

import configparser
import re
import sys

import de_core_news_sm
from lxml import etree
from tqdm import tqdm

from utility.XMLProtocol import XMLProtocol


def tokenize(files, no_stop_words=False):
    """
    Tokenizes the speeches of the input XML protocols. Can include or
    exclude stop words. Tokenized speeches will be written into a new
    element <rede_tokenisiert>. Always removes punctuation. Joins
    hyphenated strings before they are tokenized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Tokenization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if part.text is not None:
                    # Replace "_" with " ". Needed because a string like
                    # "Treffsicherheit einer Schrotflinte;_Sie haben nämlich
                    # kaum den Punkt getroffen" is not lemmatized correctly
                    # by spaCy: "Schrotflinte;_Sie" is recognized as one
                    # token, which also messes up the sorted n-gram
                    # calculation. A "\n" is appended to every line to help
                    # identify hyphenated words later on.
                    tmp_list.append(re.sub(r"_", " ", part.text + "\n"))
                part.getparent().remove(part)
            new_text = "".join(tmp_list)
            # Join hyphenated words: "Länderfinanz-\nausgleich" becomes
            # "Länderfinanzausgleich". Better to do this here because most
            # of the comments and metadata have already been marked. Ignores
            # strings like "Finanz-, Handels- und Sicherheitspolitik",
            # except when they break across a line, which is rare.
            new_text = re.sub(
                r"([a-zßüöä])-\n([a-zßäüö])", r"\1\2", new_text)
            # Remove the line break inside hyphenated compound names, so
            # that a name like "Sütterlin-\nWaack" is recognized by spaCy
            # as one string: "Sütterlin-Waack".
            new_text = re.sub(
                r"([a-zßüöä])-\n([A-ZÄÜÖ])", r"\1-\2", new_text)
            tokenized_speech = etree.Element("rede_tokenisiert")
            doc = nlp(new_text)
            if not no_stop_words:
                tokenized = " ".join(
                    token.text for token in doc if token.pos_ != "PUNCT")
                filename_suffix = "_tokenized_with_stopwords.xml"
            else:
                tokenized = " ".join(
                    token.text for token in doc
                    if token.is_stop is False and token.pos_ != "PUNCT")
                filename_suffix = "_tokenized_without_stopwords.xml"
            tokenized_speech.text = tokenized
            speech.append(tokenized_speech)
        xml.save_to_file(output_path, file_path, "tokenized", "File paths",
                         "nlp_lemmatized_tokenized", filename_suffix)


if __name__ == '__main__':
    # Assumption: the XML protocol files to process are passed as
    # command-line arguments (the original call passed no arguments,
    # which would raise a TypeError).
    tokenize(sys.argv[1:])
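

# Illustrative sketch (not part of the original script): demonstrates how the
# two hyphen-joining substitutions in tokenize() behave on a hypothetical
# sample string. It is never called by the pipeline and is safe to delete.
def _demo_hyphen_joining():
    sample = "Länderfinanz-\nausgleich und Sütterlin-\nWaack"
    # Lowercase letter after the hyphenated line break: an ordinary
    # hyphenated word, so both hyphen and line break are dropped.
    sample = re.sub(r"([a-zßüöä])-\n([a-zßäüö])", r"\1\2", sample)
    # Uppercase letter after the hyphenated line break: a compound name,
    # so the hyphen is kept and only the line break is dropped.
    sample = re.sub(r"([a-zßüöä])-\n([A-ZÄÜÖ])", r"\1-\2", sample)
    assert sample == "Länderfinanzausgleich und Sütterlin-Waack"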