#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.XMLProtocol import XMLProtocol
from utility import update_config
from lxml import etree
from datetime import datetime
from babel.dates import format_date
import os
import re
import logging
import configparser


class MetadataMarkup(XMLProtocol):
    """
    This class opens one XML protocol, extracts the included metadata and
    creates a new valid metadata head.
    """

    def __init__(self):
        super().__init__()
        self.plenarprotokoll_string = str()  # will be extracted with extract_metadata()
        self.wahlperiode = int()  # will be extracted with extract_metadata()
        self.sitzungsnr = int()  # will be extracted with extract_metadata()
        self.herausgeber = "Deutscher Bundestag"  # Always the same in every protocoll
        self.berichtart = "Steongrafischer Bericht"  # Always the same in every protocoll
        self.sitzungstitel_string = ". Sitzung"  # Always the same in every protocoll
        self.ort = "Berlin"  # Always the same in every protocoll
        self.datum_ger_non_iso = str()  # will be extracted with extract_metadata()
        self.datum_iso = str()  # ISO-date will be built from self.datum_ger_non_iso
        self.datum_string = str()  # will be built from self.datum_iso
        self.attachment = str()  # will be extracted from a split. Will not work
        # all the time. But will not break the XML.
        self.logger = logging.getLogger(__name__)

    def extract_metadata(self, etree_element_object):
        """
        Extracts metadata from the given XML-tags and wirtes them into the
        instance variables
        """
        root = etree_element_object
        metadata_list = []
        for element in root.iter():
            if(element.tag != "TEXT"):
                metadata_list.append(element.text)
        metadata_list = metadata_list[1:]
        self.wahlperiode = metadata_list[0]
        self.plenarprotokoll_string = metadata_list[1].lower().title()
        self.sitzungsnr = metadata_list[2].split("/")[1]
        self.datum_ger_non_iso = metadata_list[3]
        self.logger.info("Metadata successfully extracted.")
        self.logger.info("Wahlperiode is:" + self.wahlperiode)
        self.logger.info("Plenarprotokoll is:" + self.plenarprotokoll_string)
        self.logger.info("Sitzungsnummer is:" + self.sitzungsnr)
        self.logger.info("German non ISO date is:" + self.datum_ger_non_iso)

    def built_iso_date(self, ger_date):
        """
        Gets the german date and converts it to an ISO standard date.
        """
        self.datum_iso = datetime.strptime(ger_date, "%d.%m.%Y").date()
        self.logger.info("ISO date created:" + str(self.datum_iso))

    def built_date_string(self, iso_date):
        """
        Formats the given date as a full German date string, inserts " den"
        after every comma and stores the result in self.datum_string.
        """
        full_date = format_date(iso_date, format="full", locale="de_DE")
        self.datum_string = full_date.replace(",", ", den")
        self.logger.info("Date string created:" + self.datum_string)

    def delete_old_metadata(self, etree_element_object):
        """
        Deletes old metadata tags and text. Renames root tag.
        """
        for element in etree_element_object.iter():
            if(element.tag != "TEXT" and element.tag != "DOKUMENT"):
                element.getparent().remove(element)
            elif(element.tag == "DOKUMENT"):
                element.tag = "dbtplenarprotokoll"
            elif(element.tag == "TEXT"):
                self.full_content = element.text
                element.getparent().remove(element)
        self.logger.info("Old metadata deleted.")

    def insert_new_metadata(self, etree_element_object):
        """
        Inserts the extracted metadata and splitted content into new created
        and valid xml tags according to the official schema.
        """
        vorspann_element = etree.Element("vorspann")
        xml_string = """
    <kopfdaten>
        <plenarprotokoll-nummer>{} <wahlperiode>{}</wahlperiode>/<sitzungsnr>{}</sitzungsnr>
        (neu)</plenarprotokoll-nummer>
        <herausgeber>{}</herausgeber>
        <berichtart>{}</berichtart>
        <sitzungstitel><sitzungsnr>{}</sitzungsnr>. Sitzung</sitzungstitel>
        <veranstaltungsdaten><ort>{}</ort>, <datum date="{}">{}</datum></veranstaltungsdaten>
    </kopfdaten>"""\
            .format(self.plenarprotokoll_string, self.wahlperiode,
                    self.sitzungsnr, self.herausgeber, self.berichtart,
                    self.sitzungsnr, self.ort, self.datum_ger_non_iso,
                    self.datum_string)
        etree_from_str = etree.fromstring(xml_string)
        etree_element_object.insert(0, vorspann_element)
        vorspann_element.append(etree_from_str)
        toc_element = etree.Element("inhaltsverzeichnis")
        toc_element.text = self.toc
        vorspann_element.append(toc_element)
        content_element = etree.Element("sitzungsverlauf")
        content_element.text = self.president + self.content
        etree_element_object.insert(2, content_element)
        anlagen_element = etree.Element("anlagen")
        anlagen_element. text = self.attachment
        etree_element_object.insert(3, anlagen_element)
        rednerliste_element = etree.Element("rednerliste",
                                            sitzungsdatum=self.datum_ger_non_iso)
        etree_element_object.insert(4, rednerliste_element)
        self.xml_tree = etree_element_object
        self.logger.info("New metadata XML-head inserted." + xml_string)

    def split_content(self, etree_element_object):
        """Splits the full content to: table of content, speeches and in some
        cases attachments."""
        config = configparser.ConfigParser()
        config.read("config.ini")

        session_start_split = config["Regular expressions splits"]["session_start_president_split"]
        regex_start = re.compile(session_start_split)
        tmp_list = regex_start.split(self.full_content, maxsplit=1)
        self.toc = tmp_list[0]
        self.president = tmp_list[1]
        self.content = tmp_list[2]

        attachment_split = config["Regular expressions splits"]["attachment_split"]
        regex_att = re.compile(attachment_split)
        tmp_list = regex_att.split(self.content)
        tmp_list = [element for element in tmp_list if element is not None]
        if(tmp_list[-1] == ""):  # if the split does not match anything last item is empty string.
            self.content = "".join(tmp_list[0:-1])
            self.attachment = "Keine Anlage extrahiert."
            self.logger.warning(("There is no attachment."))
        else:
            self.content = "".join(tmp_list[0:-1])
            self.attachment = tmp_list[-1]
            self.logger.info("Attachment found.")
        self.logger.info("Contet splitted at:" + str(regex_start))
        self.logger.info("Contet splitted at:" + str(regex_att))

    def get_session_times(self):
        """This function looks into the entire protocoll content to extract the
        last closing time and the starting time. If only one of both or none are
        found, the missing time will be set to xx:xx."""
        config = configparser.ConfigParser()
        config.read("config.ini")
        regex_conf_values = config.items("Regular expressions time extraction")
        regex_conf_values = [regex[1] for regex in regex_conf_values]
        tmp_list = []
        identifier = 0
        start_time_found = True
        end_time_found = True

        for regex in (regex_conf_values):
            identifier += 1
            regex = re.compile(regex)
            if(identifier == 1):
                # Always gets first start time.
                matches = list(regex.finditer(self.full_content))
                if(len(matches) > 1):
                    match = matches[-1]
                elif(len(matches) == 0):
                    match = None
                else:
                    match = matches[0]
            elif(identifier == 2):
                # Always gets last closing time
                matches = list(regex.finditer(self.full_content))
                if(len(matches) > 1):
                    match = matches[-1]
                elif(len(matches) == 0):
                    match = None
                else:
                    match = matches[0]

            if(match is None and identifier == 1):
                self.logger.warning("No start time found for " + str(regex))
                start_time_found = False
            elif(match is None and identifier == 2):
                self.logger.warning("No end time found for " + str(regex))
                end_time_found = False
            elif(match):
                session_time = [group for group in match.groups()
                                if group is not None]
                session_time = ["0" + group if len(group) == 1 else group for
                                group in session_time]  # Adds a 0 in front if digit len is 1
                if(len(session_time) == 2):
                    tmp_list.append(":".join(session_time))
                elif(len(session_time) == 1):
                    tmp_list.append(session_time[0] + ":00")

        if(len(tmp_list) == 2):
            self.session_start_time = tmp_list[0]
            self.session_end_time = tmp_list[1]
            self.logger.info("Start time found: " + self.session_start_time)
            self.logger.info("End time found: " + self.session_end_time)
            self.logger.info("Successfully matched start and end times.")
        elif(len(tmp_list) == 1 and start_time_found is True and end_time_found
             is False):
            self.session_start_time = tmp_list[0]
            self.session_end_time = "xx:xx"
            self.logger.warning("Only start time found: "
                                + self.session_start_time)
            self.logger.warning("End time set to: "
                                + self.session_end_time)
        elif(len(tmp_list) == 1 and start_time_found is False and end_time_found
             is True):
            self.session_end_time = tmp_list[0]
            self.session_start_time = "xx:xx"
            self.logger.warning("Only end time found: "
                                + self.session_end_time)
            self.logger.warning("Start time set to: "
                                + self.session_start_time)

    def write_to_attr(self, element, attr_key, attr_value):
        """
        Writes two strings as a an attribute key value pair to a given
        element.
        """
        elements = self.xml_tree.findall(element)
        if(elements == []):
            element = self.tree.getroot()
            elements.append(element)
        for element in elements:
            element.set(attr_key, attr_value)
            self.xml_tree = self.xml_tree
            self.logger.info("Wrote attribute "
                             + attr_key
                             + "="
                             + "\""
                             + attr_value
                             + "\"")

    def save_to_file(self, output_path, file_path, subfolder, config_section,
                     config_key):
        """
        Writes the new markup to a new XML file and records the output
        folder in config.ini.

        The file keeps the base name of file_path and is written to
        output_path/subfolder, which is created if it does not exist yet.
        Afterwards update_config is called with config_section, config_key
        and the folder path. The base name is also stored in self.filename.
        """
        self.filename = os.path.basename(file_path)
        save_path = os.path.join(output_path, subfolder)
        # makedirs also creates missing parent folders and does not fail if
        # the folder already exists (no check-then-create race).
        os.makedirs(save_path, exist_ok=True)
        save_file_path = os.path.join(save_path, self.filename)
        tree = etree.ElementTree(self.xml_tree)
        tree.write(save_file_path,
                   pretty_print=True,
                   xml_declaration=True,
                   encoding="utf8",
                   doctype="<!DOCTYPE dbtplenarprotokoll SYSTEM 'dbtplenarprotokoll_minimal.dtd\'>")
        self.logger.info("New XML saved to:" + save_file_path)
        update_config.update_config("config.ini", config_section, config_key,
                                    save_path)