bundesdata_markup_nlp_software/bundesdata_markup_nlp/utility/XMLProtocol.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging
import os
import re
import shutil
import subprocess
from os import path
from xml.etree import ElementTree

from lxml import etree

from utility import delete_folder
from utility import update_config


class XMLProtocol(object):
    """Class for standard operations on/with the XML protocols. Has functions
    for reading, saving and manipulating an XML protocol. All other classes
    inherit from this one.
    """

    def __init__(self):
        super().__init__()
        self.logger = logging.getLogger(__name__)

    def read_protcol(self, file_path):
        """
        Parses the file at file_path as XML and returns its root element.

        Besides returning the root, the method remembers the path
        (self.file_path), its base name (self.filename) and the parsed tree
        (self.tree) on the instance.
        """
        self.file_path = file_path
        self.filename = os.path.basename(file_path)
        # Blank text is removed while parsing for better xml indentation.
        blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
        self.tree = etree.parse(file_path, blank_stripping_parser)
        root_element = self.tree.getroot()
        self.logger.info("File successfully parsed as XML.")
        return root_element

    def read_xml(self, file_path):
        """
        Parses the file at file_path as UTF-8 XML and stores the root element
        in self.xml_tree. Nothing is returned.
        """
        # Blank text is removed while parsing for better xml indentation.
        utf8_parser = etree.XMLParser(encoding='utf-8', remove_blank_text=True)
        parsed_document = etree.parse(file_path, utf8_parser)
        self.xml_tree = parsed_document.getroot()

    def save_to_file(self, output_path, file_path, subfolder, config_section,
                     config_key, filename_sufix=""):
        """
        Writes the current markup (self.xml_tree) to a new XML file.

        The file is written to <output_path>/<subfolder>; the folder is
        created if it does not exist yet. The file name is the base name of
        file_path. If filename_sufix is given, the extension of that base name
        is replaced by the suffix (e.g. "a.xml" + "_new.xml" -> "a_new.xml").
        The chosen name is also stored in self.filename.

        After writing, the save folder is handed to
        update_config.update_config together with "config.ini", config_section
        and config_key.
        """
        file_name = path.basename(file_path)
        if filename_sufix:
            # splitext removes the real extension regardless of its length,
            # instead of blindly cutting off the last four characters.
            file_name = path.splitext(file_name)[0] + filename_sufix
        self.filename = file_name

        save_path = os.path.join(output_path, subfolder)
        # Creates missing parent folders too and does not fail if the folder
        # appears between a check and the creation.
        os.makedirs(save_path, exist_ok=True)

        save_file_path = os.path.join(save_path, self.filename)
        etree.ElementTree(self.xml_tree).write(
            save_file_path,
            pretty_print=True,
            xml_declaration=True,
            encoding="utf8",
            doctype=("<!DOCTYPE dbtplenarprotokoll SYSTEM "
                     "'dbtplenarprotokoll_minimal.dtd'>"))
        self.logger.info("New XML saved to:" + save_file_path)
        update_config.update_config("config.ini", config_section, config_key,
                                    save_path)

    def beautify_xml_part(self, file_path, xpath, alter_lines=False,
                          line_width=80):
        """
        Beautifies part (element node) of an input XML.

        The first element of self.xml_tree matching xpath is copied, written
        to "<folder of file_path>/tmp/tmp.xml", reformatted there through
        self.beautify_xml (alter_lines and line_width are passed on) and read
        back. The root element of the result is stored in
        self.beautified_part.

        On success the tmp folder is handed to update_config.update_config
        ("config.ini", section "File paths", key "tmp_path"). The tmp folder
        is removed afterwards in every case, also when a step fails.

        Raises ValueError if no element matches xpath.
        """
        part = etree.ElementTree(self.xml_tree).find(xpath)
        if part is None:
            raise ValueError("No element matches the path: {}".format(xpath))
        # Serialise and re-parse with lxml itself to get a copy that is
        # detached from self.xml_tree. The tail text belongs to the parent
        # and would not be well-formed after the root element, so it is
        # left out.
        detached_part = etree.fromstring(etree.tostring(part, with_tail=False))

        tmp_path = os.path.join(os.path.dirname(file_path), "tmp")
        os.makedirs(tmp_path, exist_ok=True)
        tmp_file_path = os.path.join(tmp_path, "tmp.xml")
        try:
            etree.ElementTree(detached_part).write(tmp_file_path,
                                                   pretty_print=True,
                                                   xml_declaration=True,
                                                   encoding="utf8")
            self.beautify_xml(tmp_file_path, alter_lines, line_width)
            self.beautified_part = etree.parse(tmp_file_path).getroot()
            update_config.update_config("config.ini", "File paths", "tmp_path",
                                        tmp_path)
        finally:
            # Never leave the temporary folder behind, even on errors.
            delete_folder.delete_folder(tmp_path)

    def beautify_xml(self, file_path, alter_lines=False, line_width=80):
        """
        Reformats the file at file_path with the external html-beautify
        command line tool, called with "-r -q" as before.

        If alter_lines is set, "-w <line_width> --no-preserve-newlines" is
        added to the call; otherwise line_width is ignored.

        The call is best effort: the exit status of the tool is not checked
        and a tool that cannot be started only produces a log warning.
        """
        executable = shutil.which("html-beautify") or "html-beautify"
        command = [executable, "-r", "-q"]
        if alter_lines:
            command += ["-w", str(line_width), "--no-preserve-newlines"]
        command.append(str(file_path))
        try:
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact.
            subprocess.run(command, check=False)
        except OSError as error:
            logging.getLogger(__name__).warning(
                "Could not run html-beautify: %s", error)

    def expand_element(self, element_to_expand, expand_attr_key,
                       expand_attr_value, check_child=True):
        """
        This function takes an XPath expression for an xml element.
        The tag of this element will be expanded with the given
        expand_attrkey and expand_attr_value. Also needs a regex to determine if
        the current selected element is an element which should be replaced.
        For this the text of the first child of the current element is checked
        against the given regex. Per default the child element text of the
        current element is checked wether the regex matches the string or not.
        Set check_child to False to avoid this and just expand the current
        element.
        """
        elements = self.xml_tree.findall(element_to_expand)
        for element in elements:
            if(check_child is True):
                first_child = element.getchildren()[0]
                match = self.regex_compiled.search(first_child.text)
                if(match):
                    element.set(expand_attr_key, expand_attr_value)
                self.xml_tree = self.xml_tree
            else:
                element.set(expand_attr_key, expand_attr_value)
                self.xml_tree = self.xml_tree

    def replace_tag_name(self, element_to_replace, tag_name, check_child=True):
        """
        Replaces a given element tag(as XPath) name with a new tag name.
        """
        elements = self.xml_tree.findall(element_to_replace)
        for element in elements:
            if(check_child is True):
                first_child = element.getchildren()[0]
                match = self.regex_compiled.search(first_child.text)
                if(match):
                    element.tag = tag_name
            else:
                element.tag = tag_name
        self.xml_tree = self.xml_tree

    def replace_tag_attr(self, element_to_replace, tag_name, attr_key,
                         attr_value, check_child=True):
        """
        Replaces tag name of given element(as XPath) with new name and adds an
        attribute Can also check if the child of the current element contains
        some specific text like in the expand_element function.
        """
        elements = self.xml_tree.findall(element_to_replace)
        for element in elements:
            if(check_child is True):
                first_child = element.getchildren()[0]
                match = self.regex_compiled.search(first_child.text)
                if(match):
                    element.tag = tag_name
                    element.set(attr_key, attr_value)
            else:
                element.tag = tag_name
                element.set(attr_key, attr_value)
        self.xml_tree = self.xml_tree

    def replace_elements(self, elements_to_replace, replacment_elements,
                         keep_parent_text=False):
        """
        Replaces elements identifeid by XPath with new elements. Can either keep
        the text of the parent element or not.
        """
        elements = self.xml_tree.findall(elements_to_replace)
        parents_text_xpath = elements_to_replace + "/" + "parent::node()" + "/" + "text()"
        elements_text = self.xml_tree.xpath(parents_text_xpath)
        if(len(elements) == len(replacment_elements)):
            if(keep_parent_text is False):
                for element, replacement_element in zip(elements, replacment_elements):
                    element.getparent().replace(element, replacement_element)
            else:
                for element, replacement_element in zip(elements, replacment_elements):
                    element.getparent().replace(element, replacement_element)
                self.xml_tree = self.xml_tree
                elements = self.xml_tree.findall(elements_to_replace)
                for element, text in zip(elements, elements_text):
                    element.tail = text
            self.xml_tree = self.xml_tree
        else:
            self.logger.warning(("Elements missmatch. There are "
                                 + str(len(elements))
                                 + " that should be repalced."
                                 + " There are " + str(len(replacment_elements))
                                 + " present."
                                 + " No elements have been replaced."))

    def compile_regex(self, regex):
        self.regex_string = regex
        """
        Takes the input regex string and compiles it for better performance
        and redability.
        """
        self.regex_compiled = re.compile(self.regex_string, re.MULTILINE)

    def clean_text(self, regex, xpath, replacement_string="",):
        """
        Replaces regex matches with nothing by default or replacement string
        for an element matched by the xpath in the xml_tree. Works with
        matchgroups.
        """
        elements = self.xml_tree.xpath(xpath)
        for element in elements:
            replaced = re.sub(regex, replacement_string, element.text)
            element.text = replaced
        self.xml_tree = self.xml_tree