bundesdata_markup_nlp_software/bundesdata_markup_nlp/markup/EntityMarkup.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from markup.MetadataMarkup import MetadataMarkup
from lxml import etree
from xml.etree import ElementTree
from xml.sax.saxutils import escape
import logging
import os
import re


class EntityMarkup(MetadataMarkup):
    """Class for getting an XML node in which entities will be marked.
    In practice this class and its methods can be used to get the text of a
    given node and to mark every speaker in this text string.
    Also passes methods and fields to the more specific
    SimpleSpeakersMarkup."""

    def __init__(self, file_path, element_name=".//sitzungsverlauf"):
        """
        Stores the path of the protocol file and the x-path of the element
        whose text will be marked up. The file itself is not read here;
        xml_tree starts as None.
        """
        super().__init__()
        self.logger = logging.getLogger(__name__)
        # Where the protocol comes from; filename is the path's last part.
        self.file_path = file_path
        self.filename = os.path.basename(file_path)
        # X-path used by get_element_text() to select elements.
        self.element_name = element_name
        # Working state, empty until the tree and its text are set.
        self.xml_tree = None
        self.current_string = ""

    def get_element_text(self):
        """
        Gets the escaped strings of all elements matched by the element
        x-path (self.element_name, passed when the class is instanced).

        Results:
        - self.current_strings holds one escaped string per matched element
          (an empty list if nothing matched).
        - self.current_string is only updated if exactly one element
          matched; otherwise it keeps its previous value.
        - self.all_elements is left as a fresh, unconsumed iterator over the
          matched elements.

        An element without text (text is None, e.g. because it starts with a
        child element) contributes an empty string instead of raising.
        """
        # Materialize the matches once instead of re-running the x-path for
        # counting, indexing and iterating.
        matched_elements = list(self.xml_tree.iterfind(self.element_name))
        # escape() calls str.replace() and therefore cannot handle None.
        self.current_strings = [
            escape(element.text or "") for element in matched_elements
        ]
        if len(matched_elements) == 1:
            self.current_string = self.current_strings[0]
        # Callers get a new iterator because the one above is used up.
        self.all_elements = self.xml_tree.iterfind(self.element_name)

    def replace_string(self, replacement_string, element_name):
        """
        Overwrites the old element(s) with the newly manipulated xml string.

        The replacement_string is wrapped into <element_name> tags. Every
        element called element_name is removed from the tree, then the
        wrapped string is parsed and inserted into self.xml_tree at
        index 1.
        """
        opening_tag = "<" + element_name + ">"
        closing_tag = "</" + element_name + ">"
        wrapped_xml = opening_tag + replacement_string + closing_tag
        # The old elements are removed before the new string is parsed.
        outdated_elements = self.xml_tree.xpath("//" + element_name)
        for outdated in outdated_elements:
            outdated.getparent().remove(outdated)
        # NOTE(review): index 1 presumably places the element right after a
        # leading metadata element -- confirm against the document layout.
        self.xml_tree.insert(1, etree.fromstring(wrapped_xml))

    def simple_check_xml(self, xml_string, file_name, save_valid, node=True):
        """
        Checks if a given xml string is well-formed xml and logs the result.

        If node is True, xml_string is a partial string (str) which gets
        wrapped into a <root> element before it is parsed. If node is not
        True, xml_string is checked as a whole document (bytes, or str).

        Well-formed input is only written to disk if save_valid is True
        (logs/well-formed_strings/ or logs/well-formed_files/). Input that is
        not well-formed is always written to logs/not_well-formed_strings/ or
        logs/not_well-formed_files/. The file name is the basename of
        file_name.

        Returns False if a whole document (node is not True) is not
        well-formed. Returns None in every other case.
        """
        if node is True:
            kind = "strings"
            subject = "XML node string"
            # A partial string needs exactly one root element to be parsed.
            xml_string = "<root>" + xml_string + "</root>"
        else:
            kind = "files"
            subject = "XML file"

        # Only the parsing is guarded, so that a problem while writing the
        # log file is never reported as "not well-formed".
        try:
            tree = etree.fromstring(xml_string)
        except (etree.XMLSyntaxError, ValueError) as e:
            folder_path = "logs/not_well-formed_" + kind + "/"
            self._write_check_log(folder_path, file_name, xml_string)
            self.logger.error("%s is not well-formed. XML can be found in %s",
                              subject, folder_path)
            self.logger.error(e)
            # Kept for backward compatibility: only the document check
            # signals failure through its return value.
            return None if node is True else False

        if node is True:
            self.logger.info("The node string is well-formed. Simple markup"
                             " is correct.")
        else:
            self.logger.info("The XML file is well-formed.")
        self.logger.info(tree)
        if save_valid is True:
            folder_path = "logs/well-formed_" + kind + "/"
            self._write_check_log(folder_path, file_name, xml_string)
            self.logger.info("%s can be found in %s",
                             "Node string" if node is True else "File",
                             folder_path)

    def _write_check_log(self, folder_path, file_name, content):
        """
        Writes content (str or bytes) to folder_path using the basename of
        file_name. Missing folders are created. A failing write is logged
        and does not abort the check.
        """
        file_path = os.path.join(folder_path, os.path.basename(file_name))
        try:
            # makedirs also creates the parent "logs" folder if needed.
            os.makedirs(folder_path, exist_ok=True)
            if isinstance(content, bytes):
                # Raw bytes are written unchanged, so even input with a
                # broken encoding can be inspected afterwards.
                with open(file_path, "wb") as binary_file:
                    binary_file.write(content)
            else:
                with open(file_path, "w", encoding="utf-8") as text_file:
                    text_file.write(content)
        except OSError as e:
            self.logger.error("Could not write %s: %s", file_path, e)

    def inject_element(self, current_element, regex, tagname,
                       strip_newlines=False):
        """
        Injects a new xml element into the selected element. The serialized
        current_element is searched with the regular expression regex. The
        first match becomes the text of a new element called tagname, which
        takes the place of the matched partial string. The resulting string
        is parsed again and replaces current_element in its parent.

        If strip_newlines is True, newlines are removed from the matched
        text before it is put into the new element.

        Nothing happens if the regex does not match. The caller's reference
        to current_element is detached from the tree after a replacement.
        """
        element_string = ElementTree.tostring(current_element,
                                              encoding="unicode",
                                              method="xml")
        match = re.search(regex, element_string)
        if match is None:
            return
        match_str = match.group()
        if strip_newlines is True:
            match_str = match_str.replace("\n", "")
        new_element = etree.Element(tagname)
        # NOTE(review): the regex runs on serialized xml, so a match holding
        # an entity like "&amp;" is escaped a second time on serialization.
        new_element.text = match_str
        new_element_str = ElementTree.tostring(new_element,
                                               encoding="unicode",
                                               method="xml")
        # The match offsets refer to the unmodified element_string, so they
        # are used as they are. Shifting them by the number of stripped
        # newlines would cut into the surrounding text.
        element_string = (element_string[:match.start()]
                          + new_element_str
                          + element_string[match.end():])
        replacement_element = etree.fromstring(element_string.encode("utf8"))
        current_element.getparent().replace(current_element,
                                            replacement_element)

    def markup_speech_lines(self, current_element):
        """
        Marks every line of a speech as <p> element with the attribute
        klasse="J" (J is set for every line, even if it should be O).

        Only the first text node of current_element is split into lines.
        The tail of the first <redner> descendant is emptied and one <p> per
        line is appended to current_element.

        In the early protocols (period 1. to 10.) one line is most of the
        time a sentence. In the later periods one line is capped at around
        80 characters.
        """
        text_nodes = current_element.xpath("text()")
        # Without any text node there is nothing to split or append.
        speech_lines = text_nodes[0].splitlines() if text_nodes else []
        # Raises IndexError if the speech has no <redner> element.
        speaker = current_element.xpath(".//redner")[0]
        speaker.tail = ""
        for speech_line in speech_lines:
            paragraph = etree.Element("p", klasse="J")
            paragraph.text = speech_line
            current_element.append(paragraph)

    def get_multiline_entities(self, elements, start_of_str, end_of_str,
                               tagname):
        """
        This function identifies multiline entities (i.e. Kommentare/Comments)
        which are split over multiple elements which have been marked with the
        markup_speech_lines() function.
        Gets the text of those and joins them together into one
        string. The first element will be replaced by a new <p klasse="J">
        element that contains the newly created string surrounded by new xml
        tags with tagname set to input tagname.
        All other elements with the rest of the string will be deleted.
        start_of_str should be a regex that describes the pattern how the start
        of the supposed multiline entity looks like. end_of_str describes the
        pattern how the end of the supposed multiline entity looks like.
        Only the matched part of the first and of the last element is kept;
        elements in between contribute their whole text.
        """
        # Text parts and elements of the entity that is currently collected.
        self.multiline_text = []
        self.multiline_elements = []
        start_found = False
        end_found = False
        for element in elements:
            # State 1: no entity open, look for a start in this element.
            if(start_found is False and end_found is False
               and element.text is not None):
                start_match = re.search(start_of_str, element.text)
                if(start_match is not None):
                    self.multiline_text.append(start_match.group())
                    self.multiline_elements.append(element)
                    start_found = True
                    continue
            # State 2: entity open, collect text until the end is matched.
            elif(start_found is True and end_found is False
                 and element.text is not None):
                end_match = re.search(end_of_str, element.text)
                if(end_match):
                    self.multiline_text.append(end_match.group())
                    self.multiline_elements.append(element)
                    end_found = True
                    continue
                else:
                    self.multiline_text.append(element.text)
                    self.multiline_elements.append(element)
                    continue
            # State 3: entity complete, merge it on the element that follows
            # the end match.
            # NOTE(review): in the code visible here the merge only happens
            # if another element follows the end match, and that following
            # element is not checked for a new start -- confirm intended.
            elif(start_found is True and end_found is True):
                new_element_text = re.sub(r"- ", "", " ".join(self.multiline_text)) # joins the string parts with spaces and removes every "- " (hyphenation)
                part_element = etree.Element("p")
                part_element.set("klasse", "J")
                comment_element = etree.Element(tagname)
                comment_element.text = new_element_text
                part_element.append(comment_element)
                # The first collected element is swapped for the merged one,
                # the remaining collected elements are dropped from the tree.
                self.multiline_elements[0].getparent().replace(self.multiline_elements[0], part_element)
                for element in self.multiline_elements[1:]:
                    element.getparent().remove(element)
                # Reset the state for the next entity.
                start_found = False
                end_found = False
                self.multiline_text = []
                self.multiline_elements = []
                continue