bundesdata_markup_nlp_software/bundesdata_markup_nlp/utility/XMLProtocol.py
2019-02-21 19:29:44 +01:00

210 lines
9.3 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import re
import shutil
import subprocess
from os import path
from xml.etree import ElementTree

from lxml import etree

from utility import delete_folder
from utility import update_config
class XMLProtocol(object):
    """Base class for standard operations on/with the XML protocols.

    Has functions for reading, saving and manipulating an XML protocol.
    All other classes inherit from this one.

    The manipulation methods work on ``self.xml_tree`` (set by
    ``read_xml``). Methods taking ``check_child`` additionally use
    ``self.regex_compiled`` (set by ``compile_regex``).
    """

    def __init__(self):
        super().__init__()
        self.logger = logging.getLogger(__name__)

    def read_protcol(self, file_path):
        """Parse the file at file_path as XML and return its root element.

        Also stores the path in ``self.file_path``, its base name in
        ``self.filename`` and the parsed tree in ``self.tree``.
        """
        self.file_path = file_path
        self.filename = os.path.basename(self.file_path)
        # Blank text is dropped so lxml can re-indent the tree on output.
        parser = etree.XMLParser(remove_blank_text=True)
        self.tree = etree.parse(file_path, parser)
        root = self.tree.getroot()
        self.logger.info("File successfully parsed as XML.")
        return root

    def read_xml(self, file_path):
        """Parse the file at file_path as XML into ``self.xml_tree``.

        ``self.xml_tree`` holds the root element, not the element tree.
        """
        # Blank text is dropped so lxml can re-indent the tree on output.
        parser = etree.XMLParser(encoding='utf-8', remove_blank_text=True)
        tree = etree.parse(file_path, parser)
        self.xml_tree = tree.getroot()

    def save_to_file(self, output_path, file_path, subfolder, config_section,
                     config_key, filename_sufix=""):
        """Write ``self.xml_tree`` to a new XML file and record the folder.

        The file is written to ``output_path/subfolder`` (created if it is
        missing). Its name is the base name of file_path; if filename_sufix
        is not empty, the extension of that base name is replaced by
        filename_sufix. Afterwards config_key in config_section of
        "config.ini" is updated with the folder path.
        """
        base_name = os.path.basename(file_path)
        if filename_sufix == "":
            self.filename = base_name
        else:
            # splitext instead of [:-4]: identical for ".xml" but does not
            # mangle names whose extension is not exactly three letters.
            self.filename = os.path.splitext(base_name)[0] + filename_sufix
        save_path = os.path.join(output_path, subfolder)
        # makedirs also creates missing parents and tolerates an existing
        # folder, which a bare os.mkdir does not.
        os.makedirs(save_path, exist_ok=True)
        save_file_path = os.path.join(save_path, self.filename)
        tree = etree.ElementTree(self.xml_tree)
        tree.write(save_file_path,
                   pretty_print=True,
                   xml_declaration=True,
                   encoding="utf8",
                   doctype=("<!DOCTYPE dbtplenarprotokoll SYSTEM "
                            "'dbtplenarprotokoll_minimal.dtd'>"))
        self.logger.info("New XML saved to:%s", save_file_path)
        update_config.update_config("config.ini", config_section, config_key,
                                    save_path)

    def _run_html_beautify(self, file_path, alter_lines, line_width):
        """Run the external html-beautify tool in place on file_path.

        alter_lines is True: wrap lines at line_width and do not preserve
        existing newlines. alter_lines is False: beautify only. Any other
        value: nothing is run.

        Returns True if alter_lines selected one of the two modes, else
        False. A missing or failing tool is logged, not raised.
        """
        if alter_lines is True:
            options = ["-w", str(line_width), "--no-preserve-newlines"]
        elif alter_lines is False:
            options = []
        else:
            return False
        executable = shutil.which("html-beautify")
        if executable is None:
            self.logger.warning(
                "html-beautify not found on PATH; %s was not beautified.",
                file_path)
            return True
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        command = [executable, "-r", "-q"] + options + [file_path]
        try:
            subprocess.run(command, check=False)
        except OSError:
            self.logger.exception("Running html-beautify on %s failed.",
                                  file_path)
        return True

    def beautify_xml_part(self, file_path, xpath, alter_lines=False,
                          line_width=80):
        """Beautify part (one element node) of ``self.xml_tree``.

        The first element found for xpath is copied, written to
        ``tmp/tmp.xml`` next to file_path, beautified there with
        html-beautify and read back into ``self.beautified_part``. The
        "tmp_path" key in section "File paths" of "config.ini" is updated
        and the tmp folder is deleted again.

        Raises ValueError if xpath matches no element.
        """
        tmp_path = os.path.join(os.path.dirname(file_path), "tmp")
        tree = etree.ElementTree(self.xml_tree)
        part = tree.find(xpath)
        if part is None:
            raise ValueError(
                "No element found for xpath: {}".format(xpath))
        # Serialise and re-parse to get a detached copy. lxml's own tostring
        # handles comments/PIs; with_tail=False keeps text following the
        # element out of the copy, which fromstring would reject.
        serialized = etree.tostring(part, with_tail=False)
        self.beautified_part = etree.ElementTree(etree.fromstring(serialized))
        os.makedirs(tmp_path, exist_ok=True)
        try:
            tmp_file_path = os.path.join(tmp_path, "tmp.xml")
            self.beautified_part.write(tmp_file_path,
                                       pretty_print=True,
                                       xml_declaration=True,
                                       encoding="utf8")
            if self._run_html_beautify(tmp_file_path, alter_lines,
                                       line_width):
                self.beautified_part = etree.parse(tmp_file_path).getroot()
            update_config.update_config("config.ini", "File paths",
                                        "tmp_path", tmp_path)
        finally:
            # Remove the tmp folder even if writing or parsing failed.
            delete_folder.delete_folder(tmp_path)

    def beautify_xml(self, file_path, alter_lines=False, line_width=80):
        """Beautify the file at file_path in place with html-beautify.

        With alter_lines=True lines are additionally wrapped at line_width
        and existing newlines are not preserved.
        """
        self._run_html_beautify(file_path, alter_lines, line_width)

    def _first_child_matches(self, element):
        """Return True if the first child's text matches the compiled regex.

        Elements without children, or whose first child has no text, do
        not match.
        """
        if len(element) == 0:
            return False
        text = element[0].text
        if text is None:
            return False
        return self.regex_compiled.search(text) is not None

    def expand_element(self, element_to_expand, expand_attr_key,
                       expand_attr_value, check_child=True):
        """Add an attribute to the elements found for element_to_expand.

        element_to_expand is a path expression passed to ``findall`` on
        ``self.xml_tree``. Every selected element gets the attribute
        expand_attr_key set to expand_attr_value.

        Per default (check_child=True) an element is only expanded if the
        text of its first child matches the regex set with
        ``compile_regex``. Set check_child to False to avoid this and just
        expand every found element.
        """
        for element in self.xml_tree.findall(element_to_expand):
            # Only the literal True enables the check (historic behaviour).
            if check_child is not True or self._first_child_matches(element):
                element.set(expand_attr_key, expand_attr_value)

    def replace_tag_name(self, element_to_replace, tag_name, check_child=True):
        """Rename the elements found for element_to_replace to tag_name.

        element_to_replace is a path expression passed to ``findall`` on
        ``self.xml_tree``. With check_child=True only elements whose first
        child's text matches the compiled regex are renamed.
        """
        for element in self.xml_tree.findall(element_to_replace):
            # Only the literal True enables the check (historic behaviour).
            if check_child is not True or self._first_child_matches(element):
                element.tag = tag_name

    def replace_tag_attr(self, element_to_replace, tag_name, attr_key,
                         attr_value, check_child=True):
        """Rename the found elements to tag_name and add an attribute.

        element_to_replace is a path expression passed to ``findall`` on
        ``self.xml_tree``. Each selected element is renamed to tag_name and
        gets attr_key set to attr_value. With check_child=True only
        elements whose first child's text matches the compiled regex are
        changed, like in ``expand_element``.
        """
        for element in self.xml_tree.findall(element_to_replace):
            # Only the literal True enables the check (historic behaviour).
            if check_child is not True or self._first_child_matches(element):
                element.tag = tag_name
                element.set(attr_key, attr_value)

    def replace_elements(self, elements_to_replace, replacment_elements,
                         keep_parent_text=False):
        """Replace the elements found for elements_to_replace.

        The found elements are replaced pairwise, in order, by
        replacment_elements. If the two counts differ, nothing is replaced
        and a warning is logged.

        Unless keep_parent_text is False, the text nodes of the parents
        (collected before replacing) are assigned, in order, as tails of
        the elements found for elements_to_replace after the replacement.
        """
        elements = self.xml_tree.findall(elements_to_replace)
        # Must be collected before replacing anything.
        parents_text_xpath = elements_to_replace + "/parent::node()/text()"
        elements_text = self.xml_tree.xpath(parents_text_xpath)
        if len(elements) != len(replacment_elements):
            self.logger.warning(
                "Elements missmatch. There are %s that should be repalced."
                " There are %s present. No elements have been replaced.",
                len(elements), len(replacment_elements))
            return
        for element, replacement_element in zip(elements,
                                                replacment_elements):
            element.getparent().replace(element, replacement_element)
        # Only the literal False skips restoring the text (historic
        # behaviour).
        if keep_parent_text is not False:
            elements = self.xml_tree.findall(elements_to_replace)
            for element, text in zip(elements, elements_text):
                element.tail = text

    def compile_regex(self, regex):
        """Compile the regex string for better performance and readability.

        Stores the string in ``self.regex_string`` and the pattern,
        compiled with re.MULTILINE, in ``self.regex_compiled``.
        """
        self.regex_string = regex
        self.regex_compiled = re.compile(self.regex_string, re.MULTILINE)

    def clean_text(self, regex, xpath, replacement_string=""):
        """Replace regex matches in the text of the elements found by xpath.

        Matches are replaced with nothing by default or with
        replacement_string, which may reference match groups. Elements
        without text are left untouched.
        """
        pattern = re.compile(regex)
        for element in self.xml_tree.xpath(xpath):
            if element.text is None:
                continue
            element.text = pattern.sub(replacement_string, element.text)