226 lines
11 KiB
Python
226 lines
11 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from markup.MetadataMarkup import MetadataMarkup
|
||
|
from lxml import etree
|
||
|
from xml.etree import ElementTree
|
||
|
from xml.sax.saxutils import escape
|
||
|
import logging
|
||
|
import os
|
||
|
import re
|
||
|
|
||
|
|
||
|
class EntityMarkup(MetadataMarkup):
    """Class for getting an XML node in which entities will be marked.

    In practice this class and its methods can be used to get the text of a
    given node and to mark every speaker in this text string.
    Also passes methods and fields to the more specific
    SimpleSpeakersMarkup.
    """

    def __init__(self, file_path, element_name=".//sitzungsverlauf"):
        """
        Stores the path of the XML file and the path expression of the
        element(s) this instance works on.

        file_path: path of the XML file; its base name is kept in
        self.filename.
        element_name: path expression handed to xml_tree.iterfind() in
        get_element_text().
        """
        super().__init__()
        self.file_path = file_path
        self.element_name = element_name
        # Not populated in this class; it has to be set before any method
        # that reads the tree is called.
        self.xml_tree = None
        self.current_string = str()
        self.filename = os.path.basename(file_path)
        self.logger = logging.getLogger(__name__)

    def get_element_text(self):
        """
        Gets the strings of all elements matched by the element path that
        was passed when the class was instantiated.

        The XML-escaped text of every matched element is stored in
        self.current_strings. If exactly one element matched, its escaped
        text is additionally stored in self.current_string. Elements
        without any text contribute an empty string.
        self.all_elements is left as a fresh iterator over the matches.
        """
        matched_elements = list(self.xml_tree.iterfind(self.element_name))
        # element.text is None for empty elements; escape() cannot handle
        # None, so fall back to an empty string.
        self.current_strings = [escape(element.text or "")
                                for element in matched_elements]
        if len(matched_elements) == 1:
            self.current_string = self.current_strings[0]
        # The list above consumed its own iterator, hand out a new one.
        self.all_elements = self.xml_tree.iterfind(self.element_name)

    def replace_string(self, replacement_string, element_name):
        """
        This function takes the newly manipulated xml string and overwrites
        the old string with it.

        replacement_string is wrapped in <element_name> tags and parsed.
        Every element matching //element_name is removed from
        self.xml_tree and the parsed element is inserted at index 1 of
        self.xml_tree.
        Raises the parser error of etree.fromstring() if the wrapped string
        is not well-formed; the tree is left untouched in that case.
        """
        replacement_string = (
            "<" + element_name + ">"
            + replacement_string
            + "</" + element_name + ">"
        )
        # Parse first: a malformed replacement must not leave the tree with
        # the old elements already removed.
        replacement_element = etree.fromstring(replacement_string)
        for element in self.xml_tree.xpath("//%s" % element_name):
            element.getparent().remove(element)
        self.xml_tree.insert(1, replacement_element)

    def _write_xml_log(self, folder_path, file_name, xml_string):
        """
        Writes xml_string into folder_path, using the base name of
        file_name. Missing folders are created. Bytes are decoded as UTF-8
        before they are written.
        """
        # makedirs also creates a missing parent folder ("logs/") and does
        # not fail if the folder already exists.
        os.makedirs(folder_path, exist_ok=True)
        if isinstance(xml_string, bytes):
            xml_string = xml_string.decode("utf-8")
        file_path = os.path.join(folder_path, os.path.basename(file_name))
        with open(file_path, "w", encoding="utf-8") as text_file:
            text_file.write(xml_string)

    def simple_check_xml(self, xml_string, file_name, save_valid, node=True):
        """
        Checks if a given xml element is well-formed xml. If it is checking a
        partial string it adds a root element. If node is False it is checking
        a document as a string.

        xml_string: the node string (node is True) or the whole document
        (otherwise).
        file_name: only its base name is used to name the written log file.
        save_valid: if True, well-formed input is written to
        logs/well-formed_strings/ or logs/well-formed_files/.
        Input that is not well-formed is always written to
        logs/not_well-formed_strings/ or logs/not_well-formed_files/.

        Returns False if the check (or saving a valid result) raised an
        exception. Returns None otherwise.
        """
        is_node = node is True
        try:
            if is_node:
                folder_path = "logs/well-formed_strings/"
                # A partial string may have several top level nodes, so it
                # needs an artificial root to be parseable.
                xml_string = "<root>" + xml_string + "</root>"
                tree = etree.fromstring(xml_string)
                self.logger.info(("The node string is well-formed. Simple markup is"
                                  " correct. Node string can be found in "
                                  + folder_path))
                self.logger.info(tree)
                if save_valid is True:
                    self.logger.info("Node string can be found in" + folder_path)
                    self._write_xml_log(folder_path, file_name, xml_string)
            else:
                folder_path = "logs/well-formed_files/"
                tree = etree.fromstring(xml_string)
                self.logger.info("The XML file is well-formed.")
                self.logger.info(tree)
                if save_valid is True:
                    self.logger.info("File can be found in" + folder_path)
                    self._write_xml_log(folder_path, file_name, xml_string)
        except Exception as e:
            # Deliberately broad: every failure is reported as "not
            # well-formed" and the offending string is saved for inspection.
            if is_node:
                folder_path = "logs/not_well-formed_strings/"
                message = ("XML node string is not well-formed. XML can be"
                           " found in " + folder_path)
            else:
                folder_path = "logs/not_well-formed_files/"
                message = ("XML file is not well-formed. XML can be"
                           " found in " + folder_path)
            self._write_xml_log(folder_path, file_name, xml_string)
            self.logger.error(message)
            self.logger.error(e)
            return False

    def inject_element(self, current_element, regex, tagname,
                       strip_newlines=False):
        """
        Injects a new xml element into the selected element. The new element
        is created by using a regular expression which matches a partial
        string in the serialized current_element. The first match will be
        the text of the new element. The tagname sets the tagname of the
        new element. If strip_newlines is True, newlines are removed from
        the matched text.

        The serialized string with the injected element is parsed again and
        replaces current_element in its parent. Nothing happens if the
        regex does not match.
        """
        element_string = ElementTree.tostring(current_element,
                                              encoding="unicode",
                                              method="xml")
        match = re.search(regex, element_string)
        if match is None:
            return
        match_str = match.group()
        if strip_newlines is True:
            match_str = match_str.replace("\n", "")
        new_element = etree.Element(tagname)
        new_element.text = match_str
        new_element_str = ElementTree.tostring(new_element,
                                               encoding="unicode",
                                               method="xml")
        # The slice positions refer to element_string, which still contains
        # the newlines, so the match positions must be used unshifted.
        element_string = (element_string[:match.start()]
                          + new_element_str
                          + element_string[match.end():])
        replacement_element = etree.fromstring(element_string.encode("utf8"))
        current_element.getparent().replace(current_element,
                                            replacement_element)

    def markup_speech_lines(self, current_element):
        """
        Inserts markup in every speech that marks every line <p> with
        attribute klasse="J". J is set for every line even if it is O. In the
        early protocols (period 1. to 10.) one line is most of the time a
        sentence. In the later periods one line is capped at around 80
        characters.

        Only the first direct text node of current_element is split into
        lines. The tail of the first <redner> descendant is emptied and one
        <p klasse="J"> child is appended per line.
        Raises IndexError if there is text but no <redner> descendant.
        """
        text_nodes = current_element.xpath("text()")
        if len(text_nodes) > 0:
            lines = text_nodes[0].splitlines()
            # NOTE(review): presumably the text sits in the tail of the
            # <redner> element, so it is cleared to avoid duplicating it.
            current_element.xpath(".//redner")[0].tail = ""
            for line in lines:
                part_element = etree.Element("p")
                part_element.set("klasse", "J")
                part_element.text = line
                current_element.append(part_element)

    def _merge_multiline_elements(self, tagname):
        """
        Joins the collected text parts into one string, replaces the first
        collected element with <p klasse="J"><tagname>string</tagname></p>,
        removes all other collected elements and resets the collection.
        """
        # joins the string parts and also removes hyphenation
        new_element_text = re.sub(r"- ", "", " ".join(self.multiline_text))
        part_element = etree.Element("p")
        part_element.set("klasse", "J")
        comment_element = etree.Element(tagname)
        comment_element.text = new_element_text
        part_element.append(comment_element)
        first_element = self.multiline_elements[0]
        first_element.getparent().replace(first_element, part_element)
        for leftover_element in self.multiline_elements[1:]:
            leftover_element.getparent().remove(leftover_element)
        self.multiline_text = []
        self.multiline_elements = []

    def get_multiline_entities(self, elements, start_of_str, end_of_str,
                               tagname):
        """
        This function identifies multiline entities (i.e. Kommentare/Comments)
        which are split over multiple elements which have been marked with
        the markup_speech_lines() function.
        Gets the text of those and joins them together into one
        string. The first element will be replaced by a <p klasse="J">
        element holding the newly created string surrounded by new xml tags
        with tagname set to input tagname.
        All other elements with the rest of the string will be deleted.
        start_of_str should be a regex that describes the pattern how the
        start of the supposed multiline entity looks like. end_of_str
        describes the pattern how the end of the supposed multiline entity
        looks like.

        Only the matched part of the start and of the end element is kept.
        Elements without text are ignored. An entity is merged as soon as
        its end is found, so an entity ending on the last element is merged
        as well and the element following an entity can start a new one.
        A started entity without an end stays collected in
        self.multiline_text and self.multiline_elements.
        """
        self.multiline_text = []
        self.multiline_elements = []
        start_found = False
        for element in elements:
            if element.text is None:
                continue
            if not start_found:
                start_match = re.search(start_of_str, element.text)
                if start_match is not None:
                    self.multiline_text.append(start_match.group())
                    self.multiline_elements.append(element)
                    start_found = True
                continue
            end_match = re.search(end_of_str, element.text)
            if end_match is None:
                # Still inside the entity: keep the whole line.
                self.multiline_text.append(element.text)
                self.multiline_elements.append(element)
                continue
            self.multiline_text.append(end_match.group())
            self.multiline_elements.append(element)
            self._merge_multiline_elements(tagname)
            start_found = False
|