2019-02-28 14:09:53 +01:00
|
|
|
import re
|
|
|
|
from lxml import etree
|
|
|
|
|
|
|
|
|
|
|
|
def create_html_speech(speech_content_xml_string):
    """
    Convert the XML content of a speech into styled HTML.

    The XML fragment is wrapped in a ``<div>`` root element, parsed with
    lxml and rewritten in place:

    * ``<p>``         -> ``<span class="line">``
    * ``<kommentar>`` -> ``<span class="comment">``
    * ``<metadata>``  -> ``<blockquote class="metadata">`` with a heading
      line prepended to its text

    Any ``klasse`` attribute on these elements is dropped.

    Parameters:
        speech_content_xml_string: XML fragment (string) holding the speech
            content. It must be well-formed once wrapped in ``<div>``;
            parse errors raised by ``etree.fromstring`` propagate.

    Returns:
        A tuple ``(speech_html, raw_text, interruptions)`` where
        ``speech_html`` is the serialized HTML string, ``raw_text`` is the
        concatenated leading text of all ``<p>`` elements and
        ``interruptions`` is the number of ``<kommentar>`` elements.
    """
    root = etree.fromstring("<div>" + speech_content_xml_string + "</div>")
    raw_text = []
    interruptions = 0

    for element in root.iter():
        if element.tag == "p":
            # Only the text directly inside <p> is collected; empty
            # paragraphs have text None and contribute nothing.
            if element.text is not None:
                raw_text.append(element.text)
            element.tag = "span"
            css_class = "line"
        elif element.tag == "kommentar":
            interruptions += 1
            element.tag = "span"
            css_class = "comment"
        elif element.tag == "metadata":
            element.tag = "blockquote"
            css_class = "metadata"
            # The literal backslash-n marker is turned into <br/> after
            # serialization (see below). An empty <metadata/> has text None,
            # which must not be concatenated directly.
            element.text = "Metadaten/Kopzeile:" + "\\n" + (element.text or "")
        else:
            continue
        element.attrib["class"] = css_class
        element.attrib.pop("klasse", None)

    raw_text = "".join(raw_text)

    # encoding='unicode' makes lxml return a str, so there is no b'...'
    # bytes-repr prefix to strip; removing every "b'" would only delete
    # legitimate text (e.g. "gib's").
    speech_html = etree.tostring(root, pretty_print=True, encoding="unicode")
    # Replace literal backslash-n markers with HTML line breaks and unescape
    # backslash-escaped apostrophes.
    speech_html = re.sub(r"\\n\s+\'", "<br/>", speech_html)
    speech_html = re.sub(r"\\n", "<br/>", speech_html)
    speech_html = re.sub(r"\\'", "'", speech_html)
    return (speech_html, raw_text, interruptions)
|