import re

from lxml import etree

# Maps each source XML tag to the (HTML tag, CSS class) it is rewritten to.
_TAG_STYLES = {
    "p": ("span", "line"),
    "kommentar": ("span", "comment"),
    "metadata": ("blockquote", "metadata"),
}


def create_html_speech(speech_content_xml_string: str):
    """Convert the XML content of a speech into styled HTML.

    The input is wrapped in a ``<div>`` and parsed. Every ``<p>``,
    ``<kommentar>`` and ``<metadata>`` element is renamed to an HTML tag,
    given a ``class`` attribute, and stripped of its ``klasse`` attribute.
    All other elements are left untouched.

    Args:
        speech_content_xml_string: XML fragment holding the speech content.
            It must be well-formed once wrapped in ``<div>...</div>``.

    Returns:
        A 3-tuple ``(speech_html, raw_text, interruptions)``:

        * ``speech_html`` (str): the serialized, pretty-printed HTML.
        * ``raw_text`` (str): the leading text of every ``<p>`` element,
          concatenated without a separator.
        * ``interruptions`` (int): the number of ``<kommentar>`` elements.

    Raises:
        lxml.etree.XMLSyntaxError: if the wrapped input is not well-formed.
    """
    root = etree.fromstring("<div>" + speech_content_xml_string + "</div>")

    paragraph_texts = []
    interruptions = 0
    for element in root.iter():
        source_tag = element.tag
        style = _TAG_STYLES.get(source_tag)
        if style is None:
            continue

        if source_tag == "p":
            # Only the text before the first child element is collected.
            paragraph_texts.append(element.text)
        elif source_tag == "kommentar":
            interruptions += 1

        element.tag, element.attrib["class"] = style
        element.attrib.pop("klasse", None)

        if source_tag == "metadata":
            # The literal backslash-n is turned into <br/> after
            # serialization. A metadata element without leading text has
            # ``text is None``; treat it as empty rather than raising a
            # TypeError on the concatenation.
            element.text = "Metadaten/Kopzeile:" + "\\n" + (element.text or "")

    raw_text = "".join(text for text in paragraph_texts if text is not None)

    speech_html = etree.tostring(root, pretty_print=True, encoding="unicode")
    # NOTE(review): these substitutions look like cleanup for input that was
    # stored as the str() of a bytes object (b'...' wrapper, literal "\n" and
    # "\'" escapes) - confirm against the callers. Be aware that the first one
    # also removes any genuine "b'" sequence inside the speech text.
    speech_html = re.sub(r"b'", "", speech_html)
    speech_html = re.sub(r"\\n\s+\'", "<br/>", speech_html)
    speech_html = re.sub(r"\\n", "<br/>", speech_html)
    speech_html = re.sub(r"\\'", "'", speech_html)
    return (speech_html, raw_text, interruptions)