# bundesdata_web_app/app/speeches/utils.py

import re
from lxml import etree


def create_html_speech(speech_content_xml_string):
    """
    Convert the XML content of a speech into styled HTML.

    The XML fragment is wrapped in a ``<div>`` root, parsed, and the
    following elements are rewritten in place:

    * ``<p>`` becomes ``<span class="line">``; its leading text (the text
      before any child element) is collected as raw speech text.
    * ``<kommentar>`` becomes ``<span class="comment">`` and counts as one
      interruption.
    * ``<metadata>`` becomes ``<blockquote class="metadata">`` and gets the
      heading "Metadaten/Kopzeile:" put in front of its text.

    The ``klasse`` attribute is dropped from every rewritten element. All
    other elements are left untouched.

    Args:
        speech_content_xml_string: XML fragment as a string. It is parsed
            after being wrapped in ``<div>...</div>``; errors raised by
            ``etree.fromstring`` propagate to the caller.

    Returns:
        A tuple ``(speech_html, raw_text, interruptions)`` where
        ``speech_html`` is the serialized HTML string, ``raw_text`` is the
        collected ``<p>`` texts joined without a separator, and
        ``interruptions`` is the number of ``<kommentar>`` elements.
    """
    # Source tag -> (HTML tag, CSS class) that replace it.
    restyle = {
        "p": ("span", "line"),
        "kommentar": ("span", "comment"),
        "metadata": ("blockquote", "metadata"),
    }

    root = etree.fromstring("<div>" + speech_content_xml_string + "</div>")
    text_parts = []
    interruptions = 0

    for element in root.iter():
        source_tag = element.tag
        if source_tag not in restyle:
            continue

        if source_tag == "p":
            # Only the leading text of the paragraph is collected; paragraphs
            # without any leading text contribute nothing.
            if element.text is not None:
                text_parts.append(element.text)
        elif source_tag == "kommentar":
            interruptions += 1
        else:
            # The literal backslash-n is a line-break marker that is turned
            # into <br/> after serialization (see below). An empty metadata
            # element has text None, so fall back to an empty string instead
            # of failing on str + None.
            element.text = "Metadaten/Kopzeile:" + "\\n" + (element.text or "")

        html_tag, css_class = restyle[source_tag]
        element.tag = html_tag
        element.attrib["class"] = css_class
        element.attrib.pop("klasse", None)

    speech_html = etree.tostring(root, pretty_print=True, encoding="unicode")

    # Post-processing of the serialized markup. The order of the steps
    # matters: the more specific backslash-n pattern must run before the
    # generic one.
    # NOTE(review): stripping "b'" and unescaping "\'" looks like cleanup for
    # input that was stored as the repr of a bytes object -- confirm against
    # the data. Be aware that the "b'" removal applies to the whole string,
    # so it also hits regular text that happens to contain b followed by an
    # apostrophe.
    speech_html = speech_html.replace("b'", "")
    speech_html = re.sub(r"\\n\s+\'", "<br/>", speech_html)
    speech_html = speech_html.replace("\\n", "<br/>")
    speech_html = speech_html.replace("\\'", "'")

    return (speech_html, "".join(text_parts), interruptions)