bundesdata_web_app/app/speeches/utils.py

import re
from lxml import etree


def create_html_speech(speech_content_xml_string):
    """
    Converts the XML content of a speech into styled HTML.

    The XML fragment is wrapped in a single <div> root and its elements are
    rewritten in place:

    * <p>         -> <span class="line">; its leading text is collected
    * <kommentar> -> <span class="comment">; counted as an interruption
    * <metadata>  -> <blockquote class="metadata"> with a heading line

    The XML attribute "klasse" is removed from every rewritten element.

    Args:
        speech_content_xml_string: XML fragment (str) holding the speech
            content. It does not need a single root element.

    Returns:
        A tuple (speech_html, raw_text, interruptions):
        speech_html is the serialized HTML string, raw_text is the
        concatenated leading text of all <p> elements (joined without a
        separator) and interruptions is the number of <kommentar> elements.

    Raises:
        lxml.etree.XMLSyntaxError: if the wrapped fragment is not well-formed.
    """
    # A wrapping root is needed because the fragment may contain several
    # top-level elements, which would not parse on their own.
    root = etree.fromstring("<div>" + speech_content_xml_string + "</div>")
    raw_text = []
    interruptions = 0
    for element in root.iter():
        if element.tag == "p":
            # Elements that start with a child (or are empty) have no text.
            if element.text is not None:
                raw_text.append(element.text)
            element.tag = "span"
            element.attrib["class"] = "line"
            element.attrib.pop("klasse", None)
        elif element.tag == "kommentar":
            interruptions += 1
            element.tag = "span"
            element.attrib["class"] = "comment"
            element.attrib.pop("klasse", None)
        elif element.tag == "metadata":
            element.tag = "blockquote"
            element.attrib["class"] = "metadata"
            element.attrib.pop("klasse", None)
            # The literal two-character marker "\n" (backslash + n) is used
            # because markup placed in .text would be escaped on
            # serialization; it is turned into <br/> below. An empty
            # metadata element has text None, so fall back to "".
            element.text = "Metadaten/Kopzeile:" + "\\n" + (element.text or "")
    raw_text = "".join(raw_text)
    # encoding='unicode' makes lxml return a str, so there is no bytes repr
    # prefix (b'...') to strip; stripping "b'" would only delete real text
    # such as "hab'" from the speech.
    speech_html = etree.tostring(root, pretty_print=True, encoding='unicode')
    # Replace the literal "\n" markers (optionally followed by whitespace and
    # a quote) with HTML line breaks, then unescape literal "\'" sequences.
    speech_html = re.sub(r"\\n\s+\'", "<br/>", speech_html)
    speech_html = speech_html.replace("\\n", "<br/>")
    speech_html = speech_html.replace("\\'", "'")
    return speech_html, raw_text, interruptions
Initial commit 2019-02-28 13:09:53 +00:00			`import re`
			`from lxml import etree`


			`def create_html_speech(speech_content_xml_string):`
			`"""`
Added some documentation. 2019-03-01 19:55:41 +00:00			`Converts the XML speech content into styled html. Also counts the words and`
Initial commit 2019-02-28 13:09:53 +00:00			`shows the vocabulary.`
			`"""`
			`speech_html = "<div>" + speech_content_xml_string + "</div>"`
			`speech_html = etree.fromstring(speech_html)`
			`raw_text = []`
			`interruptions = 0`
			`for element in speech_html.iter():`
			`if(element.tag == "p"):`
			`raw_text.append(element.text)`
			`element.tag = "span"`
			`element.attrib["class"]="line"`
			`element.attrib.pop("klasse", None)`
			`elif(element.tag == "kommentar"):`
			`interruptions += 1`
			`element.tag = "span"`
			`element.attrib["class"]="comment"`
			`element.attrib.pop("klasse", None)`
			`elif(element.tag == "metadata"):`
			`element.tag = "blockquote"`
			`element.attrib["class"]="metadata"`
			`element.attrib.pop("klasse", None)`
			`element.text = "Metadaten/Kopzeile:" + "\\n" + element.text`
			`raw_text = [element for element in raw_text if element != None]`
			`raw_text = "".join(raw_text)`
			`speech_html = etree.tostring(speech_html, pretty_print=True, encoding='unicode')`
			`speech_html = re.sub(r"b'", "", speech_html)`
			`speech_html = re.sub(r"\\n\s+\'", "<br/>", speech_html)`
			`speech_html = re.sub(r"\\n", "<br/>", speech_html)`
			`speech_html = re.sub(r"\\'", "'", speech_html)`
			`return(speech_html, raw_text, interruptions)`