bundesdata_web_app/app/speeches/utils.py

38 lines
1.5 KiB
Python
Raw Normal View History

2019-02-28 13:09:53 +00:00
import re
from lxml import etree
def create_html_speech(speech_content_xml_string):
    """
    Convert the XML content of a speech into styled HTML.

    The XML fragment is wrapped in a ``<div>`` root, parsed, and the
    following elements are rewritten in place:

    * ``<p>`` becomes ``<span class="line">``; its leading text (the text
      before its first child element) is collected as raw text.
    * ``<kommentar>`` becomes ``<span class="comment">`` and is counted as
      one interruption.
    * ``<metadata>`` becomes ``<blockquote class="metadata">`` and gets a
      heading prefixed to its text.

    A ``klasse`` attribute on any of these elements is dropped.

    Args:
        speech_content_xml_string: XML fragment (as ``str``) of the speech.

    Returns:
        A tuple ``(speech_html, raw_text, interruptions)``: the serialized
        HTML string, the concatenated leading text of all ``<p>`` elements,
        and the number of ``<kommentar>`` elements.

    Raises:
        lxml.etree.XMLSyntaxError: If the wrapped fragment is not
            well-formed XML.
    """
    # Source tag -> (replacement HTML tag, CSS class).
    html_mapping = {
        "p": ("span", "line"),
        "kommentar": ("span", "comment"),
        "metadata": ("blockquote", "metadata"),
    }

    root = etree.fromstring("<div>" + speech_content_xml_string + "</div>")
    text_parts = []
    interruptions = 0

    for element in root.iter():
        source_tag = element.tag
        if source_tag not in html_mapping:
            continue

        if source_tag == "p":
            # ``text`` is None for a paragraph without leading text; skip
            # those so the final join only sees strings.
            if element.text is not None:
                text_parts.append(element.text)
        elif source_tag == "kommentar":
            interruptions += 1

        html_tag, css_class = html_mapping[source_tag]
        element.tag = html_tag
        element.attrib["class"] = css_class
        element.attrib.pop("klasse", None)

        if source_tag == "metadata":
            # "\\n" is a literal backslash + "n" (not a newline); it is turned
            # into "<br/>" by the substitutions below. ``element.text`` is
            # None for an empty <metadata/>, so fall back to "" to avoid a
            # TypeError on concatenation.
            # NOTE(review): "Kopzeile" is probably a typo for "Kopfzeile";
            # left unchanged because it is user-visible output.
            element.text = "Metadaten/Kopzeile:" + "\\n" + (element.text or "")

    raw_text = "".join(text_parts)
    speech_html = etree.tostring(root, pretty_print=True, encoding='unicode')

    # Order matters: the more specific "\n + whitespace + quote" pattern has
    # to run before the generic "\n" one.
    # NOTE(review): these patterns presumably strip residue of a stringified
    # bytes literal (b'...' with escaped newlines/quotes) in the input --
    # verify against the stored data before changing them.
    speech_html = re.sub(r"b'", "", speech_html)
    speech_html = re.sub(r"\\n\s+\'", "<br/>", speech_html)
    speech_html = re.sub(r"\\n", "<br/>", speech_html)
    speech_html = re.sub(r"\\'", "'", speech_html)
    return (speech_html, raw_text, interruptions)