Initial commit
This commit is contained in:
225
bundesdata_markup_nlp/markup/EntityMarkup.py
Executable file
225
bundesdata_markup_nlp/markup/EntityMarkup.py
Executable file
@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from markup.MetadataMarkup import MetadataMarkup
|
||||
from lxml import etree
|
||||
from xml.etree import ElementTree
|
||||
from xml.sax.saxutils import escape
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
class EntityMarkup(MetadataMarkup):
    """Base class for getting an XML node in which entities will be marked.

    In practice this class and its methods are used to get the text of a
    given node (default: ``.//sitzungsverlauf``) so that every speaker in
    this text string can be marked.  Also passes methods and fields on to
    the more specific SimpleSpeakersMarkup.
    """

    def __init__(self, file_path, element_name=".//sitzungsverlauf"):
        """
        :param file_path: path of the XML protocol to operate on
        :param element_name: XPath of the element(s) whose text will be
            searched for entities
        """
        super().__init__()
        self.file_path = file_path
        self.element_name = element_name
        self.xml_tree = None  # set later by the inherited XML helpers
        self.current_string = str()
        self.filename = os.path.basename(file_path)
        self.logger = logging.getLogger(__name__)

    def get_element_text(self):
        """
        Gets the strings of all elements matched by the element XPath
        passed at instantiation time.

        Fills ``self.current_strings`` with the XML-escaped text of every
        match; when there is exactly one match its text is additionally
        kept in ``self.current_string``.  ``self.all_elements`` is left as
        a fresh (un-consumed) iterator over the matched elements.
        """
        # Materialize once instead of re-running iterfind() per branch.
        elements = list(self.xml_tree.iterfind(self.element_name))
        self.current_strings = []
        if len(elements) == 1:
            self.current_string = escape(elements[0].text)
            self.current_strings.append(self.current_string)
        elif len(elements) > 1:
            self.current_strings = [escape(element.text)
                                    for element in elements]
        # Re-create the iterator so callers can still consume it.
        self.all_elements = self.xml_tree.iterfind(self.element_name)

    def replace_string(self, replacement_string, element_name):
        """
        Overwrites the old element text with the newly manipulated XML
        string.

        The string is wrapped in <element_name> tags, every existing
        element of that name is removed from the tree and the re-parsed
        replacement is inserted at child position 1.
        """
        replacement_string = "<{0}>{1}</{0}>".format(element_name,
                                                     replacement_string)
        for element in self.xml_tree.xpath("//%s" % element_name):
            element.getparent().remove(element)
        replacement_element = etree.fromstring(replacement_string)
        self.xml_tree.insert(1, replacement_element)

    def simple_check_xml(self, xml_string, file_name, save_valid, node=True):
        """
        Checks if a given XML string is well-formed.

        If *node* is True a partial string is checked and a temporary
        <root> element is added around it; otherwise the string is
        treated as a whole document.  Malformed input is always dumped to
        a ``logs/not_well-formed_*`` folder for inspection; well-formed
        input is dumped to ``logs/well-formed_*`` only when *save_valid*
        is True.

        :returns: True when the XML is well-formed, False otherwise.
            (The original returned None on success but False on failure.)
        """

        def dump(folder_path, string):
            # Write the checked string next to the logs for inspection.
            # makedirs also creates the "logs/" parent; plain mkdir
            # failed when it was missing.
            os.makedirs(folder_path, exist_ok=True)
            file_path = os.path.join(folder_path,
                                     os.path.basename(file_name))
            # Whole documents may arrive as bytes; the original called
            # .decode() unconditionally and crashed on str input.
            if isinstance(string, bytes):
                string = string.decode("utf-8")
            with open(file_path, "w") as text_file:
                text_file.write(string)

        if node is True:
            valid_folder = "logs/well-formed_strings/"
            invalid_folder = "logs/not_well-formed_strings/"
            xml_string = "<root>" + xml_string + "</root>"
            kind = "node string"
        else:
            valid_folder = "logs/well-formed_files/"
            invalid_folder = "logs/not_well-formed_files/"
            kind = "file"
        try:
            tree = etree.fromstring(xml_string)
        except Exception as e:
            dump(invalid_folder, xml_string)
            self.logger.error("XML " + kind + " is not well-formed. XML can"
                              " be found in " + invalid_folder)
            self.logger.error(e)
            return False
        if node is True:
            self.logger.info("The node string is well-formed. Simple markup"
                             " is correct.")
        else:
            self.logger.info("The XML file is well-formed.")
        self.logger.info(tree)
        if save_valid is True:
            self.logger.info("Valid XML can be found in " + valid_folder)
            dump(valid_folder, xml_string)
        return True

    def inject_element(self, current_element, regex, tagname,
                       strip_newlines=False):
        """
        Injects a new XML element into the selected element's text.

        The new element is created from a regular expression which
        matches a partial string in the serialized *current_element*;
        the match becomes the new element's text and *tagname* its tag.
        With *strip_newlines* all newlines are removed from the match
        before insertion.
        """
        element_string = ElementTree.tostring(current_element,
                                              encoding="unicode",
                                              method="xml")
        match = re.search(regex, element_string)
        if match:
            if strip_newlines is True:
                newline_count = match.group().count("\n")
                match_str = match.group().replace("\n", "")
            else:
                newline_count = 0
                match_str = match.group()
            # NOTE(review): shifting both indices left by the number of
            # stripped newlines reproduces the original behavior — confirm
            # this offset is actually intended.
            index_start = match.start() - newline_count
            index_end = match.end() - newline_count
            new_element = etree.Element(tagname)
            new_element.text = match_str
            new_element_str = ElementTree.tostring(new_element,
                                                   encoding="unicode",
                                                   method="xml")
            element_string = (element_string[:index_start]
                              + new_element_str
                              + element_string[index_end:])
            replacement_element = etree.fromstring(
                element_string.encode("utf8"))
            current_element.getparent().replace(current_element,
                                                replacement_element)

    def markup_speech_lines(self, current_element):
        """
        Inserts markup in every speech that marks every line as a <p>
        element with attribute klasse="J".

        "J" is set for every line even if it is "O".  In the early
        protocols (period 1 to 10) one line is most of the time a
        sentence; in the later periods one line is capped at around 80
        characters.
        """
        lines = current_element.xpath("text()")
        if len(lines) > 0:
            lines = lines[0].splitlines()
            current_element.xpath(".//redner")[0].tail = ""
            for line in lines:
                part_element = etree.Element("p")
                part_element.set("klasse", "J")
                part_element.text = line
                current_element.append(part_element)

    def get_multiline_entities(self, elements, start_of_str, end_of_str,
                               tagname):
        """
        Identifies multiline entities (i.e. Kommentare/comments) split
        over multiple elements previously created by
        markup_speech_lines().

        *start_of_str* and *end_of_str* are regexes describing how the
        start and end of the supposed multiline entity look.  The texts
        of all elements between (and including) a start and end match are
        joined into one string; the first element is replaced by a new
        <p klasse="J"> containing a <tagname> element with that string,
        and all other collected elements are deleted.

        Bug fix: the original only flushed a finished entity when yet
        another element followed the end match (the last entity of a
        speech was lost) and never re-examined the element right after a
        flush.  The entity is now flushed immediately when its end is
        found.
        """
        self.multiline_text = []
        self.multiline_elements = []
        start_found = False
        for element in elements:
            # Elements without text are skipped (as in the original).
            if element.text is None:
                continue
            if start_found is False:
                start_match = re.search(start_of_str, element.text)
                if start_match is not None:
                    self.multiline_text.append(start_match.group())
                    self.multiline_elements.append(element)
                    start_found = True
            else:
                end_match = re.search(end_of_str, element.text)
                if end_match:
                    self.multiline_text.append(end_match.group())
                    self.multiline_elements.append(element)
                    self._flush_multiline_entity(tagname)
                    start_found = False
                else:
                    self.multiline_text.append(element.text)
                    self.multiline_elements.append(element)

    def _flush_multiline_entity(self, tagname):
        """Replaces the collected elements with one joined entity element."""
        # Joins the string parts and also removes hyphenation ("- ").
        new_element_text = re.sub(r"- ", "", " ".join(self.multiline_text))
        part_element = etree.Element("p")
        part_element.set("klasse", "J")
        entity_element = etree.Element(tagname)
        entity_element.text = new_element_text
        part_element.append(entity_element)
        first_element = self.multiline_elements[0]
        first_element.getparent().replace(first_element, part_element)
        for element in self.multiline_elements[1:]:
            element.getparent().remove(element)
        self.multiline_text = []
        self.multiline_elements = []
|
22
bundesdata_markup_nlp/markup/MdBData.py
Executable file
22
bundesdata_markup_nlp/markup/MdBData.py
Executable file
@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from utility.XMLProtocol import XMLProtocol
|
||||
import logging
|
||||
|
||||
|
||||
class MdBData(XMLProtocol):
    """Class to handle operations on the Stammdatenbank (MdB master data)."""

    def __init__(self):
        # Bug fix: the original called super(XMLProtocol, self).__init__(),
        # which resolves the MRO *past* XMLProtocol and therefore skips
        # XMLProtocol's own initializer entirely.
        super().__init__()
        self.logger = logging.getLogger(__name__)

    def get_set(self, element_path, element_tree):
        """
        Creates a set of the text values of all elements matched by
        *element_path* in *element_tree*.

        :returns: set of element text strings
        """
        return {element.text for element in
                element_tree.iterfind(element_path) if element is not None}
|
267
bundesdata_markup_nlp/markup/MetadataMarkup.py
Executable file
267
bundesdata_markup_nlp/markup/MetadataMarkup.py
Executable file
@ -0,0 +1,267 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from utility.XMLProtocol import XMLProtocol
|
||||
from utility import update_config
|
||||
from lxml import etree
|
||||
from datetime import datetime
|
||||
from babel.dates import format_date
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import configparser
|
||||
|
||||
|
||||
class MetadataMarkup(XMLProtocol):
    """
    This class is for opening one XML protocol, extracting the included
    metadata and creating a new valid metadata head.
    """

    def __init__(self):
        super().__init__()
        # Fields below are filled by extract_metadata() unless noted.
        self.plenarprotokoll_string = str()
        self.wahlperiode = int()
        self.sitzungsnr = int()
        self.herausgeber = "Deutscher Bundestag"  # same in every protocol
        # Same in every protocol; typo "Steongrafischer" fixed.
        self.berichtart = "Stenografischer Bericht"
        self.sitzungstitel_string = ". Sitzung"  # same in every protocol
        self.ort = "Berlin"  # same in every protocol
        self.datum_ger_non_iso = str()
        self.datum_iso = str()  # ISO date built from datum_ger_non_iso
        self.datum_string = str()  # full german date built from datum_iso
        # Extracted from a split. Will not work all the time, but will
        # not break the XML.
        self.attachment = str()
        self.logger = logging.getLogger(__name__)

    def extract_metadata(self, etree_element_object):
        """
        Extracts metadata from the given XML tags and writes them into
        the instance variables.
        """
        root = etree_element_object
        metadata_list = [element.text for element in root.iter()
                         if element.tag != "TEXT"]
        metadata_list = metadata_list[1:]  # drop the root element's text
        self.wahlperiode = metadata_list[0]
        self.plenarprotokoll_string = metadata_list[1].lower().title()
        self.sitzungsnr = metadata_list[2].split("/")[1]
        self.datum_ger_non_iso = metadata_list[3]
        self.logger.info("Metadata successfully extracted.")
        self.logger.info("Wahlperiode is: " + self.wahlperiode)
        self.logger.info("Plenarprotokoll is: " + self.plenarprotokoll_string)
        self.logger.info("Sitzungsnummer is: " + self.sitzungsnr)
        self.logger.info("German non ISO date is: " + self.datum_ger_non_iso)

    def built_iso_date(self, ger_date):
        """
        Gets the german date (DD.MM.YYYY) and converts it to an ISO
        standard date.
        """
        self.datum_iso = datetime.strptime(ger_date, "%d.%m.%Y").date()
        self.logger.info("ISO date created: " + str(self.datum_iso))

    def built_date_string(self, iso_date):
        """
        Gets the ISO date and creates from it a german full string date
        (e.g. "Montag, den 1. Januar 2018").
        """
        date_string = format_date(iso_date, format="full", locale="de_DE")
        date_string = re.sub(r",", ", den", date_string)
        self.datum_string = date_string
        self.logger.info("Date string created: " + self.datum_string)

    def delete_old_metadata(self, etree_element_object):
        """
        Deletes old metadata tags and text, renames the root tag and
        keeps the protocol body in ``self.full_content``.
        """
        for element in etree_element_object.iter():
            if element.tag != "TEXT" and element.tag != "DOKUMENT":
                element.getparent().remove(element)
            elif element.tag == "DOKUMENT":
                element.tag = "dbtplenarprotokoll"
            elif element.tag == "TEXT":
                self.full_content = element.text
                element.getparent().remove(element)
        self.logger.info("Old metadata deleted.")

    def insert_new_metadata(self, etree_element_object):
        """
        Inserts the extracted metadata and split content into newly
        created, valid XML tags according to the official schema.
        """
        vorspann_element = etree.Element("vorspann")
        xml_string = """
        <kopfdaten>
        <plenarprotokoll-nummer>{} <wahlperiode>{}</wahlperiode>/<sitzungsnr>{}</sitzungsnr>
        (neu)</plenarprotokoll-nummer>
        <herausgeber>{}</herausgeber>
        <berichtart>{}</berichtart>
        <sitzungstitel><sitzungsnr>{}</sitzungsnr>. Sitzung</sitzungstitel>
        <veranstaltungsdaten><ort>{}</ort>, <datum date="{}">{}</datum></veranstaltungsdaten>
        </kopfdaten>"""\
            .format(self.plenarprotokoll_string, self.wahlperiode,
                    self.sitzungsnr, self.herausgeber, self.berichtart,
                    self.sitzungsnr, self.ort, self.datum_ger_non_iso,
                    self.datum_string)
        etree_from_str = etree.fromstring(xml_string)
        etree_element_object.insert(0, vorspann_element)
        vorspann_element.append(etree_from_str)
        toc_element = etree.Element("inhaltsverzeichnis")
        toc_element.text = self.toc
        vorspann_element.append(toc_element)
        content_element = etree.Element("sitzungsverlauf")
        content_element.text = self.president + self.content
        etree_element_object.insert(2, content_element)
        anlagen_element = etree.Element("anlagen")
        anlagen_element.text = self.attachment
        etree_element_object.insert(3, anlagen_element)
        rednerliste_element = etree.Element("rednerliste",
                                            sitzungsdatum=self.datum_ger_non_iso)
        etree_element_object.insert(4, rednerliste_element)
        self.xml_tree = etree_element_object
        self.logger.info("New metadata XML-head inserted." + xml_string)

    def split_content(self, etree_element_object):
        """
        Splits the full content into: table of contents, speeches and in
        some cases attachments.

        NOTE(review): the start regex must contain a capturing group —
        otherwise re.split() returns only two parts and tmp_list[2]
        raises IndexError.  Confirm against config.ini.
        """
        config = configparser.ConfigParser()
        config.read("config.ini")

        session_start_split = config["Regular expressions splits"]["session_start_president_split"]
        regex_start = re.compile(session_start_split)
        tmp_list = regex_start.split(self.full_content, maxsplit=1)
        self.toc = tmp_list[0]
        self.president = tmp_list[1]
        self.content = tmp_list[2]

        attachment_split = config["Regular expressions splits"]["attachment_split"]
        regex_att = re.compile(attachment_split)
        tmp_list = regex_att.split(self.content)
        tmp_list = [element for element in tmp_list if element is not None]
        if tmp_list[-1] == "":
            # If the split does not match anything the last item is an
            # empty string.
            self.content = "".join(tmp_list[0:-1])
            self.attachment = "Keine Anlage extrahiert."
            self.logger.warning("There is no attachment.")
        else:
            self.content = "".join(tmp_list[0:-1])
            self.attachment = tmp_list[-1]
            self.logger.info("Attachment found.")
        self.logger.info("Content split at: " + str(regex_start))
        self.logger.info("Content split at: " + str(regex_att))

    def get_session_times(self):
        """
        Looks into the entire protocol content to extract the session
        start time (first configured regex) and the last closing time
        (second configured regex).  If only one of both or none are
        found, the missing time is set to "xx:xx".
        """
        config = configparser.ConfigParser()
        config.read("config.ini")
        regex_conf_values = config.items("Regular expressions time extraction")
        regex_conf_values = [regex[1] for regex in regex_conf_values]
        tmp_list = []
        start_time_found = True
        end_time_found = True

        # Bug fix: the original duplicated the match-selection logic for
        # both regexes and, for any third configured regex, reused the
        # stale `match` from the previous iteration.  Each regex is now
        # evaluated independently.
        for identifier, regex_string in enumerate(regex_conf_values, start=1):
            regex = re.compile(regex_string)
            matches = list(regex.finditer(self.full_content))
            # NOTE(review): for the start time this also picks the LAST
            # match, although the original comment claimed the first one.
            # Behavior kept as-is — confirm which match is intended.
            match = matches[-1] if matches else None
            if match is None:
                if identifier == 1:
                    self.logger.warning("No start time found for "
                                        + str(regex))
                    start_time_found = False
                else:
                    self.logger.warning("No end time found for "
                                        + str(regex))
                    end_time_found = False
                continue
            session_time = [group for group in match.groups()
                            if group is not None]
            # Adds a leading 0 if a matched digit group has length 1.
            session_time = ["0" + group if len(group) == 1 else group
                            for group in session_time]
            if len(session_time) == 2:
                tmp_list.append(":".join(session_time))
            elif len(session_time) == 1:
                tmp_list.append(session_time[0] + ":00")

        if len(tmp_list) == 2:
            self.session_start_time = tmp_list[0]
            self.session_end_time = tmp_list[1]
            self.logger.info("Start time found: " + self.session_start_time)
            self.logger.info("End time found: " + self.session_end_time)
            self.logger.info("Successfully matched start and end times.")
        elif(len(tmp_list) == 1 and start_time_found is True
                and end_time_found is False):
            self.session_start_time = tmp_list[0]
            self.session_end_time = "xx:xx"
            self.logger.warning("Only start time found: "
                                + self.session_start_time)
            self.logger.warning("End time set to: "
                                + self.session_end_time)
        elif(len(tmp_list) == 1 and start_time_found is False
                and end_time_found is True):
            self.session_end_time = tmp_list[0]
            self.session_start_time = "xx:xx"
            self.logger.warning("Only end time found: "
                                + self.session_end_time)
            self.logger.warning("Start time set to: "
                                + self.session_start_time)

    def write_to_attr(self, element, attr_key, attr_value):
        """
        Writes two strings as an attribute key/value pair to every
        element matched by *element*; falls back to the tree root when
        nothing matches.
        """
        elements = self.xml_tree.findall(element)
        if elements == []:
            elements.append(self.tree.getroot())
        for element in elements:
            element.set(attr_key, attr_value)
        self.logger.info("Wrote attribute "
                         + attr_key
                         + "="
                         + "\""
                         + attr_value
                         + "\"")

    def save_to_file(self, output_path, file_path, subfolder, config_section,
                     config_key):
        """
        Writes the new markup to a new XML file under
        ``output_path/subfolder`` and updates the config file with the
        new path.
        """
        self.filename = os.path.basename(file_path)
        save_path = os.path.join(output_path, subfolder)
        # makedirs also creates missing parents; plain mkdir failed then.
        os.makedirs(save_path, exist_ok=True)
        tree = etree.ElementTree(self.xml_tree)
        save_file_path = os.path.join(save_path, self.filename)
        tree.write(save_file_path,
                   pretty_print=True,
                   xml_declaration=True,
                   encoding="utf8",
                   doctype="<!DOCTYPE dbtplenarprotokoll SYSTEM 'dbtplenarprotokoll_minimal.dtd'>")
        self.logger.info("New XML saved to: " + save_file_path)
        update_config.update_config("config.ini", config_section, config_key,
                                    save_path)
|
161
bundesdata_markup_nlp/markup/SpeakerMarkup.py
Executable file
161
bundesdata_markup_nlp/markup/SpeakerMarkup.py
Executable file
@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
from markup.EntityMarkup import EntityMarkup
|
||||
import re
|
||||
import logging
|
||||
|
||||
|
||||
class SpeakerMarkup(EntityMarkup):
    """
    Class for specific markup of different speakers identified by
    different regular expressions included in the config file.
    """

    def __init__(self, string, regex):
        # NOTE(review): super(SpeakerMarkup).__init__() does NOT call the
        # parent initializer (it initializes an unbound super object).
        # Deliberately left unchanged: EntityMarkup.__init__ requires a
        # file_path this constructor does not receive, so a real
        # super().__init__() call would break instantiation.
        super(SpeakerMarkup).__init__()
        self.string_to_search = string
        self.regex_string = regex
        self.logger = logging.getLogger(__name__)

    def identify_speaker(self):
        """
        Collects all matches of the compiled speaker regex in the search
        string into ``self.matches`` and counts them.

        Requires ``self.regex_compiled`` to be set beforehand — TODO
        confirm which caller compiles it.
        """
        self.matches = list(re.finditer(self.regex_compiled,
                                        self.string_to_search))
        self.matches_count = len(self.matches)

    def markup_speaker(self, case="middle"):
        """
        This is where the first simple markup happens.  It uses the
        matches and replaces them with simple markup for further
        processing.  The 'first' markup uses re.sub; the 'middle' and
        'last' ones work on string basis.
        """

        def markup_logging():
            """Helper for log output.  Bug fix: the singular branch
            (exactly one match) was unreachable because it was tested
            after the more general `> 0` branch."""
            if self.matches_count == 0:
                self.logger.warning("0 matches for given expression:"
                                    + self.regex_string)
            elif self.matches_count == 1:
                self.logger.info(str(self.matches_count)
                                 + " match for given expression:"
                                 + self.regex_string)
            else:
                self.logger.info(str(self.matches_count)
                                 + " matches for given expression:"
                                 + self.regex_string)

        if case == "first":
            # Uses re.sub because it is only for one match.
            start_tags = "<rede><redner>"
            end_tags = "</redner>"
            # Count set to 1 because only the first match gets marked.
            self.matches_count = 1
            markup_logging()
            first_match = self.matches[0]
            start_xml = start_tags + first_match.group() + end_tags
            # Guard against runaway matches: skip if over 10 tokens long.
            if len(first_match.group().split()) <= 10:
                self.string_to_search = self.regex_compiled.sub(
                    start_xml, self.string_to_search, count=1)
            self.markuped_string = self.string_to_search

        elif case == "middle":
            # Does not use re.sub because it is faster to work on the
            # string directly.  Also avoids looping twice to get the
            # specific match.group(), which caused some errors.
            index_shift = 0
            start_tags = "\n</rede><rede><redner>"
            end_tags = "</redner>"
            markup_logging()
            for match in self.matches:
                index_start = match.start() + index_shift
                index_end = match.end() + index_shift
                whole_match_len = len(match.group())
                # Handles cases where lots of text before the actual
                # speaker is matched.
                linebrks_in_match = len(match.group().split("\n"))
                if linebrks_in_match >= 2:
                    last_part_match = "".join(match.group().split("\n")[1:])
                    first_line_of_match = match.group().split("\n")[0]
                    if len(first_line_of_match.split()) <= 10:
                        match = first_line_of_match + last_part_match
                    else:
                        match = last_part_match

                    # Only tag the trailing part of the match.
                    delta_start_index = whole_match_len - len(match)
                    index_start = index_start + delta_start_index

                    self.string_to_search = (self.string_to_search[:index_start]
                                             + start_tags
                                             + match
                                             + end_tags
                                             + self.string_to_search[index_end:]
                                             )
                    index_shift += len(start_tags) + len(end_tags)
                else:
                    self.string_to_search = (self.string_to_search[:index_start]
                                             + start_tags
                                             + match.group()
                                             + end_tags
                                             + self.string_to_search[index_end:]
                                             )
                    index_shift += len(start_tags) + len(end_tags)
            self.markuped_string = self.string_to_search

        elif case == "last":
            index_shift = 0
            # Matches the end of the session to add the last closing
            # <rede> tag to the last speech for well-formed XML.  Uses
            # re.sub because it is only one operation.
            end_tag = "</rede>"
            session_close_time_tag = "<sitzungsende/>"
            if len(self.matches) == 1:
                self.logger.info("Last speech successfully tagged.")
                markup_logging()
                for match in self.matches:
                    end_xml = end_tag + match.group() + session_close_time_tag
                    if len(match.group().split()) <= 15:
                        self.string_to_search = self.regex_compiled.sub(
                            end_xml, self.string_to_search, count=1)
                    self.markuped_string = self.string_to_search

            elif len(self.matches) == 0:
                self.logger.warning(("No end of session found! Last tag "
                                     + end_tag
                                     + " will be added to the end of the protocol."
                                     " This might add some unrelated text to the "
                                     "last speech."))
                markup_logging()
                self.markuped_string = self.string_to_search + end_tag

            else:
                markup_logging()
                self.logger.warning(("There are " + str(len(self.matches))
                                     + " session endings. Ignoring the endings"
                                     + " before the last final ending of the "
                                     + " session."))
                match = self.matches[-1]
                end_xml = end_tag + match.group() + session_close_time_tag
                whole_match_len = len(match.group())
                index_start = match.start() + index_shift
                index_end = match.end() + index_shift
                # Always take the last line of the match, avoiding lots
                # of text before the actual speaker.
                last_line = match.group().split("\n")[-1]
                delta_start_index = whole_match_len - len(last_line)
                index_start = index_start + delta_start_index
                self.string_to_search = (self.string_to_search[:index_start]
                                         + end_xml
                                         + self.string_to_search[index_end:])
                index_shift += len(end_tag)
                self.markuped_string = self.string_to_search
|
554
bundesdata_markup_nlp/markup/SpeakerNameMarkup.py
Executable file
554
bundesdata_markup_nlp/markup/SpeakerNameMarkup.py
Executable file
@ -0,0 +1,554 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from markup.SpeakerMarkup import SpeakerMarkup
|
||||
from xml.etree import ElementTree
|
||||
from lxml import etree
|
||||
from tqdm import tqdm
|
||||
from itertools import combinations
|
||||
import copy
|
||||
import logging
|
||||
import re
|
||||
import os
|
||||
|
||||
|
||||
class SpeakerNameMarkup(SpeakerMarkup):
|
||||
"""
|
||||
This class is for the complex markup of the speakers in one given protocol.
|
||||
Creates the name tag with all needed inforamtion from the Stammdatenbank.
|
||||
Has to cross reference the speaker with said Stammdatenbank.
|
||||
"""
|
||||
known_redner_dicts = dict()
|
||||
last_wahlperiode = int()
|
||||
|
||||
def __init__(self, file_path, element_name=".//redner"):
|
||||
super(SpeakerNameMarkup).__init__()
|
||||
self.file_path = file_path
|
||||
self.filename = os.path.basename(self.file_path)[:-4]
|
||||
self.element_name = element_name
|
||||
self.redner_dict = dict()
|
||||
self.all_speakers = []
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def cross_reference_markup(self, strings, feature_set_dict,
|
||||
MdB_etree):
|
||||
"""
|
||||
Checks if features like name, surename academic title and city are
|
||||
present in the input string. Consists of main function and helper
|
||||
functions. First the string will be split in tokens. Every token will
|
||||
be checked a gainst sets of valid names, surnames, academic titles and
|
||||
fractions. If there is a match a dictionary entriy will be set
|
||||
accordingly.
|
||||
Also uses the add_missing_MdB_feature helper function in a second step
|
||||
to add features which are not present in the string or have been
|
||||
identified wrongly.
|
||||
The function crates a dictionary containing all features of one speaker
|
||||
to crate a valid XML element from it later on.
|
||||
"""
|
||||
|
||||
def initiate_dict(keys, extra_keys):
|
||||
"""
|
||||
Creates a dictionarie with a set of keys and sets them to None.
|
||||
Some specific key values will be set to specific values.
|
||||
"""
|
||||
for key in keys:
|
||||
redner_dict[key] = None
|
||||
for key in extra_keys:
|
||||
redner_dict[key] = None
|
||||
redner_dict["feature_complete"] = False
|
||||
redner_dict["original_string"] = string
|
||||
redner_dict["identified"] = False
|
||||
redner_dict["damalige_fraktion"] = None
|
||||
|
||||
def get_names(keys, dict, token):
|
||||
"""
|
||||
Checks if token is in set vorname or nachname. If it is dictionary
|
||||
values will be set accordingly. Avoids that surname will be
|
||||
overwirtten by a name wich is also a valid surname.
|
||||
"""
|
||||
for key in keys[0:2]: # Only for vorname, nachname in written order
|
||||
if(token in feature_set_dict[key][0] and redner_dict[key]
|
||||
is None):
|
||||
redner_dict[key] = token
|
||||
elif(token in feature_set_dict["nachname"][0]
|
||||
and redner_dict["nachname"] is not None):
|
||||
redner_dict["nachname"] = token
|
||||
else:
|
||||
continue
|
||||
|
||||
def get_feature(key, string, set):
|
||||
"""
|
||||
Checks if a token is a valid feature (like name affix or academic
|
||||
title, ortszusatz or namenszusatz) and adds it to the dictionary.
|
||||
Does not check for names.
|
||||
"""
|
||||
for feature in set:
|
||||
if(key == "titel"):
|
||||
regex = r"(\b{}\B)".format(re.escape(feature)) # could be Dr. and . is not a word boundary.
|
||||
elif(key is "namenszusatz"):
|
||||
regex = r"\b({})\b".format(re.escape(feature)) # No . in word so word boundary at start and end of regex.
|
||||
elif(key is "fraktion"):
|
||||
regex = r"\B(\({}\))\B".format(re.escape(feature)) # always surrounded by parentheses, but also has to match them to avoid matching i. e. "CDU" in "CDU/CSU"
|
||||
elif(key is "ortszusatz"):
|
||||
regex = r"\B{}\B".format(re.escape(feature)) # always surrounded by parentheses
|
||||
else:
|
||||
regex = r"(\b{}\b)".format(re.escape(feature))
|
||||
match = re.search(regex, string)
|
||||
if(match):
|
||||
if(key == "fraktion"):
|
||||
redner_dict[key] = match.group()[1:-1] # removes ()
|
||||
break
|
||||
else:
|
||||
redner_dict[key] = match.group()
|
||||
break
|
||||
else:
|
||||
redner_dict[key] = None
|
||||
|
||||
def get_role(string):
|
||||
"""Checks redner string for role. Identifies 'Bundesministerin für
|
||||
Familie, Senioren, Frauen und Jugend' etc."""
|
||||
if("Staatssekretär" in string or "Staatssekretärin" in string):
|
||||
regex = r"(Staatssekretär(in)?)"
|
||||
splits = re.split(regex, string, maxsplit=1)
|
||||
role_long = splits[1] + splits[-1]
|
||||
redner_dict["rolle_lang"] = role_long
|
||||
role_short = [word[0] for word in role_long.split()
|
||||
if word[0].isupper()]
|
||||
role_short = splits[1] + " " + "".join(role_short)
|
||||
redner_dict["rolle_kurz"] = role_short
|
||||
elif("Bundesminister" in string or "Bundesministerin" in string):
|
||||
regex = r"(Bundesminister(in)?)"
|
||||
splits = re.split(regex, string, maxsplit=1)
|
||||
role_long = splits[1] + splits[-1]
|
||||
redner_dict["rolle_lang"] = role_long
|
||||
role_short = [word[0] for word in role_long.split()
|
||||
if word[0].isupper()]
|
||||
role_short = splits[1] + " " + "".join(role_short)
|
||||
redner_dict["rolle_kurz"] = role_short
|
||||
|
||||
def check_name(redner_dict):
|
||||
"""
|
||||
Checks if vorname and nachname are the same. Sets vorname to None if
|
||||
True. Vorname will be set later on with add_missing_MdB_feature.
|
||||
"""
|
||||
if(redner_dict["nachname"] == redner_dict["vorname"]):
|
||||
redner_dict["vorname"] = None
|
||||
|
||||
def get_party(redner_dict):
|
||||
"""
|
||||
Creates a party key in the dictionary containing the party of the
|
||||
speaker. Party is not the same as fraction. This is mainly done
|
||||
because CDU/CSU is the fraction in the bundestag but speakers can
|
||||
belong to either the CDU or CSU. If the fraction is not CDU/CSU
|
||||
party will be set to fraction. Also handels problems with GRÜNE.
|
||||
"""
|
||||
if(redner_dict["fraktion"] != "CDU/CSU"
|
||||
and redner_dict["fraktion"] != "CDU"
|
||||
and redner_dict["fraktion"] != "CSU"):
|
||||
redner_dict["partei"] = redner_dict["fraktion"]
|
||||
elif(redner_dict["fraktion"] == "CDU"
|
||||
or redner_dict["fraktion"] == "CSU"):
|
||||
redner_dict["partei"] = redner_dict["fraktion"]
|
||||
redner_dict["fraktion"] = "CDU/CSU"
|
||||
if(redner_dict["fraktion"] == "GRÜNE"):
|
||||
redner_dict["fraktion"] = "BÜNDNIS 90/DIE GRÜNEN"
|
||||
|
||||
def check_party_and_fraction():
|
||||
"""
|
||||
Checks if party and fraction have been set correctly. Will be used
|
||||
after add_missing_MdB_feature. To correct some errors with CDU/CSU.
|
||||
"""
|
||||
if(redner_dict["fraktion"] is not None
|
||||
and redner_dict["partei"] == "CDU"
|
||||
or redner_dict["partei"] == "CSU"):
|
||||
redner_dict["fraktion"] = "CDU/CSU"
|
||||
|
||||
if(redner_dict["partei"] is None
|
||||
and redner_dict["fraktion"] is not None
|
||||
and redner_dict["fraktion"] != "CDU"
|
||||
and redner_dict["fraktion"] != "CSU"):
|
||||
redner_dict["partei"] = redner_dict["fraktion"]
|
||||
|
||||
def get_match_in_str(key, string, regex):
|
||||
"""
|
||||
Matches a regex in the current string and adds it as a value to the
|
||||
given key into the dictionary.
|
||||
"""
|
||||
match = re.search(regex, string)
|
||||
if(match):
|
||||
redner_dict[key] = match.group()
|
||||
else:
|
||||
redner_dict[key] = None
|
||||
|
||||
def add_missing_MdB_feature(string, redner_dict, feature_set_dict,
|
||||
MdB_etree, conditions_key_list,
|
||||
feature_lookup, feature_to_add,
|
||||
logging_state=False, multi_ids=False):
|
||||
"""
|
||||
This function trys to get missing features for on speaker. Input is
|
||||
a list of features(conditions_key_list) which are used as parameters
|
||||
in an xpath expression. The Xpath is built dynamically from the
|
||||
list.
|
||||
If the Xpath matches one unique entry the feature(feature_to_add)
|
||||
will be set to the match of feature_lookup in the matched element.
|
||||
"""
|
||||
###
|
||||
# Xpath creation from conditions_key_list
|
||||
###
|
||||
xpath_parts = []
|
||||
conds = conditions_key_list
|
||||
len_conds = len(conds)
|
||||
if(len_conds == 1):
|
||||
for condition in conds:
|
||||
xpath_part = ".//MDB[.//{}/text()='{}']" \
|
||||
.format(feature_set_dict[condition][1],
|
||||
redner_dict[condition])
|
||||
xpath_parts.append(xpath_part)
|
||||
xpath = "".join(xpath_parts)
|
||||
if("None" in xpath):
|
||||
xpath = None
|
||||
elif(len_conds == 2):
|
||||
xpath_first_part = ".//MDB[.//{}/text()='{}'" \
|
||||
.format(feature_set_dict[conds[0]][1],
|
||||
redner_dict[conds[0]])
|
||||
xpath_parts.insert(0, xpath_first_part)
|
||||
xpath_last_part = ".//{}/text()='{}']" \
|
||||
.format(feature_set_dict[conds[-1]][1],
|
||||
redner_dict[conds[-1]])
|
||||
xpath_parts.append(xpath_last_part)
|
||||
xpath = " and ".join(xpath_parts)
|
||||
if("None" in xpath):
|
||||
xpath = None
|
||||
elif(len_conds > 2):
|
||||
xpath_first_part = ".//MDB[.//{}/text()='{}'" \
|
||||
.format(feature_set_dict[conds[0]][1],
|
||||
redner_dict[conds[0]])
|
||||
xpath_parts.insert(0, xpath_first_part)
|
||||
for condition in conds[1:-1]:
|
||||
xpath_inner_part = ".//{}/text()='{}'" \
|
||||
.format(feature_set_dict[condition][1],
|
||||
redner_dict[condition])
|
||||
xpath_parts.append(xpath_inner_part)
|
||||
xpath_last_part = ".//{}/text()='{}']" \
|
||||
.format(feature_set_dict[conds[-1]][1],
|
||||
redner_dict[conds[-1]])
|
||||
xpath_parts.append(xpath_last_part)
|
||||
xpath = " and ".join(xpath_parts)
|
||||
if("None" in xpath): # sets xpaths to None if it uses a feature which is None
|
||||
xpath = None
|
||||
xpath_parts = [] # empties xpath_parts list
|
||||
try: # tries every xpath
|
||||
matches = MdB_etree.xpath(xpath)
|
||||
except TypeError: # handles xpaths that are None
|
||||
matches = []
|
||||
# If xpath has unique match new feature value will be set to given feature
|
||||
if(len(matches) == 1):
|
||||
matches = matches[0]
|
||||
feature_lookup = ".//" + feature_lookup
|
||||
new_feature = matches.xpath(feature_lookup)[0].text
|
||||
self.logger.info((" There is one unique match "
|
||||
+ " for this speaker: "
|
||||
+ str(redner_dict)
|
||||
+ " Extracted feature "
|
||||
+ feature_lookup + ": "
|
||||
+ str(new_feature)
|
||||
+ " with: "
|
||||
+ str(conds)))
|
||||
redner_dict[feature_to_add] = new_feature
|
||||
self.logger.info(("New speaker features are: "
|
||||
+ str(redner_dict)))
|
||||
# Handels mathches tha are not unique for logging and mutli id
|
||||
elif(len(matches) > 1):
|
||||
self.logger.warning((" There are "
|
||||
+ str(len(matches))
|
||||
+ " matches for this speaker: "
|
||||
+ str(redner_dict)
|
||||
+ " .Could not extract: "
|
||||
+ feature_lookup
|
||||
+ " Features used are: "
|
||||
+ str(conds)))
|
||||
elif(len(matches) > 1 and multi_ids is True):
|
||||
ids = matches
|
||||
for id, i in ids, enumerate(ids):
|
||||
key = "id" + i
|
||||
redner_dict[key] = id
|
||||
return matches
|
||||
|
||||
def get_periode(MdB_etree):
|
||||
periode = self.xml_tree.xpath(".//wahlperiode")
|
||||
if(periode):
|
||||
redner_dict["wahlperiode"] = periode[0].text
|
||||
return periode[0].text
|
||||
|
||||
###
|
||||
# Start of main function cross_reference_markup
|
||||
###
|
||||
|
||||
# Initiates empty dict and gets keys for it
|
||||
redner_dict = dict()
|
||||
features = list(feature_set_dict.keys())
|
||||
|
||||
# Counters to calculate how successful the identification of speakers is
|
||||
identified_speakers = 0
|
||||
unidentified_speakers = 0
|
||||
multiple_identified_speakers = 0
|
||||
|
||||
# Cross references every <redner> string
|
||||
for string in tqdm(strings, desc="Cross reference name markup for speakers in strings"):
|
||||
self.logger.info("\nStarting name markup process for new speaker:")
|
||||
# Sets values in redner_dict to None or specific value
|
||||
initiate_dict(features, [feature for feature in features])
|
||||
tokens = string.replace(":", "").replace(",", "").split() # replaces ":" and "," with nothing because some names would be "name:" and some names would contain a ","
|
||||
for token in tokens:
|
||||
get_names(features, feature_set_dict, token)
|
||||
self.logger.info("nachname is: " + str(redner_dict["nachname"]))
|
||||
feature_keys = [key for key in features if key not in ["vorname",
|
||||
"nachname"]]
|
||||
for f_key in feature_keys:
|
||||
get_feature(f_key, string, feature_set_dict[f_key][0])
|
||||
get_party(redner_dict)
|
||||
check_name(redner_dict)
|
||||
regex_p = r"^\w*(?:P|p)räsident\w*"
|
||||
get_match_in_str("präsident", string, regex_p)
|
||||
get_role(string)
|
||||
|
||||
###
|
||||
# Checks if script is still running for the same current periode.
|
||||
# If this is not the case the known_redner_dicts will be emptied.
|
||||
###
|
||||
current_wahlperiode = get_periode(MdB_etree)
|
||||
if(current_wahlperiode != SpeakerNameMarkup.last_wahlperiode):
|
||||
SpeakerNameMarkup.known_redner_dicts = dict()
|
||||
SpeakerNameMarkup.last_wahlperiode = current_wahlperiode
|
||||
|
||||
###
|
||||
# Creates possible combinations of features which will be used in
|
||||
# add_missing_MdB_feature to identify missing features like vorname or
|
||||
# nachname.
|
||||
###
|
||||
|
||||
combination_features = [feature for feature in features if feature
|
||||
not in ["namenszusatz",
|
||||
"feature_complete",
|
||||
"id",
|
||||
"titel",
|
||||
"rolle_kurz",
|
||||
"rolle_lang",
|
||||
"original_string",
|
||||
"identified",
|
||||
"damalige_fraktion"]]
|
||||
subsets = []
|
||||
for length in range(0, 5):
|
||||
for subset in combinations(combination_features, length):
|
||||
subsets.append(list(subset))
|
||||
subsets = subsets[1:]
|
||||
combination_features.remove("wahlperiode")
|
||||
combination_features.remove("nachname")
|
||||
|
||||
###
|
||||
# First while loop trying to identify every feature for one speaker.
|
||||
# Uses combinations from above. Before calling the function
|
||||
# add_missing_MdB_feature there is a check if the speaker has alreeady
|
||||
# been identified before. If this is the case features will be set to
|
||||
# the already identfied features. This saves a lot of time.
|
||||
###
|
||||
|
||||
counter_feats = 0
|
||||
while(redner_dict["feature_complete"] is False):
|
||||
redner_dict["damalige_fraktion"] = redner_dict["fraktion"]
|
||||
# print("Doing name markup for:", redner_dict)
|
||||
# Checks if speaker has been already identified before.
|
||||
if(string in SpeakerNameMarkup.known_redner_dicts):
|
||||
# print("Speaker has already been identified once.")
|
||||
redner_dict = SpeakerNameMarkup.known_redner_dicts[string].copy()
|
||||
# print("Speaker features are set to:",
|
||||
# SpeakerNameMarkup.known_redner_dicts[string])
|
||||
redner_dict["identified"] = True
|
||||
self.logger.info(("Speaker has alreeady been identified "
|
||||
+ "once."))
|
||||
self.logger.info(("Speaker features are set to: "
|
||||
+ str(SpeakerNameMarkup.known_redner_dicts[string])))
|
||||
if(SpeakerNameMarkup.known_redner_dicts[string]["feature_complete"] is not False):
|
||||
identified_speakers += 1
|
||||
break
|
||||
else:
|
||||
for feature in combination_features:
|
||||
for subset in subsets:
|
||||
add_missing_MdB_feature(string,
|
||||
redner_dict,
|
||||
feature_set_dict,
|
||||
MdB_etree,
|
||||
subset,
|
||||
feature_set_dict[feature][1],
|
||||
feature)
|
||||
check_party_and_fraction()
|
||||
if(redner_dict["vorname"] is not None
|
||||
and redner_dict["nachname"] is not None
|
||||
and redner_dict["fraktion"] is not None
|
||||
and redner_dict["partei"] is not None):
|
||||
redner_dict["feature_complete"] = True
|
||||
counter_feats += 1
|
||||
if(counter_feats == len(combination_features)):
|
||||
redner_dict["feature_complete"] = False
|
||||
break
|
||||
|
||||
###
|
||||
# Second while loop uses four features to identfie the unique ID for one
|
||||
# speaker with add_missing_MdB_feature. Also tries to identfie speakers
|
||||
# with lesser known features. In this case there can be multiple possile
|
||||
# ids for one speaker these will be saved in a special dictionary entry.
|
||||
# Rare case.
|
||||
###
|
||||
|
||||
counter_ids = 0
|
||||
while(redner_dict["id"] is None):
|
||||
if(redner_dict["feature_complete"] is True):
|
||||
add_missing_MdB_feature(string,
|
||||
redner_dict,
|
||||
feature_set_dict,
|
||||
MdB_etree,
|
||||
["vorname", "nachname", "partei",
|
||||
"wahlperiode"],
|
||||
feature_set_dict["id"][1],
|
||||
"id")
|
||||
key_original_string = redner_dict["original_string"]
|
||||
SpeakerNameMarkup.known_redner_dicts.update(
|
||||
{key_original_string: redner_dict.copy()})
|
||||
redner_dict["identified"] = True
|
||||
if(counter_ids == 1):
|
||||
redner_dict["id"] = None
|
||||
redner_dict["feature_complete"] = False
|
||||
redner_dict["identified"] = False
|
||||
self.logger.warning(("Unique ID could not be assigned. "
|
||||
+ "Feature complete: True "
|
||||
+ "Features are: "
|
||||
+ str(redner_dict)))
|
||||
SpeakerNameMarkup.known_redner_dicts.update(
|
||||
{key_original_string: redner_dict.copy()})
|
||||
unidentified_speakers += 1
|
||||
identified_speakers -= 1 # because identified_speakers was set before
|
||||
break
|
||||
identified_speakers += 1
|
||||
elif(redner_dict["feature_complete"] is not True):
|
||||
redner_dict["id"] = None
|
||||
ids = add_missing_MdB_feature(string,
|
||||
redner_dict,
|
||||
feature_set_dict,
|
||||
MdB_etree,
|
||||
["nachname", "partei",
|
||||
"wahlperiode"],
|
||||
feature_set_dict["id"][1],
|
||||
"id", False, True)
|
||||
if(ids is not None and len(ids) > 1):
|
||||
redner_dict["identified"] = "Multiple"
|
||||
multiple_identified_speakers += 1
|
||||
identified_speakers -= 1
|
||||
break
|
||||
elif(ids is None):
|
||||
self.logger.warning(("Unique ID could not be assigned. "
|
||||
+ "Feature complete: False "
|
||||
+ "Features are: "
|
||||
+ str(redner_dict)))
|
||||
redner_dict["identified"] = False
|
||||
unidentified_speakers += 1
|
||||
break
|
||||
counter_ids += 1
|
||||
|
||||
self.logger.info(("Number of identified speakers with valid id and"
|
||||
+ " name markup is: "
|
||||
+ str(identified_speakers)))
|
||||
self.logger.info(("Number of unidentified speakers without valid"
|
||||
+ " id and name markup is: "
|
||||
+ str(unidentified_speakers)))
|
||||
self.logger.info(("Number of speakers with possible multiple ids: "
|
||||
+ str(multiple_identified_speakers)))
|
||||
self.logger.info(("Number of all speaker entitiys in current"
|
||||
+ " protocoll is: "
|
||||
+ str(len(strings))))
|
||||
redner_dict_final = copy.deepcopy(redner_dict)
|
||||
self.redner_dict = redner_dict_final
|
||||
self.all_speakers.append(self.redner_dict)
|
||||
for key in features:
|
||||
redner_dict[key] = None
|
||||
|
||||
# print("Speaker features after whole cross reference markup:",
|
||||
# redner_dict_final)
|
||||
self.logger.info(("Saved speakers (identfied and not identified): "
|
||||
+ str(len(self.all_speakers))))
|
||||
|
||||
def create_speaker_elements(self):
|
||||
"""
|
||||
Creates a valid redner XML element for one redner_dict entry from the
|
||||
list self.all_speakers. Has to be done step by step becuase dictionary
|
||||
is not sorted and name sub elements have to be in specific order.
|
||||
"""
|
||||
self.all_speaker_elements = []
|
||||
for redner_entry in tqdm(self.all_speakers, desc="Creating speaker element"):
|
||||
redner_element = etree.Element("redner")
|
||||
redner_element.set("id", str(redner_entry["id"]))
|
||||
name_element = etree.Element("name")
|
||||
titel_element = etree.Element("titel")
|
||||
titel_element.text = redner_entry["titel"]
|
||||
vorname_element = etree.Element("vorname")
|
||||
vorname_element.text = redner_entry["vorname"]
|
||||
namenszusatz_element = etree.Element("namenszusatz")
|
||||
namenszusatz_element.text = redner_entry["namenszusatz"]
|
||||
nachname_element = etree.Element("nachname")
|
||||
nachname_element.text = redner_entry["nachname"]
|
||||
damalige_fraktion_element = etree.Element("damalige_fraktion")
|
||||
damalige_fraktion_element.text = redner_entry["damalige_fraktion"]
|
||||
fraktion_element = etree.Element("fraktion")
|
||||
fraktion_element.text = redner_entry["fraktion"]
|
||||
partei_element = etree.Element("partei")
|
||||
partei_element.text = redner_entry["partei"]
|
||||
ortszusatz_element = etree.Element("ortszusatz")
|
||||
ortszusatz_element.text = redner_entry["ortszusatz"]
|
||||
rolle_lang_element = etree.Element("rolle_lang")
|
||||
rolle_lang_element.text = redner_entry["rolle_lang"]
|
||||
rolle_kurz_element = etree.Element("rolle_kurz")
|
||||
rolle_kurz_element.text = redner_entry["rolle_kurz"]
|
||||
original_string_element = etree.Element("original_string")
|
||||
original_string_element.text = redner_entry["original_string"]
|
||||
|
||||
if(redner_entry["titel"] is not None):
|
||||
name_element.append(titel_element)
|
||||
name_element.append(vorname_element)
|
||||
if(redner_entry["namenszusatz"] is not None):
|
||||
name_element.append(namenszusatz_element)
|
||||
name_element.append(nachname_element)
|
||||
name_element.append(damalige_fraktion_element)
|
||||
name_element.append(fraktion_element)
|
||||
name_element.append(partei_element)
|
||||
if(redner_entry["ortszusatz"] is not None):
|
||||
name_element.append(ortszusatz_element)
|
||||
if(redner_entry["rolle_lang"] is not None):
|
||||
name_element.append(rolle_lang_element)
|
||||
name_element.append(rolle_kurz_element)
|
||||
name_element.append(original_string_element)
|
||||
name_element.tail = original_string_element.text
|
||||
redner_element.append(name_element)
|
||||
self.all_speaker_elements.append(redner_element)
|
||||
self.logger.info(("Speaker element is: "
|
||||
+ ElementTree.tostring(redner_element).decode("utf-8")))
|
||||
|
||||
def set_speech_ids(self):
|
||||
"""
|
||||
This functions sets a unique rede id for every rede element in one
|
||||
protocoll. Id is a ten digit integer preceded by the string ID.
|
||||
Example: ID1809900000
|
||||
First two digits are the wahlperiode the followinf three digits are the
|
||||
sitzungsnr (session number). The remaining digits are for counting the
|
||||
speeches. First speech is 00100, second is 00200, eleventh is 01100 and so on.
|
||||
Example: ID1809901100 --> eleventh speech
|
||||
Last tow digits are for corrections.
|
||||
"""
|
||||
|
||||
id_counter = 000
|
||||
speeches = self.xml_tree.xpath(".//sitzungsbeginn | .//rede")
|
||||
for speech in tqdm(speeches, desc="Creating speech ids"):
|
||||
id_counter_str = str(id_counter).zfill(5)
|
||||
id = "ID" + self.filename + id_counter_str
|
||||
speech.set("id", id)
|
||||
id_counter += 100
|
||||
self.logger.info(("Speech id is: " + id))
|
||||
self.xml_tree = self.xml_tree
|
0
bundesdata_markup_nlp/markup/__init__.py
Executable file
0
bundesdata_markup_nlp/markup/__init__.py
Executable file
Binary file not shown.
BIN
bundesdata_markup_nlp/markup/__pycache__/MdBData.cpython-37.pyc
Normal file
BIN
bundesdata_markup_nlp/markup/__pycache__/MdBData.cpython-37.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
bundesdata_markup_nlp/markup/__pycache__/__init__.cpython-37.pyc
Normal file
BIN
bundesdata_markup_nlp/markup/__pycache__/__init__.cpython-37.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
bundesdata_markup_nlp/markup/__pycache__/metadata.cpython-37.pyc
Normal file
BIN
bundesdata_markup_nlp/markup/__pycache__/metadata.cpython-37.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
bundesdata_markup_nlp/markup/__pycache__/speakers.cpython-37.pyc
Normal file
BIN
bundesdata_markup_nlp/markup/__pycache__/speakers.cpython-37.pyc
Normal file
Binary file not shown.
BIN
bundesdata_markup_nlp/markup/__pycache__/speeches.cpython-37.pyc
Normal file
BIN
bundesdata_markup_nlp/markup/__pycache__/speeches.cpython-37.pyc
Normal file
Binary file not shown.
49
bundesdata_markup_nlp/markup/beautify_markup.py
Executable file
49
bundesdata_markup_nlp/markup/beautify_markup.py
Executable file
@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from utility.FileGetter import FileGetter
|
||||
from utility.XMLProtocol import XMLProtocol
|
||||
import configparser
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def beautify_xml(case, alter_lines=False, line_width=0):
    """
    Beautifies the XML protocols so that they are easily readable by humans.

    Uses XMLProtocol.beautify_xml_part() and .beautify_xml() to be able to
    format lines for specific parts of an XML.  alter_lines can be set to
    False or True.  The line width used when alter_lines is True can be any
    value between 0 and 160.

    :param case: "markup" or "nlp"; selects input/output paths from config.ini
    :param alter_lines: whether lines of the sitzungsverlauf are re-wrapped
    :param line_width: target width used when alter_lines is True
    :raises ValueError: if case is not "markup" or "nlp"
    """
    config = configparser.ConfigParser()
    config.read("config.ini")
    if(case == "markup"):
        output_path = config["File paths"]["output_folder"]
        input_path = config["File paths"]["clear_speech_markup"]
        key_name = "beautiful_xml"
    elif(case == "nlp"):
        output_path = config["File paths"]["nlp_output"]
        input_path = config["File paths"]["nlp_lemmatized_tokenized"]
        key_name = "nlp_beuatiful_xml"  # key spelling matches config.ini
    else:
        # BUGFIX: formerly fell through with undefined paths -> NameError
        raise ValueError("case must be 'markup' or 'nlp', got: " + repr(case))
    files = FileGetter(input_path, "*.xml")
    files = files.get_files()
    for file_path in tqdm(sorted(files), desc="First beautification steps"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        xml.beautify_xml_part(file_path, ".//vorspann")
        xml.replace_elements(".//vorspann", [xml.beautified_part])
        xml.beautify_xml_part(file_path, ".//sitzungsverlauf", alter_lines,
                              line_width)
        xml.replace_elements(".//sitzungsverlauf", [xml.beautified_part])
        xml.save_to_file(output_path, file_path, key_name,
                         "File paths", key_name)
    # re-read config: save_to_file wrote the new path under key_name
    config.read("config.ini")
    beautiful_xmls_path = config["File paths"][key_name]
    files = FileGetter(beautiful_xmls_path, "*.xml")
    files = files.get_files()
    for file_path in tqdm(files, desc="Second beautification steps"):
        # NOTE(review): reuses the last `xml` instance from the first loop;
        # fails with NameError if the first loop processed no files — confirm
        # whether beautify_xml() is effectively static on XMLProtocol.
        xml.beautify_xml(file_path, False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # BUGFIX: beautify_xml() was called without the required `case`
    # argument, which raised TypeError on direct execution; default to the
    # markup pipeline here.
    beautify_xml("markup")
|
57
bundesdata_markup_nlp/markup/metadata.py
Executable file
57
bundesdata_markup_nlp/markup/metadata.py
Executable file
@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from utility.FileGetter import FileGetter
|
||||
from markup.MetadataMarkup import MetadataMarkup
|
||||
from tqdm import tqdm
|
||||
import os
|
||||
import configparser
|
||||
import logging
|
||||
|
||||
|
||||
def get_metadata():
    """
    Creates a valid metadata head and first level XML tag structure for all
    files in one directory (with subdirs).

    File paths are collected with the FileGetter class.  For each file the
    given metadata is extracted and written back as valid XML according to
    the new official schema into a new file at the configured output path.
    """
    logger = logging.getLogger(__name__)
    print("Running metadata creation for original XML-protocolls.")
    config = configparser.ConfigParser()
    config.read("config.ini")
    input_path = config["File paths"]["input_folder_xmls"]
    output_path = config["File paths"]["output_folder"]
    files = FileGetter(input_path, "*.xml")
    file_list = files.get_files()
    metadata = MetadataMarkup()
    for file in tqdm(sorted(file_list), desc="Metadata status:"):
        logger.info("\nCreating metadata for: " + str(os.path.basename(file)))
        root = metadata.read_protcol(file)
        metadata.extract_metadata(root)
        metadata.built_iso_date(metadata.datum_ger_non_iso)
        metadata.built_date_string(metadata.datum_iso)
        metadata.delete_old_metadata(root)
        metadata.split_content(root)
        metadata.insert_new_metadata(root)
        metadata.get_session_times()
        metadata.write_to_attr("dbtplenarprotokoll", "sitzung-datum",
                               metadata.datum_ger_non_iso)
        metadata.write_to_attr("dbtplenarprotokoll", "sitzung-start-uhrzeit",
                               metadata.session_start_time)
        # BUGFIX: two of these calls used the misspelled element name
        # "dbtplenarprotokol" (missing final "l"), so those attributes
        # targeted a nonexistent element; all five now use the same name.
        metadata.write_to_attr("dbtplenarprotokoll", "sitzung-ende-uhrzeit",
                               metadata.session_end_time)
        metadata.write_to_attr("dbtplenarprotokoll", "sitzungs-nr",
                               metadata.sitzungsnr)
        metadata.write_to_attr("dbtplenarprotokoll", "wahlperiode",
                               metadata.wahlperiode)
        metadata.save_to_file(output_path, file, "new_metadata", "File paths",
                              "new_metadata")
        logger.info("New metadata created for: " + str(os.path.basename(file)))
    print("Succesfully extracted and wrote new metadata to XML-protocolls.")
|
||||
|
||||
|
||||
# Allow running this module directly as a standalone script.
if __name__ == '__main__':
    get_metadata()
|
122
bundesdata_markup_nlp/markup/speaker_names.py
Executable file
122
bundesdata_markup_nlp/markup/speaker_names.py
Executable file
@ -0,0 +1,122 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from markup.SpeakerNameMarkup import SpeakerNameMarkup
|
||||
from markup.MdBData import MdBData
|
||||
from utility.FileGetter import FileGetter
|
||||
from xml.etree import ElementTree
|
||||
from tqdm import tqdm
|
||||
import os
|
||||
import configparser
|
||||
import logging
|
||||
|
||||
|
||||
def get_names():
    """
    Analyses the text of the previously identified <redner> elements to
    determine <vorname>, <nachname>, @id etc. for every speaker, and
    creates a speech id for every speech.

    Builds sets of valid name features from the MdB master data
    (Stammdatenbank) plus extra values from config.ini, then runs the
    cross reference markup of SpeakerNameMarkup over every protocol.
    """
    ###
    # Read paths from config and start logging
    ###
    logger = logging.getLogger(__name__)
    config = configparser.ConfigParser()
    config.read("config.ini")
    xml_path = config["File paths"]["new_simple_markup"]
    output_path = config["File paths"]["output_folder"]
    parent_path = os.path.dirname(os.getcwd())
    stammdatenbank_full_path = os.path.join(parent_path,
                                            "data/MdB_data/MdB_Stammdaten.xml")
    ###
    # Open and read the Stammdatenbank (MdB master data)
    ###
    stammdatenbank = MdBData()
    stammdatenbank.read_xml(stammdatenbank_full_path)
    ###
    # Build sets of the different name/MdB features
    ###
    # first names
    first_names = stammdatenbank.get_set(".//VORNAME", stammdatenbank.xml_tree)
    first_names.discard(None)
    # last names
    last_names = stammdatenbank.get_set(".//NACHNAME", stammdatenbank.xml_tree)
    last_names.discard(None)
    # academic titles (long and short forms, plus extras from config.ini)
    academic_titles = stammdatenbank.get_set(".//AKAD_TITEL",
                                             stammdatenbank.xml_tree)
    academic_titles_short = stammdatenbank.get_set(".//ANREDE_TITEL",
                                                   stammdatenbank.xml_tree)
    additional_academic_titles = config["Additional name features"]["academic_titles"].split()
    for title in additional_academic_titles:
        academic_titles.add(title)
    academic_titles = academic_titles.union(academic_titles_short)
    academic_titles.discard(None)
    # parties (plus extras from config.ini)
    parties = stammdatenbank.get_set(".//PARTEI_KURZ", stammdatenbank.xml_tree)
    additional_parties = config["Additional name features"]["parties"].split()
    for party_name in additional_parties:
        parties.add(party_name)
    parties.discard(None)
    # name affixes
    name_affixes = stammdatenbank.get_set(".//PRAEFIX", stammdatenbank.xml_tree)
    name_affixes.discard(None)
    # cities
    cities = stammdatenbank.get_set(".//ORTSZUSATZ", stammdatenbank.xml_tree)
    cities.discard(None)
    # empty sets, later combined with XML node names for the XPath lookups
    party = set()
    periode = set()
    feature_complete = set()
    speaker_id = set()
    role_long = set()
    role_short = set()
    ###
    # dict mapping feature name -> (value set, XML node name in master data)
    ###
    sets = [(first_names, "VORNAME"), (last_names, "NACHNAME"),
            (academic_titles, "AKAD_TITEL"), (parties, "PARTEI_KURZ"),
            (name_affixes, "PRAEFIX"), (cities, "ORTSZUSATZ"),
            (party, "PARTEI_KURZ"), (periode, "WP"), (feature_complete, "None"),
            (speaker_id, "ID"), (role_long, "None"), (role_short, "None")]
    features = ["vorname", "nachname", "titel", "fraktion", "namenszusatz",
                "ortszusatz", "partei", "wahlperiode", "feature_complete",
                "id", "rolle_lang", "rolle_kurz"]
    feature_set_dict = dict(zip(features, sets))
    ###
    # Open the XML protocols and run the speaker markup
    ###
    files = FileGetter(xml_path, "*.xml")
    files = files.get_files()
    for file_path in tqdm(sorted(files),
                          desc="File status"):
        complex_speaker = SpeakerNameMarkup(file_path, ".//redner")
        complex_speaker.read_xml(file_path)
        complex_speaker.get_element_text()
        logger.info(("Doing cross reference markup for names to get redner ids."
                     + " For file: "
                     + os.path.basename(file_path)))
        complex_speaker.cross_reference_markup(complex_speaker.current_strings,
                                               feature_set_dict,
                                               stammdatenbank.xml_tree)
        complex_speaker.create_speaker_elements()
        complex_speaker.replace_elements(".//redner",
                                         complex_speaker.all_speaker_elements,
                                         True)
        xml_string = ElementTree.tostring(complex_speaker.xml_tree)
        # renamed from "bool", which shadowed the builtin
        is_well_formed = complex_speaker.simple_check_xml(xml_string, file_path,
                                                          False, False)
        if(is_well_formed is False):
            logger.error(("This XML file is not well-formed. Program stopped."
                          " Fix or remove this file an run the program again."
                          ))
            print("Program has stopped. See logs for more info.")
            break
        complex_speaker.set_speech_ids()
        complex_speaker.save_to_file(output_path, file_path, "complex_markup",
                                     "File paths", "complex_markup")
|
||||
|
||||
|
||||
# Allow running this module directly as a standalone script.
if __name__ == '__main__':
    get_names()
|
114
bundesdata_markup_nlp/markup/speakers.py
Executable file
114
bundesdata_markup_nlp/markup/speakers.py
Executable file
@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from utility.FileGetter import FileGetter
|
||||
from utility.XMLProtocol import XMLProtocol
|
||||
from markup.EntityMarkup import EntityMarkup
|
||||
from markup.SpeakerMarkup import SpeakerMarkup
|
||||
from tqdm import tqdm
|
||||
import configparser
|
||||
import logging
|
||||
import os
|
||||
|
||||
|
||||
def get_speakers():
    """
    This script identifies speakers in one xml with the new metadata structure
    created by metastructure.py and applies well-formed XML markup to them and
    their speeches. The markup trys to follow the official guideline from the
    Deutsche Bundesregierung but is more simplistic and deviates from it when
    it comes down to apply markup to the presiden of a session. This decision
    was made to guarantee that every speakers speech only contains what he or
    she is saying. Thus the markup follows the own minimal markup defined in
    the DTD 'minimal_markup.dtd' which trys to mimic the official one as close
    as possible. The full offical markup cannot be applied to the XML
    protocolls automatically. Script uses classes and subclasses from
    EntityMarkup.py.
    """
    logger = logging.getLogger(__name__)
    print("Running simple markup for first speaker identification.")
    config = configparser.ConfigParser()
    config.read("config.ini")
    # Each config value has the shape "regex ; case ; role" -> split into
    # triples (presumably; verify against config.ini -- TODO confirm).
    regex_conf_triples = config.items("Regular expressions speakers")
    regex_conf_triples = [regex[1].split(" ; ") for regex in regex_conf_triples]
    input_path = config["File paths"]["new_metadata"]
    output_path = config["File paths"]["output_folder"]
    files = FileGetter(input_path, "*.xml")
    file_list = files.get_files()
    sum_matches = 0

    for file_path in tqdm(sorted(file_list), desc="Speaker markup status"):
        identified = EntityMarkup(file_path)
        logger.info("Doing simple markup for: " + str(os.path.basename(file_path)))
        logger.info("\nMarkup status for: " + str(os.path.basename(file_path)))
        # Explicit encoding: the string is re-encoded as UTF-8 right below, so
        # decoding with the platform default could corrupt the round trip.
        with open(file_path, 'r', encoding="utf-8") as f:
            xml_as_string = f.read()
        xml_as_bytes = xml_as_string.encode("utf-8")
        # Renamed from `bool`, which shadowed the builtin type.
        is_well_formed = identified.simple_check_xml(xml_as_bytes, file_path,
                                                     False, False)
        if is_well_formed is False:
            logger.error(("This XML file is not well-formed. Program stopped."
                          " Fix or remove this file an run the program again."
                          ))
            print("Program has stopped. See logs for more info.")
            break
        identified.read_xml(file_path)
        identified.get_element_text()
        string_for_markup = identified.current_string
        # Start of simple markup: apply every speaker regex in turn, feeding
        # each marked-up result into the next pass.
        for regex_conf_triplet in regex_conf_triples:
            regex = regex_conf_triplet[0]
            case = regex_conf_triplet[1]
            speaker = SpeakerMarkup(string_for_markup, regex)
            speaker.compile_regex(regex)
            speaker.identify_speaker()
            speaker.markup_speaker(case)
            string_for_markup = speaker.markuped_string
            sum_matches += speaker.matches_count

        logger.info(str(sum_matches) + " total matches in the protocol.")
        sum_matches = 0
        speaker.simple_check_xml(string_for_markup, file_path, False)
        # Saving simple markuped string to xml
        speaker.read_xml(file_path)
        speaker.replace_string(string_for_markup, "sitzungsverlauf")
        speaker.save_to_file(output_path, file_path, "simple_xml", "File paths",
                             "new_simple_markup")

    print("Simple markup finished.")

    config.read("config.ini")
    new_simple_xml_path = config["File paths"]["new_simple_markup"]
    # Start of president Replacer
    new_files = FileGetter(new_simple_xml_path, "*.xml")
    new_file_list = new_files.get_files()
    print("Replacing some XML-elements in the protocolls.")
    for file_path in tqdm(sorted(new_file_list), desc="Files replacement status"):
        logger.info("Replacing some xml elements for: " + str(os.path.basename(file_path)))
        # Read the protocol once and accumulate every expansion on the same
        # in-memory tree (the original re-read the file per triple and relied
        # on a save/re-read round trip to accumulate changes).
        replacements = XMLProtocol()
        replacements.read_xml(file_path)
        for regex_conf_triplet in regex_conf_triples:
            # BUG FIX: the original test used `or`
            # (`!= "first" or != "last"`), which is always true and therefore
            # processed every triple; `not in` skips the "first"/"last" cases
            # as the condition clearly intended.
            if regex_conf_triplet[1] not in ("first", "last"):
                regex = regex_conf_triplet[0]
                speaker_rolle_value = regex_conf_triplet[2]
                replacements.compile_regex(regex)
                replacements.expand_element(".//rede", "typ",
                                            speaker_rolle_value)
        replacements.save_to_file(output_path, file_path, "simple_xml",
                                  "File paths", "new_simple_markup")
        # Carry the session start/end times from the root element onto the
        # proper child elements.
        start_time_attr_value = replacements.xml_tree.get("sitzung-start-uhrzeit")
        replacements.replace_tag_attr(".//sitzungsverlauf/rede[1]",
                                      "sitzungsbeginn",
                                      "sitzung-start-uhrzeit",
                                      start_time_attr_value,
                                      False)
        end_time_attr_value = replacements.xml_tree.get("sitzung-ende-uhrzeit")
        replacements.expand_element(".//sitzungsende", "sitzung-ende-uhrzeit",
                                    end_time_attr_value, False)
        replacements.save_to_file(output_path, file_path, "simple_xml",
                                  "File paths", "new_simple_markup")
|
||||
|
||||
|
||||
# Allow running this module directly as a script.
if __name__ == '__main__':
    get_speakers()
|
76
bundesdata_markup_nlp/markup/speeches.py
Executable file
76
bundesdata_markup_nlp/markup/speeches.py
Executable file
@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from utility.FileGetter import FileGetter
|
||||
from markup.EntityMarkup import EntityMarkup
|
||||
import configparser
|
||||
from tqdm import tqdm
|
||||
import logging
|
||||
|
||||
def markup_speeches():
    """
    Marks up different entitys in the speech strings. For example comments.
    First it marks speech parts (<p>) line by line.
    """
    logger = logging.getLogger(__name__)
    config = configparser.ConfigParser()
    config.read("config.ini")
    complex_xmls = config["File paths"]["complex_markup"]
    output_path = config["File paths"]["output_folder"]
    # Config values are " ; "-separated: "regex ; element" for single line
    # entities, "start ; end ; element" for multiline ones -- TODO confirm.
    regex_conf_pairs = [item[1].split(" ; ")
                        for item in config.items("Regular expressions speeches")]
    multiline_entities = [item[1].split(" ; ")
                          for item in config.items("Multiline entities")]
    file_list = FileGetter(complex_xmls, "*.xml").get_files()
    for file_path in tqdm(sorted(file_list), desc="File status speech markup"):
        entity = EntityMarkup(file_path)
        entity.read_xml(file_path)
        # Wrap every line of every speech, and of the session opening, in <p>.
        for speech in entity.xml_tree.xpath(".//rede"):
            entity.markup_speech_lines(speech)
        entity.markup_speech_lines(entity.xml_tree.xpath(".//sitzungsbeginn")[0])

        # Single line entities: inject one element per matching regex.
        for line in tqdm(entity.xml_tree.xpath(".//p"),
                         desc="Marking single line entities"):
            for pair in regex_conf_pairs:
                entity.inject_element(line, pair[0], pair[1])

        # Re-query: the single line pass altered the <p> elements.
        session_lines = entity.xml_tree.xpath(".//p")
        for triple in tqdm(multiline_entities, desc="Marking multiline entities:"):
            entity.get_multiline_entities(session_lines, triple[0], triple[1],
                                          triple[2])

        # For logging: count how many elements of each kind were marked.
        all_entities = 0
        only_single_line_entities = 0
        for pair in regex_conf_pairs:
            nr_entities = len(entity.xml_tree.xpath(".//" + pair[1]))
            logger.info(f"Number of identified {pair[1]} elements is: "
                        f"{nr_entities} (single line)")
            all_entities += nr_entities
            only_single_line_entities += nr_entities

        for triple in multiline_entities:
            nr_entities = len(entity.xml_tree.xpath(".//" + triple[2]))
            logger.info(f"Number of identified {triple[2]} elements is: "
                        f"{nr_entities} (multi line)")
            all_entities += nr_entities

        logger.info("Number of all identified single line entities: "
                    + str(only_single_line_entities))
        logger.info(f"Number of all identified entities is: {all_entities}"
                    " Also includes multiline matches. Number could be higher"
                    " than it is if multiline matches are matching the same"
                    " like the single line entitie regexes.")

        entity.save_to_file(output_path, file_path, "clear_speech_markup",
                            "File paths", "clear_speech_markup")
|
||||
|
||||
|
||||
# Allow running this module directly as a script.
if __name__ == '__main__':
    markup_speeches()
|
Reference in New Issue
Block a user