bundesdata_markup_nlp_software/bundesdata_markup_nlp/markup/speakers.py

115 lines
5.4 KiB
Python
Raw Permalink Normal View History

2019-02-21 18:29:44 +00:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from utility.FileGetter import FileGetter
from utility.XMLProtocol import XMLProtocol
from markup.EntityMarkup import EntityMarkup
from markup.SpeakerMarkup import SpeakerMarkup
from tqdm import tqdm
import configparser
import logging
import os
def get_speakers():
    """
    Identify speakers in the XML protocols and mark up them and their speeches.

    Works on the XML files with the new metadata structure created by
    metastructure.py and applies well-formed XML markup to the speakers and
    their speeches. The markup tries to follow the official guideline from the
    Deutsche Bundesregierung but is more simplistic and deviates from it when
    it comes to marking up the president of a session. This decision was made
    to guarantee that every speaker's speech only contains what he or she is
    saying. Thus the markup follows the project's own minimal markup defined
    in the DTD 'minimal_markup.dtd', which tries to mimic the official one as
    closely as possible. The full official markup cannot be applied to the XML
    protocols automatically. Uses classes and subclasses from EntityMarkup.py.

    The function runs in two phases:

    1. For every "*.xml" file returned by FileGetter for the configured
       "new_metadata" path, the file is checked with simple_check_xml(), each
       configured speaker regex is applied in turn through SpeakerMarkup, and
       the resulting string is put into the "sitzungsverlauf" element and
       saved.
    2. For every "*.xml" file returned by FileGetter for the configured
       "new_simple_markup" path, each configured regex is compiled and
       expand_element() is called for ".//rede" with the attribute "typ";
       afterwards the "sitzung-start-uhrzeit" and "sitzung-ende-uhrzeit"
       attributes of the XML tree are handed to replace_tag_attr() and
       expand_element(), and the file is saved again.

    Configuration is read from "config.ini", resolved relative to the current
    working directory. The section "Regular expressions speakers" must hold
    values of the form "regex ; case ; role" (split on " ; ").

    Returns:
        None. The function returns early, before the second phase, if no
        speaker regexes are configured or if an input file fails the
        well-formedness check.
    """
    logger = logging.getLogger(__name__)
    print("Running simple markup for first speaker identification.")
    config = configparser.ConfigParser()
    config.read("config.ini")
    # Every option value is split into [regex, case, role].
    regex_conf_triples = [
        value.split(" ; ")
        for _, value in config.items("Regular expressions speakers")
    ]
    input_path = config["File paths"]["new_metadata"]
    output_path = config["File paths"]["output_folder"]

    # Both phases reuse the object created for the *last* regex after their
    # inner loop; without any regex that object would not exist and the
    # function would fail with an unhelpful UnboundLocalError.
    if not regex_conf_triples:
        logger.error("No speaker regular expressions are configured in"
                     " config.ini. Nothing to mark up.")
        print("Program has stopped. See logs for more info.")
        return

    file_list = FileGetter(input_path, "*.xml").get_files()

    for file_path in tqdm(sorted(file_list), desc="Speaker markup status"):
        file_name = os.path.basename(file_path)
        identified = EntityMarkup(file_path)
        logger.info("Doing simple markup for: %s", file_name)
        logger.info("\nMarkup status for: %s", file_name)
        # Decode explicitly as UTF-8: the content is re-encoded as UTF-8 right
        # away, so relying on the platform's default encoding would corrupt
        # non-ASCII characters on systems with a different locale.
        with open(file_path, "r", encoding="utf-8") as f:
            xml_as_bytes = f.read().encode("utf-8")
        is_well_formed = identified.simple_check_xml(xml_as_bytes, file_path,
                                                     False, False)
        if is_well_formed is False:
            logger.error(("This XML file is not well-formed. Program stopped."
                          " Fix or remove this file an run the program again."
                          ))
            print("Program has stopped. See logs for more info.")
            # Return instead of merely leaving the loop: the messages above
            # promise that the program stops, and the second phase would
            # otherwise run on incomplete output.
            return
        identified.read_xml(file_path)
        identified.get_element_text()
        string_for_markup = identified.current_string

        # Start of simple markup: every regex works on the output of the
        # previous one.
        sum_matches = 0
        for regex_conf_triplet in regex_conf_triples:
            regex = regex_conf_triplet[0]
            case = regex_conf_triplet[1]
            speaker = SpeakerMarkup(string_for_markup, regex)
            speaker.compile_regex(regex)
            speaker.identify_speaker()
            speaker.markup_speaker(case)
            string_for_markup = speaker.markuped_string
            sum_matches += speaker.matches_count
        logger.info("%s total matches in the protocol.", sum_matches)

        # The SpeakerMarkup object of the last regex is reused to check and
        # save the fully marked-up string.
        speaker.simple_check_xml(string_for_markup, file_path, False)
        # Saving simple markuped string to xml
        speaker.read_xml(file_path)
        speaker.replace_string(string_for_markup, "sitzungsverlauf")
        speaker.save_to_file(output_path, file_path, "simple_xml",
                             "File paths", "new_simple_markup")

    print("Simple markup finished.")

    # The configuration is read again before "new_simple_markup" is looked up;
    # presumably save_to_file() records that path in config.ini - TODO confirm.
    config.read("config.ini")
    new_simple_xml_path = config["File paths"]["new_simple_markup"]

    # Start of president Replacer
    new_file_list = FileGetter(new_simple_xml_path, "*.xml").get_files()
    print("Replacing some XML-elements in the protocolls.")
    for file_path in tqdm(sorted(new_file_list),
                          desc="Files replacement status"):
        logger.info("Replacing some xml elements for: %s",
                    os.path.basename(file_path))
        for regex_conf_triplet in regex_conf_triples:
            # NOTE(review): the original guard here was
            #     case != "first" or case != "last"
            # which is true for every value, so every triple has always been
            # processed. That behaviour is kept. If "first"/"last" entries
            # were meant to be skipped, this needs an explicit
            # `case not in ("first", "last")` check - confirm against the
            # entries in config.ini before changing it.
            regex = regex_conf_triplet[0]
            speaker_rolle_value = regex_conf_triplet[2]
            replacements = XMLProtocol()
            replacements.read_xml(file_path)
            replacements.compile_regex(regex)
            replacements.expand_element(".//rede", "typ",
                                        speaker_rolle_value)
            replacements.save_to_file(output_path, file_path, "simple_xml",
                                      "File paths", "new_simple_markup")

        # Once per file, on the tree loaded for the last regex: pass the
        # session start and end times stored as attributes of the tree on to
        # the first "rede" element and to "sitzungsende".
        start_time_attr_value = replacements.xml_tree.get(
            "sitzung-start-uhrzeit")
        replacements.replace_tag_attr(".//sitzungsverlauf/rede[1]",
                                      "sitzungsbeginn",
                                      "sitzung-start-uhrzeit",
                                      start_time_attr_value,
                                      False)
        end_time_attr_value = replacements.xml_tree.get(
            "sitzung-ende-uhrzeit")
        replacements.expand_element(".//sitzungsende", "sitzung-ende-uhrzeit",
                                    end_time_attr_value, False)
        replacements.save_to_file(output_path, file_path, "simple_xml",
                                  "File paths", "new_simple_markup")
# Run the speaker markup only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    get_speakers()