#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.FileGetter import FileGetter
from utility.XMLProtocol import XMLProtocol
from markup.EntityMarkup import EntityMarkup
from markup.SpeakerMarkup import SpeakerMarkup
from tqdm import tqdm
import configparser
import logging
import os


def get_speakers():
    """
    Identify speakers in the protocol XML files and mark up their speeches.

    The function works in two phases, both driven by "config.ini":

    1. Simple markup: every "*.xml" file found under the "new_metadata"
       path is checked for well-formedness. The text obtained from the
       file is then run through every expression configured in the
       section "Regular expressions speakers". The resulting string
       replaces the content of the "sitzungsverlauf" element and is saved
       via ``save_to_file`` under the "new_simple_markup" config key.
    2. Element replacement: every file of the "new_simple_markup" path is
       re-read. The "rede" elements are expanded with a "typ" attribute
       per configured expression, the first "rede" of "sitzungsverlauf"
       is replaced by "sitzungsbeginn" and "sitzungsende" is expanded with
       the end time of the session.

    Each entry of "Regular expressions speakers" is split on " ; " into a
    triple ``regex ; case ; role`` (the role is only used in phase 2).

    The markup tries to follow the official guideline of the Deutsche
    Bundesregierung but is more simplistic and deviates from it when it
    comes to the president of a session. This guarantees that a speech
    only contains what its speaker is saying. The markup therefore follows
    the minimal markup defined in the DTD 'minimal_markup.dtd', which
    mimics the official one as closely as possible. The full official
    markup cannot be applied to the XML protocols automatically.

    The function takes no arguments and returns None. It stops early
    (after logging an error) if no speaker expression is configured or if
    an input file is not well-formed.
    """
    logger = logging.getLogger(__name__)
    print("Running simple markup for first speaker identification.")
    config = configparser.ConfigParser()
    config.read("config.ini")
    regex_conf_triples = [
        value.split(" ; ")
        for _, value in config.items("Regular expressions speakers")
    ]
    if not regex_conf_triples:
        # Without any expression the markup loop below would never create
        # a SpeakerMarkup object and the save step would fail with a
        # NameError, so stop with an explicit message instead.
        logger.error("No speaker regular expressions are configured in"
                     " the section 'Regular expressions speakers'.")
        print("Program has stopped. See logs for more info.")
        return
    input_path = config["File paths"]["new_metadata"]
    output_path = config["File paths"]["output_folder"]
    files = FileGetter(input_path, "*.xml")
    file_list = files.get_files()

    for file_path in tqdm(sorted(file_list), desc="Speaker markup status"):
        file_name = os.path.basename(file_path)
        identified = EntityMarkup(file_path)
        logger.info("Doing simple markup for: %s", file_name)
        logger.info("\nMarkup status for: %s", file_name)
        # The content is encoded as UTF-8 right below, so decode it as
        # UTF-8 as well instead of relying on the platform default.
        with open(file_path, 'r', encoding="utf-8") as f:
            xml_as_string = f.read()
        xml_as_bytes = xml_as_string.encode("utf-8")
        is_well_formed = identified.simple_check_xml(xml_as_bytes, file_path,
                                                     False, False)
        if is_well_formed is False:
            logger.error(("This XML file is not well-formed. Program stopped."
                          " Fix or remove this file an run the program again."
                          ))
            print("Program has stopped. See logs for more info.")
            # Really stop here: continuing with the replacement phase
            # would work on an incomplete set of marked-up files.
            return
        identified.read_xml(file_path)
        identified.get_element_text()
        string_for_markup = identified.current_string

        # Start of simple markup: every expression works on the output of
        # the previous one.
        sum_matches = 0
        for regex_conf_triplet in regex_conf_triples:
            regex = regex_conf_triplet[0]
            case = regex_conf_triplet[1]
            speaker = SpeakerMarkup(string_for_markup, regex)
            speaker.compile_regex(regex)
            speaker.identify_speaker()
            speaker.markup_speaker(case)
            string_for_markup = speaker.markuped_string
            sum_matches += speaker.matches_count
        logger.info("%s total matches in the protocol.", sum_matches)

        speaker.simple_check_xml(string_for_markup, file_path, False)
        # Saving simple markuped string to xml
        speaker.read_xml(file_path)
        speaker.replace_string(string_for_markup, "sitzungsverlauf")
        speaker.save_to_file(output_path, file_path, "simple_xml",
                             "File paths", "new_simple_markup")
    print("Simple markup finished.")

    # Re-read the config to pick up the "new_simple_markup" entry
    # (presumably written by save_to_file above -- TODO confirm).
    config.read("config.ini")
    new_simple_xml_path = config["File paths"]["new_simple_markup"]
    # Start of president Replacer
    new_files = FileGetter(new_simple_xml_path, "*.xml")
    new_file_list = new_files.get_files()
    print("Replacing some XML-elements in the protocolls.")
    for file_path in tqdm(sorted(new_file_list),
                          desc="Files replacement status"):
        logger.info("Replacing some xml elements for: %s",
                    os.path.basename(file_path))
        replacements = None
        for regex_conf_triplet in regex_conf_triples:
            # The original test `!= "first" or != "last"` was true for
            # every value; skipping both special cases is what the
            # condition spells out.
            if regex_conf_triplet[1] not in ("first", "last"):
                regex = regex_conf_triplet[0]
                speaker_rolle_value = regex_conf_triplet[2]
                replacements = XMLProtocol()
                replacements.read_xml(file_path)
                replacements.compile_regex(regex)
                replacements.expand_element(".//rede", "typ",
                                            speaker_rolle_value)
                replacements.save_to_file(output_path, file_path,
                                          "simple_xml", "File paths",
                                          "new_simple_markup")
        if replacements is None:
            # Every expression was a "first"/"last" case, so nothing has
            # loaded the file yet for the session start/end handling.
            replacements = XMLProtocol()
            replacements.read_xml(file_path)
        start_time_attr_value = replacements.xml_tree.get(
            "sitzung-start-uhrzeit")
        replacements.replace_tag_attr(".//sitzungsverlauf/rede[1]",
                                      "sitzungsbeginn",
                                      "sitzung-start-uhrzeit",
                                      start_time_attr_value,
                                      False)
        end_time_attr_value = replacements.xml_tree.get(
            "sitzung-ende-uhrzeit")
        replacements.expand_element(".//sitzungsende",
                                    "sitzung-ende-uhrzeit",
                                    end_time_attr_value,
                                    False)
        replacements.save_to_file(output_path, file_path, "simple_xml",
                                  "File paths", "new_simple_markup")


if __name__ == '__main__':
    get_speakers()