115 lines
5.4 KiB
Python
115 lines
5.4 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from utility.FileGetter import FileGetter
|
||
|
from utility.XMLProtocol import XMLProtocol
|
||
|
from markup.EntityMarkup import EntityMarkup
|
||
|
from markup.SpeakerMarkup import SpeakerMarkup
|
||
|
from tqdm import tqdm
|
||
|
import configparser
|
||
|
import logging
|
||
|
import os
|
||
|
|
||
|
|
||
|
def _apply_speaker_markup(logger, regex_conf_triples, input_path, output_path):
    """
    Applies the configured speaker regexes to every XML file in input_path.

    Each file is first checked for well-formedness. Then every configured
    (regex, case, ...) entry is applied in order, each one working on the
    string produced by the previous one. The marked up string is written
    back into the "sitzungsverlauf" element and the file is saved.

    Returns True when all files were processed and False as soon as one
    input file is reported as not well-formed.
    """
    file_list = FileGetter(input_path, "*.xml").get_files()

    for file_path in tqdm(sorted(file_list), desc="Speaker markup status"):
        file_name = os.path.basename(file_path)
        identified = EntityMarkup(file_path)
        logger.info("Doing simple markup for: %s", file_name)
        logger.info("\nMarkup status for: %s", file_name)

        # Decode explicitly as UTF-8: the content is re-encoded as UTF-8 right
        # away, so relying on the locale's default encoding would corrupt or
        # reject non-ASCII characters on systems with another default.
        # The handle is only needed for this read and is closed immediately.
        with open(file_path, "r", encoding="utf-8") as xml_file:
            xml_as_bytes = xml_file.read().encode("utf-8")

        is_well_formed = identified.simple_check_xml(xml_as_bytes, file_path,
                                                     False, False)
        if is_well_formed is False:
            logger.error(("This XML file is not well-formed. Program stopped."
                          " Fix or remove this file an run the program again."
                          ))
            print("Program has stopped. See logs for more info.")
            # Really stop here. Merely leaving the loop would let the caller
            # continue with the replacement phase although the messages above
            # announce that the program has stopped.
            return False

        identified.read_xml(file_path)
        identified.get_element_text()
        string_for_markup = identified.current_string

        # Start of simple markup. The match counter is kept per protocol.
        sum_matches = 0
        for regex_conf_triplet in regex_conf_triples:
            regex = regex_conf_triplet[0]
            case = regex_conf_triplet[1]
            speaker = SpeakerMarkup(string_for_markup, regex)
            speaker.compile_regex(regex)
            speaker.identify_speaker()
            speaker.markup_speaker(case)
            # Chain the regexes: the next one sees the already marked up text.
            string_for_markup = speaker.markuped_string
            sum_matches += speaker.matches_count

        logger.info("%s total matches in the protocol.", sum_matches)
        # The instance of the last applied regex is reused for checking and
        # saving the final string.
        speaker.simple_check_xml(string_for_markup, file_path, False)

        # Saving simple markuped string to xml
        speaker.read_xml(file_path)
        speaker.replace_string(string_for_markup, "sitzungsverlauf")
        speaker.save_to_file(output_path, file_path, "simple_xml", "File paths",
                             "new_simple_markup")

    return True


def _replace_speech_elements(logger, regex_conf_triples, new_simple_xml_path,
                             output_path):
    """
    Post-processes the marked up protocols found in new_simple_xml_path.

    For every file the "typ" value of each regex entry whose case is neither
    "first" nor "last" is handed to XMLProtocol.expand_element for the "rede"
    elements. Afterwards the first "rede" of the "sitzungsverlauf" is turned
    into "sitzungsbeginn" and the "sitzungsende" element is expanded, both
    using the time values stored as attributes on the XML root.
    """
    new_file_list = FileGetter(new_simple_xml_path, "*.xml").get_files()
    print("Replacing some XML-elements in the protocolls.")

    for file_path in tqdm(sorted(new_file_list),
                          desc="Files replacement status"):
        logger.info("Replacing some xml elements for: %s",
                    os.path.basename(file_path))

        replacements = None
        for regex_conf_triplet in regex_conf_triples:
            # Skip the entries for the first and last speaker. This used to be
            # written as `!= "first" or != "last"`, which is true for every
            # value and therefore never skipped anything.
            # NOTE(review): confirm that no later step relies on the "typ"
            # attribute having been set by the "first"/"last" entries.
            if regex_conf_triplet[1] not in ("first", "last"):
                regex = regex_conf_triplet[0]
                speaker_rolle_value = regex_conf_triplet[2]
                # A fresh instance re-reads the file for every regex, so it
                # presumably picks up what the previous iteration saved
                # (assumes save_to_file writes to the location file_path was
                # read from -- TODO confirm).
                replacements = XMLProtocol()
                replacements.read_xml(file_path)
                replacements.compile_regex(regex)
                replacements.expand_element(".//rede", "typ",
                                            speaker_rolle_value)
                replacements.save_to_file(output_path, file_path, "simple_xml",
                                          "File paths", "new_simple_markup")

        if replacements is None:
            # No regex entry was applied (none configured, or all of them are
            # "first"/"last"); the steps below still need a parsed protocol.
            replacements = XMLProtocol()
            replacements.read_xml(file_path)

        start_time_attr_value = replacements.xml_tree.get(
            "sitzung-start-uhrzeit")
        replacements.replace_tag_attr(".//sitzungsverlauf/rede[1]",
                                      "sitzungsbeginn",
                                      "sitzung-start-uhrzeit",
                                      start_time_attr_value,
                                      False)
        end_time_attr_value = replacements.xml_tree.get("sitzung-ende-uhrzeit")
        replacements.expand_element(".//sitzungsende", "sitzung-ende-uhrzeit",
                                    end_time_attr_value, False)
        replacements.save_to_file(output_path, file_path, "simple_xml",
                                  "File paths", "new_simple_markup")


def get_speakers():
    """
    Identifies the speakers in the XML protocols that carry the new metadata
    structure created by metastructure.py and applies well-formed XML markup
    to them and their speeches. The markup tries to follow the official
    guideline from the Deutsche Bundesregierung but is more simplistic and
    deviates from it when it comes to the markup of the president of a
    session. This decision was made to guarantee that every speaker's speech
    only contains what he or she is saying. Thus the markup follows the own
    minimal markup defined in the DTD 'minimal_markup.dtd', which tries to
    mimic the official one as closely as possible. The full official markup
    cannot be applied to the XML protocols automatically.

    All settings are read from "config.ini" in the current working directory:
    the section "Regular expressions speakers" holds one entry per regex in
    the form "regex ; case ; typ value", the section "File paths" holds the
    input folder ("new_metadata") and the output folder ("output_folder").

    The function works in two phases. First the speaker markup is applied to
    every input file. Then "config.ini" is read again to get the
    "new_simple_markup" path and some elements of the files found there are
    replaced or expanded. If an input file is not well-formed, an error is
    logged and the function returns without running the second phase.

    Uses classes and subclasses from EntityMarkup.py.
    """
    logger = logging.getLogger(__name__)
    print("Running simple markup for first speaker identification.")
    config = configparser.ConfigParser()
    config.read("config.ini")
    # Every option value has the form "regex ; case ; typ value".
    regex_conf_triples = [
        option_value.split(" ; ")
        for _, option_value in config.items("Regular expressions speakers")
    ]
    input_path = config["File paths"]["new_metadata"]
    output_path = config["File paths"]["output_folder"]

    if not _apply_speaker_markup(logger, regex_conf_triples, input_path,
                                 output_path):
        return

    print("Simple markup finished.")

    # Read the configuration again: "new_simple_markup" is looked up only now,
    # presumably because save_to_file stores that path in config.ini during
    # the first phase -- TODO confirm in save_to_file.
    config.read("config.ini")
    new_simple_xml_path = config["File paths"]["new_simple_markup"]
    # Start of president Replacer
    _replace_speech_elements(logger, regex_conf_triples, new_simple_xml_path,
                             output_path)
|
||
|
|
||
|
|
||
|
# Run the speaker markup only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    get_speakers()
|