#!/usr/bin/env python
# -*- coding: utf-8 -*-

import configparser
import logging

from tqdm import tqdm

from utility.FileGetter import FileGetter
from markup.EntityMarkup import EntityMarkup


def _read_split_values(config, section):
    """Return every value of config *section*, split on the " ; " separator."""
    return [value.split(" ; ") for _, value in config.items(section)]


def _count_elements(entity, tag):
    """Return how many elements named *tag* the entity's XML tree contains."""
    return len(entity.xml_tree.xpath(".//" + tag))


def _log_entity_counts(logger, entity, regex_conf_pairs, multiline_entities):
    """Log per-element and total counts of the entities found in one file."""
    single_line_total = 0
    for pair in regex_conf_pairs:
        tag = pair[1]
        count = _count_elements(entity, tag)
        logger.info("Number of identified %s elements is: %s (single line)",
                    tag, count)
        single_line_total += count

    all_total = single_line_total
    for pair in multiline_entities:
        tag = pair[2]
        count = _count_elements(entity, tag)
        logger.info("Number of identified %s elements is: %s (multi line)",
                    tag, count)
        all_total += count

    logger.info("Number of all identified single line entities: %s",
                single_line_total)
    logger.info("Number of all identified entities is: %s"
                " Also includes multiline matches. Number could be higher"
                " than it is if multiline matches are matching the same"
                " like the single line entitie regexes.",
                all_total)


def markup_speeches():
    """
    Mark up entities (for example comments) inside the speech texts.

    Settings are read from "config.ini":

    * "File paths" / "complex_markup": folder searched for "*.xml" input.
    * "File paths" / "output_folder": passed on to ``save_to_file``.
    * "Regular expressions speeches": each value is split on " ; "; the
      first two parts are passed to ``EntityMarkup.inject_element`` and the
      second part is also the element name counted for logging.
    * "Multiline entities": each value is split on " ; "; the first three
      parts are passed to ``EntityMarkup.get_multiline_entities`` and the
      third part is also the element name counted for logging.

    For every input file (in sorted order) the speech lines of each ``rede``
    element and of the first ``sitzungsbeginn`` element are marked up, then
    the single line and multiline entities are processed on the ``p``
    elements, the entity counts are logged and the result is saved.

    A file without a ``sitzungsbeginn`` element no longer aborts the whole
    run with an IndexError; a warning is logged and that step is skipped.
    """
    logger = logging.getLogger(__name__)
    config = configparser.ConfigParser()
    config.read("config.ini")
    complex_xmls = config["File paths"]["complex_markup"]
    output_path = config["File paths"]["output_folder"]
    regex_conf_pairs = _read_split_values(config,
                                          "Regular expressions speeches")
    multiline_entities = _read_split_values(config, "Multiline entities")

    files = FileGetter(complex_xmls, "*.xml")
    file_list = files.get_files()
    for file_path in tqdm(sorted(file_list), desc="File status speech markup"):
        entity = EntityMarkup(file_path)
        entity.read_xml(file_path)
        speeches = entity.xml_tree.xpath(".//rede")
        # Queried before the speeches are processed, as in the original flow.
        session_starts = entity.xml_tree.xpath(".//sitzungsbeginn")
        for speech in speeches:
            entity.markup_speech_lines(speech)
        if session_starts:
            entity.markup_speech_lines(session_starts[0])
        else:
            logger.warning("No sitzungsbeginn element found in %s;"
                           " skipping session start markup.", file_path)

        session_lines = entity.xml_tree.xpath(".//p")
        for line in tqdm(session_lines, desc="Marking single line entities"):
            for pair in regex_conf_pairs:
                entity.inject_element(line, pair[0], pair[1])

        # Query the p elements again so the multiline pass works on the
        # lines as altered by the single line pass above.
        session_lines = entity.xml_tree.xpath(".//p")
        for pair in tqdm(multiline_entities,
                         desc="Marking multiline entities:"):
            entity.get_multiline_entities(session_lines, pair[0], pair[1],
                                          pair[2])

        _log_entity_counts(logger, entity, regex_conf_pairs,
                           multiline_entities)

        entity.save_to_file(output_path, file_path, "clear_speech_markup",
                            "File paths", "clear_speech_markup")


if __name__ == '__main__':
    markup_speeches()