bundesdata_markup_nlp_software/bundesdata_markup_nlp/markup/speeches.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.FileGetter import FileGetter
from markup.EntityMarkup import EntityMarkup
import configparser
from tqdm import tqdm
import logging

def markup_speeches():
    """
    Marks up different entities in the speech strings. For example comments.

    Settings are read from "config.ini" (resolved relative to the current
    working directory). For every "*.xml" file returned by FileGetter for the
    configured "complex_markup" path the function:

    1. marks the speech parts (<p>) line by line for every <rede> element
       and for the first <sitzungsbeginn> element,
    2. injects the single line entities configured in the section
       "Regular expressions speeches" into every <p> element,
    3. marks the multiline entities configured in the section
       "Multiline entities",
    4. logs how many elements of each configured tag exist in the tree,
    5. hands the result to EntityMarkup.save_to_file.

    Config values of both regex sections are split on " ; ". Single line
    entries need at least two parts, multiline entries at least three;
    their exact meaning is defined by EntityMarkup.inject_element and
    EntityMarkup.get_multiline_entities (not visible from this module).

    A file without a <sitzungsbeginn> element no longer aborts the whole run
    with an IndexError: a warning naming the file is logged and the rest of
    the file is processed normally.

    Raises:
        KeyError: if the section "File paths" or one of its keys is missing
            (also the case when "config.ini" could not be read at all).
        configparser.NoSectionError: if one of the regex sections is missing.
    """
    logger = logging.getLogger(__name__)
    config = configparser.ConfigParser()
    config.read("config.ini")
    complex_xmls = config["File paths"]["complex_markup"]
    output_path = config["File paths"]["output_folder"]
    # Only the option values are needed; the option names are ignored.
    regex_conf_pairs = [
        value.split(" ; ")
        for _, value in config.items("Regular expressions speeches")
    ]
    multiline_entities = [
        value.split(" ; ")
        for _, value in config.items("Multiline entities")
    ]
    files = FileGetter(complex_xmls, "*.xml")
    file_list = files.get_files()
    for file_path in tqdm(sorted(file_list), desc="File status speech markup"):
        entity = EntityMarkup(file_path)
        entity.read_xml(file_path)
        speeches = entity.xml_tree.xpath(".//rede")
        # Keep the raw result list so a missing element can be detected
        # instead of failing on a blind [0] access.
        session_starts = entity.xml_tree.xpath(".//sitzungsbeginn")
        for speech in speeches:
            entity.markup_speech_lines(speech)
        if session_starts:
            entity.markup_speech_lines(session_starts[0])
        else:
            logger.warning("No sitzungsbeginn element found in %s. Skipping"
                           " the markup of the session start.", file_path)

        session_lines = entity.xml_tree.xpath(".//p")
        for line in tqdm(session_lines, desc="Marking single line entities"):
            for pair in regex_conf_pairs:
                entity.inject_element(line, pair[0], pair[1])

        # Query again to get the new altered session lines (<p>).
        session_lines = entity.xml_tree.xpath(".//p")
        for pair in tqdm(multiline_entities, desc="Marking multiline entities:"):
            entity.get_multiline_entities(session_lines, pair[0], pair[1], pair[2])

        # For logging
        all_entities = 0
        only_single_line_entities = 0
        for pair in regex_conf_pairs:
            nr_entities = len(entity.xml_tree.xpath(".//" + pair[1]))
            logger.info("Number of identified %s elements is: %s (single line)",
                        pair[1], nr_entities)
            all_entities += nr_entities
            only_single_line_entities += nr_entities

        for pair in multiline_entities:
            nr_entities = len(entity.xml_tree.xpath(".//" + pair[2]))
            logger.info("Number of identified %s elements is: %s (multi line)",
                        pair[2], nr_entities)
            all_entities += nr_entities

        logger.info("Number of all identified single line entities: %s",
                    only_single_line_entities)

        # The total counts elements per configured tag name, so it sums the
        # single line and the multiline counts.
        logger.info("Number of all identified entities is: %s"
                    " Also includes multiline matches. Number could be higher"
                    " than it is if multiline matches are matching the same"
                    " like the single line entitie regexes.", all_entities)

        entity.save_to_file(output_path, file_path, "clear_speech_markup",
                            "File paths", "clear_speech_markup")


# Run the speech markup only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    markup_speeches()