bundesdata_markup_nlp_software/bundesdata_markup_nlp/markup/speeches.py
2019-02-21 19:29:44 +01:00

77 lines
3.3 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from utility.FileGetter import FileGetter
from markup.EntityMarkup import EntityMarkup
import configparser
from tqdm import tqdm
import logging
def _log_entity_counts(logger, xml_tree, regex_conf_pairs,
                       multiline_entities):
    """
    Logs how many elements exist in xml_tree for every configured tag.

    The tag name is read from index 1 of each entry in regex_conf_pairs and
    from index 2 of each entry in multiline_entities. Elements are counted
    with the XPath ".//<tag>". The multiline counts are added on top of the
    single line counts, so an element whose tag appears in both
    configurations is counted twice in the overall total.
    """
    only_single_line_entities = 0
    for pair in regex_conf_pairs:
        nr_entities = len(xml_tree.xpath(".//" + pair[1]))
        logger.info("Number of identified %s elements is: %s (single line)",
                    pair[1], nr_entities)
        only_single_line_entities += nr_entities

    all_entities = only_single_line_entities
    for pair in multiline_entities:
        nr_entities = len(xml_tree.xpath(".//" + pair[2]))
        logger.info("Number of identified %s elements is: %s (multi line)",
                    pair[2], nr_entities)
        all_entities += nr_entities

    logger.info("Number of all identified single line entities: %s",
                only_single_line_entities)
    logger.info("Number of all identified entities is: %s"
                " Also includes multiline matches. Number could be higher"
                " than it is if multiline matches are matching the same"
                " like the single line entitie regexes.",
                all_entities)


def markup_speeches():
    """
    Marks up different entities in the speech strings. For example comments.

    Configuration is read from "config.ini" in the current working
    directory:
      - ["File paths"]["complex_markup"]: folder searched for "*.xml" files.
      - ["File paths"]["output_folder"]: passed on to save_to_file().
      - section "Regular expressions speeches": every value is split at
        " ; "; index 0 and 1 are passed to inject_element() and index 1 is
        additionally used as the element name when counting matches.
      - section "Multiline entities": every value is split at " ; "; index
        0, 1 and 2 are passed to get_multiline_entities() and index 2 is
        used as the element name when counting matches.

    For every input file, in sorted order:
      1. markup_speech_lines() is called for every <rede> element and for
         the first <sitzungsbeginn> element (this step marks the speech
         parts (<p>) line by line).
      2. Every single line pair is applied to every <p> element.
      3. Every multiline entry is applied to the freshly queried <p>
         elements.
      4. The number of resulting elements per configured tag is logged.
      5. The result is written with save_to_file().

    A file without a <sitzungsbeginn> element no longer aborts the whole
    run with an IndexError; a warning is logged and only the session start
    markup is skipped for that file.
    """
    logger = logging.getLogger(__name__)
    config = configparser.ConfigParser()
    config.read("config.ini")
    complex_xmls = config["File paths"]["complex_markup"]
    output_path = config["File paths"]["output_folder"]
    # config.items() yields (option name, value) tuples; only the value
    # carries the " ; " separated fields.
    regex_conf_pairs = [
        value.split(" ; ")
        for _, value in config.items("Regular expressions speeches")
    ]
    multiline_entities = [
        value.split(" ; ")
        for _, value in config.items("Multiline entities")
    ]
    file_list = FileGetter(complex_xmls, "*.xml").get_files()

    for file_path in tqdm(sorted(file_list), desc="File status speech markup"):
        entity = EntityMarkup(file_path)
        entity.read_xml(file_path)
        speeches = entity.xml_tree.xpath(".//rede")
        session_starts = entity.xml_tree.xpath(".//sitzungsbeginn")
        for speech in speeches:
            entity.markup_speech_lines(speech)
        if session_starts:
            entity.markup_speech_lines(session_starts[0])
        else:
            # Keep the batch running instead of failing on an empty result.
            logger.warning("No sitzungsbeginn element found in %s;"
                           " skipping session start markup.", file_path)

        session_lines = entity.xml_tree.xpath(".//p")
        for line in tqdm(session_lines, desc="Marking single line entities"):
            for pair in regex_conf_pairs:
                entity.inject_element(line, pair[0], pair[1])

        # Query again to get the new altered session lines (<p>).
        session_lines = entity.xml_tree.xpath(".//p")
        for pair in tqdm(multiline_entities,
                         desc="Marking multiline entities:"):
            entity.get_multiline_entities(session_lines, pair[0], pair[1],
                                          pair[2])

        _log_entity_counts(logger, entity.xml_tree, regex_conf_pairs,
                           multiline_entities)
        entity.save_to_file(output_path, file_path, "clear_speech_markup",
                            "File paths", "clear_speech_markup")
if __name__ == "__main__":
    # Run the speech markup only when this file is executed as a script.
    markup_speeches()