77 lines
3.3 KiB
Python
Executable File
77 lines
3.3 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
|
|
from utility.FileGetter import FileGetter
|
|
from markup.EntityMarkup import EntityMarkup
|
|
import configparser
|
|
from tqdm import tqdm
|
|
import logging
|
|
|
|
def _log_entity_counts(logger, xml_tree, element_names, kind):
    """
    Logs the number of elements found in xml_tree for every given name.

    element_names is an iterable of element names, each looked up with the
    XPath ".//<name>". kind is the label put in parentheses at the end of
    every log line (e.g. "single line"). Returns the sum of all counts.
    """
    total = 0
    for element_name in element_names:
        nr_entities = len(xml_tree.xpath(".//" + element_name))
        logger.info("Number of identified %s elements is: %s (%s)",
                    element_name, nr_entities, kind)
        total += nr_entities
    return total


def markup_speeches():
    """
    Marks up different entities in the speech strings. For example comments.

    Settings are read from "config.ini":
    - "File paths": "complex_markup" is the input location handed to
      FileGetter, "output_folder" is handed to save_to_file.
    - "Regular expressions speeches": every value is split on " ; "; the
      first two parts are passed to EntityMarkup.inject_element for every
      <p> element and the second part is also the element name counted for
      the log output.
    - "Multiline entities": every value is split on " ; "; the first three
      parts are passed to EntityMarkup.get_multiline_entities and the third
      part is the element name counted for the log output.

    For every XML file (processed in sorted order) the speech parts (<p>)
    of all <rede> elements and of the first <sitzungsbeginn> element are
    marked line by line first, then the single line and multiline entities
    are marked, the counts are logged and the result is saved.

    Raises KeyError or configparser.NoSectionError if a required config
    entry is missing (config.read silently ignores a missing file).
    """
    logger = logging.getLogger(__name__)
    config = configparser.ConfigParser()
    config.read("config.ini")
    complex_xmls = config["File paths"]["complex_markup"]
    output_path = config["File paths"]["output_folder"]
    regex_conf_pairs = [
        value.split(" ; ")
        for _, value in config.items("Regular expressions speeches")
    ]
    multiline_entities = [
        value.split(" ; ")
        for _, value in config.items("Multiline entities")
    ]
    file_list = FileGetter(complex_xmls, "*.xml").get_files()

    for file_path in tqdm(sorted(file_list), desc="File status speech markup"):
        entity = EntityMarkup(file_path)
        entity.read_xml(file_path)
        speeches = entity.xml_tree.xpath(".//rede")
        session_starts = entity.xml_tree.xpath(".//sitzungsbeginn")
        for speech in speeches:
            entity.markup_speech_lines(speech)
        if session_starts:
            entity.markup_speech_lines(session_starts[0])
        else:
            # A file without a session start must not abort the whole batch
            # with a bare IndexError; report it and continue with the file.
            logger.warning("No sitzungsbeginn element found in %s."
                           " Skipping markup of the session start.",
                           file_path)

        session_lines = entity.xml_tree.xpath(".//p")
        for line in tqdm(session_lines, desc="Marking single line entities"):
            for pair in regex_conf_pairs:
                entity.inject_element(line, pair[0], pair[1])

        # Query again to get the new altered session lines (<p>).
        session_lines = entity.xml_tree.xpath(".//p")
        for pair in tqdm(multiline_entities,
                         desc="Marking multiline entities:"):
            entity.get_multiline_entities(session_lines,
                                          pair[0], pair[1], pair[2])

        # For logging
        only_single_line_entities = _log_entity_counts(
            logger, entity.xml_tree,
            [pair[1] for pair in regex_conf_pairs], "single line")
        all_entities = only_single_line_entities + _log_entity_counts(
            logger, entity.xml_tree,
            [pair[2] for pair in multiline_entities], "multi line")

        logger.info("Number of all identified single line entities: %s",
                    only_single_line_entities)
        logger.info("Number of all identified entities is: %s"
                    " Also includes multiline matches. Number could be higher"
                    " than it is if multiline matches are matching the same"
                    " like the single line entitie regexes.",
                    all_entities)

        entity.save_to_file(output_path, file_path, "clear_speech_markup",
                            "File paths", "clear_speech_markup")
|
|
|
|
|
|
# Script entry point: run the speech markup step when executed directly.
if __name__ == "__main__":
    markup_speeches()
|