bundesdata_markup_nlp_software/bundesdata_markup_nlp/markup/SpeakerMarkup.py
2019-02-21 19:29:44 +01:00

162 lines
7.5 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from markup.EntityMarkup import EntityMarkup
import re
import logging
class SpeakerMarkup(EntityMarkup):
    """
    Class for specific markup of different speakers identified by different
    regular expressions included in the config file.

    Typical use, as far as this class shows: create an instance with the
    text and the regex source string, call identify_speaker() to collect the
    matches and then markup_speaker() to wrap them in simple XML tags. The
    result is stored in self.markuped_string (and self.string_to_search).

    The compiled pattern is read from self.regex_compiled, which is not set
    anywhere in this class.
    NOTE(review): presumably set by a method inherited from EntityMarkup
    before identify_speaker() is called -- confirm against the parent class.
    """

    def __init__(self, string, regex):
        """
        Stores the text to search and the regex source string.

        string -- text in which speakers will be searched and marked.
        regex -- regex source as a string; only used for log messages here.
        """
        # NOTE(review): super(SpeakerMarkup) without an instance creates an
        # unbound super object, so this call does NOT run
        # EntityMarkup.__init__. It is left unchanged on purpose because the
        # signature of the parent initialiser is not visible from this file.
        super(SpeakerMarkup).__init__()
        self.string_to_search = string
        self.regex_string = regex
        self.logger = logging.getLogger(__name__)

    def identify_speaker(self):
        """
        Gets match objects from the speakers in the given text node. Also
        calculates length of it and puts the matches in a list.

        Sets self.matches (list of match objects) and self.matches_count.
        """
        self.matches = list(re.finditer(self.regex_compiled,
                                        self.string_to_search))
        self.matches_count = len(self.matches)

    def markup_speaker(self, case="middle"):
        """
        This is where the first simple markup happens. It uses the matches
        and replaces them with simple markup for further processing.

        case -- selects the kind of markup:
            "first"  opens the first speech around the first match,
            "middle" closes the previous and opens a new speech around
                     every match,
            "last"   closes the last speech at the end of the session.
        Any other value leaves the text untouched.

        identify_speaker() has to be called before, because this method
        works on self.matches and self.matches_count.
        """
        if(case == "first"):
            self._markup_first()
        elif(case == "middle"):
            self._markup_middle()
        elif(case == "last"):
            self._markup_last()

    def _log_match_count(self):
        """Helper function for creating log file output."""
        count = self.matches_count
        if(count == 0):
            self.logger.warning("0 matches for given expression:%s",
                                self.regex_string)
        elif(count == 1):
            # Has to be checked before the plural case, otherwise the
            # singular message can never be reached.
            self.logger.info("%s match for given expression:%s",
                             count, self.regex_string)
        elif(count > 1):
            self.logger.info("%s matches for given expression:%s",
                             count, self.regex_string)

    def _markup_first(self):
        """Opens the first speech by tagging only the first match."""
        start_tags = "<rede><redner>"
        end_tags = "</redner>"
        if(not self.matches):
            # Nothing to mark: report it and hand back the text unchanged
            # instead of failing on self.matches[0].
            self.matches_count = 0
            self._log_match_count()
            self.markuped_string = self.string_to_search
            return
        # Sets count to 1 because it only marks the first match.
        self.matches_count = 1
        self._log_match_count()
        first_text = self.matches[0].group()
        # Very long matches are most likely no real speaker and are skipped.
        if(len(first_text.split()) <= 10):
            start_xml = start_tags + first_text + end_tags
            # A callable replacement inserts the text literally. A plain
            # string would be treated as a template, so backslashes inside
            # the matched text would be interpreted or raise re.error.
            self.string_to_search = self.regex_compiled.sub(
                lambda _match: start_xml,
                self.string_to_search,
                count=1)
        self.markuped_string = self.string_to_search

    def _markup_middle(self):
        """Closes the previous speech and opens a new one at every match."""
        start_tags = "\n</rede><rede><redner>"
        end_tags = "</redner>"
        self._log_match_count()
        source = self.string_to_search
        pieces = []
        cursor = 0  # end of the previously handled match in source
        for match in self.matches:
            speaker_text = match.group()
            keep_from = match.start()
            lines = speaker_text.split("\n")
            # Handles cases where lots of text before the actual speaker is
            # matched.
            if(len(lines) >= 2):
                first_line = lines[0]
                remainder = "".join(lines[1:])
                if(len(first_line.split()) <= 10):
                    trimmed = first_line + remainder
                else:
                    trimmed = remainder
                # As many characters as were dropped from the match stay in
                # front of the tags, so the overall text length only grows
                # by the length of the tags.
                # NOTE(review): this keeps exactly the dropped part only if
                # the match has a single line break (e.g. a leading "\n");
                # behaviour is kept as it was -- verify against the regexes.
                keep_from += len(speaker_text) - len(trimmed)
                speaker_text = trimmed
            pieces.append(source[cursor:keep_from])
            pieces.append(start_tags + speaker_text + end_tags)
            cursor = match.end()
        pieces.append(source[cursor:])
        # Joining once avoids rebuilding the whole text for every match.
        self.string_to_search = "".join(pieces)
        self.markuped_string = self.string_to_search

    def _markup_last(self):
        """
        Matches the end of the session to add the last closing <rede> tag
        to the last speech for well-formed xml.
        """
        end_tag = "</rede>"
        session_close_time_tag = "<sitzungsende/>"
        endings_found = len(self.matches)
        # Created end tags will be inserted into the protocol.
        if(endings_found == 1):
            self.logger.info("Last speech successfully tagged.")
            self._log_match_count()
            ending_text = self.matches[0].group()
            # Very long matches are most likely no real session ending.
            if(len(ending_text.split()) <= 15):
                end_xml = end_tag + ending_text + session_close_time_tag
                # Callable replacement: see _markup_first.
                self.string_to_search = self.regex_compiled.sub(
                    lambda _match: end_xml,
                    self.string_to_search,
                    count=1)
            self.markuped_string = self.string_to_search
        elif(endings_found == 0):
            self.logger.warning("No end of session found! Last tag %s"
                                " will be added to the end of the protocol."
                                " This might add some unrelated text to the "
                                "last speech.", end_tag)
            self._log_match_count()
            # Only the result gets the tag; string_to_search stays as it is.
            self.markuped_string = self.string_to_search + end_tag
        else:
            self._log_match_count()
            self.logger.warning("There are %s session endings. Ignoring the"
                                " endings before the last final ending of"
                                " the  session.", endings_found)
            match = self.matches[-1]
            # Always takes the last line of a match avoiding lots of text
            # before the actual session ending.
            last_line = match.group().split("\n")[-1]
            index_start = match.end() - len(last_line)
            # Only the last line is wrapped. The earlier lines of the match
            # stay in place in front of the tag, so wrapping the whole match
            # here would duplicate them.
            end_xml = end_tag + last_line + session_close_time_tag
            self.string_to_search = (self.string_to_search[:index_start]
                                     + end_xml
                                     + self.string_to_search[match.end():])
            self.markuped_string = self.string_to_search