162 lines
7.5 KiB
Python
Executable File
162 lines
7.5 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
from markup.EntityMarkup import EntityMarkup
|
|
import re
|
|
import logging
|
|
|
|
|
|
class SpeakerMarkup(EntityMarkup):
    """
    Marks speakers in a text string with simple XML tags.

    Speakers are located with a regular expression (according to the
    original author, one of the expressions from the config file). The
    usual sequence is ``identify_speaker()`` followed by
    ``markup_speaker(case)``; the tagged text is then available as
    ``self.markuped_string``.

    NOTE(review): ``self.regex_compiled`` is read by this class but never
    assigned in it -- presumably the parent class or the caller compiles
    ``regex`` and stores it before ``identify_speaker()`` runs; confirm.
    """

    def __init__(self, string, regex):
        """
        Store the text to search and the source of the regular expression.

        :param string: Text in which speakers are searched and marked.
        :param regex: Regular expression as a string. Within this class it
            is only used for log output.
        """
        # NOTE(review): super(SpeakerMarkup) without ``self`` creates an
        # unbound super object, so this call most likely does not run the
        # parent initialiser. It is left untouched because the parent's
        # signature is not visible from this file -- confirm before
        # changing it to super().__init__(...).
        super(SpeakerMarkup).__init__()
        self.string_to_search = string
        self.regex_string = regex
        self.logger = logging.getLogger(__name__)

    def identify_speaker(self):
        """
        Collect all matches of the expression in the text.

        Stores the match objects as a list in ``self.matches`` and their
        number in ``self.matches_count``.
        """
        # finditer yields a one-shot iterator; a list is required because
        # the matches are counted and indexed later on.
        self.matches = list(re.finditer(self.regex_compiled,
                                        self.string_to_search))
        self.matches_count = len(self.matches)

    def markup_speaker(self, case="middle"):
        """
        Insert the simple speaker/speech markup for further processing.

        :param case: Selects what is marked:
            ``"first"`` opens the first speech at the first match,
            ``"middle"`` closes the running speech and opens a new one at
            every match, ``"last"`` closes the last speech at the end of
            the session. Any other value leaves the object untouched.

        The result is written to ``self.string_to_search`` and
        ``self.markuped_string``. Nothing is returned.
        """
        if case == "first":
            self._markup_first_speaker()
        elif case == "middle":
            self._markup_middle_speakers()
        elif case == "last":
            self._markup_session_end()

    def _log_match_count(self):
        """Log how many matches the current expression produced."""
        # The singular case has to be tested before the general one,
        # otherwise it can never be reached.
        if self.matches_count == 0:
            self.logger.warning("0 matches for given expression:%s",
                                self.regex_string)
        elif self.matches_count == 1:
            self.logger.info("%s match for given expression:%s",
                             self.matches_count, self.regex_string)
        elif self.matches_count > 1:
            self.logger.info("%s matches for given expression:%s",
                             self.matches_count, self.regex_string)

    def _markup_first_speaker(self):
        """Wrap the first match in ``<rede><redner>...</redner>``."""
        start_tags = "<rede><redner>"
        end_tags = "</redner>"

        if not self.matches:
            # Nothing to mark: report it and hand the text back unchanged
            # instead of failing with an IndexError.
            self.matches_count = 0
            self._log_match_count()
            self.markuped_string = self.string_to_search
            return

        # Only the first match is marked, so that is what gets reported.
        self.matches_count = 1
        self._log_match_count()

        first_match = self.matches[0]
        # Matches with more than ten words are taken to be running text
        # rather than a speaker and are left alone.
        if len(first_match.group().split()) <= 10:
            start_xml = start_tags + first_match.group() + end_tags
            # A callable replacement inserts the text literally. Passing
            # the string itself would make re.sub interpret backslashes in
            # the matched text as escapes or group references.
            self.string_to_search = self.regex_compiled.sub(
                lambda _match: start_xml, self.string_to_search, count=1)
        self.markuped_string = self.string_to_search

    def _markup_middle_speakers(self):
        """
        Close the running speech and open a new one at every match.

        Works on the string directly instead of using re.sub. The pieces
        are collected and joined once, so no index bookkeeping is needed
        while the text grows.
        """
        start_tags = "\n</rede><rede><redner>"
        end_tags = "</redner>"
        self._log_match_count()

        source = self.string_to_search
        pieces = []
        cursor = 0  # End of the previously handled match in ``source``.
        for match in self.matches:
            matched_text = match.group()
            lines = matched_text.split("\n")
            kept_before = 0  # Leading part of the match left as plain text.
            if len(lines) >= 2:
                # Handles cases where text before the actual speaker was
                # matched as well.
                first_line = lines[0]
                remainder = "".join(lines[1:])
                if len(first_line.split()) <= 10:
                    # Short first line: treated as part of the speaker,
                    # so the whole match is replaced by the merged lines.
                    speaker_text = first_line + remainder
                else:
                    # Long first line: ordinary text. It stays in front of
                    # the tags together with its line break.
                    speaker_text = remainder
                    kept_before = len(first_line) + 1
            else:
                speaker_text = matched_text

            insert_at = match.start() + kept_before
            pieces.append(source[cursor:insert_at])
            pieces.append(start_tags + speaker_text + end_tags)
            cursor = match.end()
        pieces.append(source[cursor:])

        self.string_to_search = "".join(pieces)
        self.markuped_string = self.string_to_search

    def _markup_session_end(self):
        """
        Add the closing ``</rede>`` of the last speech.

        The matches are expected to be the end of the session. The closing
        tag is put in front of the match and ``<sitzungsende/>`` behind it
        so that the resulting XML is well-formed.
        """
        end_tag = "</rede>"
        session_close_time_tag = "<sitzungsende/>"
        endings_found = len(self.matches)

        if endings_found == 1:
            self.logger.info("Last speech successfully tagged.")
            self._log_match_count()
            match = self.matches[0]
            # Matches with more than 15 words are not treated as the end
            # of the session and are left alone.
            if len(match.group().split()) <= 15:
                end_xml = end_tag + match.group() + session_close_time_tag
                # Callable replacement: see _markup_first_speaker.
                self.string_to_search = self.regex_compiled.sub(
                    lambda _match: end_xml, self.string_to_search, count=1)
            self.markuped_string = self.string_to_search

        elif endings_found == 0:
            self.logger.warning(("No end of session found! Last tag " + end_tag
                                 + " will be added to the end of the protocol."
                                 " This might add some unrelated text to the "
                                 "last speech."))
            self._log_match_count()
            self.markuped_string = self.string_to_search + end_tag

        else:
            self._log_match_count()
            self.logger.warning(("There are " + str(endings_found)
                                 + " session endings. Ignoring the endings"
                                 + " before the last final ending of the "
                                 + " session."))
            match = self.matches[-1]
            # Only the last line of the match is taken as the actual
            # ending; everything matched before it stays part of the
            # speech. Re-inserting the whole match here would duplicate
            # those earlier lines.
            last_line = match.group().split("\n")[-1]
            insert_at = match.end() - len(last_line)
            end_xml = end_tag + last_line + session_close_time_tag
            self.string_to_search = (self.string_to_search[:insert_at]
                                     + end_xml
                                     + self.string_to_search[match.end():])
            self.markuped_string = self.string_to_search
|