#!/usr/bin/env python
# -*- coding: utf-8 -*-

from markup.EntityMarkup import EntityMarkup
import re
import logging


class SpeakerMarkup(EntityMarkup):
    """
    Class for specific markup of different speakers identified by different
    regular expressions included in the config file.

    Attributes set by this class:
        string_to_search: the text that is searched and rewritten in place.
        regex_string: the regular expression source, used in log output.
        matches: list of match objects filled by ``identify_speaker``.
        matches_count: number of entries in ``matches``.
        markuped_string: result text written by ``markup_speaker``.

    ``self.regex_compiled`` is read but never assigned in this class; it is
    presumably provided by ``EntityMarkup`` -- verify against the parent.
    """

    def __init__(self, string, regex):
        """
        Stores the text to search and the regular expression string.

        Args:
            string: text in which speakers will be searched and marked.
            regex: regular expression source string (kept for logging).
        """
        # NOTE(review): ``super(SpeakerMarkup)`` without ``self`` builds an
        # unbound super object, so this call most likely does not run
        # ``EntityMarkup.__init__``. Left unchanged because the parent's
        # constructor signature is not visible from this file -- confirm
        # whether ``super().__init__(...)`` was intended.
        super(SpeakerMarkup).__init__()
        self.string_to_search = string
        self.regex_string = regex
        self.logger = logging.getLogger(__name__)

    def identify_speaker(self):
        """
        Gets match objects from the speakers in the given text node. Also
        calculates length of it and puts the matches in a list.

        Sets ``self.matches`` (list of match objects) and
        ``self.matches_count``.
        """
        self.matches = list(re.finditer(self.regex_compiled,
                                        self.string_to_search))
        self.matches_count = len(self.matches)

    def markup_speaker(self, case="middle"):
        """
        This is where the first simple markup happens. It uses the matches
        and replaces them with simple markup for further processing.

        Args:
            case: which markup step to run.
                "first"  -- wraps only the first match.
                "middle" -- wraps every match, working on string slices.
                "last"   -- handles the end-of-session match and closes the
                            last speech.
                Any other value leaves the object untouched.

        The result is written to ``self.markuped_string``; for "first" and
        "middle" (and "last" with at least one match) ``self.string_to_search``
        is updated as well.

        NOTE(review): every tag literal used by the helpers below is an empty
        string (or a bare newline) in this revision. It looks like the actual
        markup tags may be missing -- confirm against the intended output
        format.
        """
        if case == "first":
            self._markup_first()
        elif case == "middle":
            self._markup_middle()
        elif case == "last":
            self._markup_last()

    def _log_match_count(self):
        """Helper function for creating log file output."""
        count = self.matches_count
        if count == 0:
            self.logger.warning("0 matches for given expression:%s",
                                self.regex_string)
        elif count == 1:
            # Checked before the plural case; otherwise the singular wording
            # can never be reached.
            self.logger.info("%s match for given expression:%s",
                             count, self.regex_string)
        elif count > 1:
            self.logger.info("%s matches for given expression:%s",
                             count, self.regex_string)

    def _markup_first(self):
        """Wraps the first match (if it is at most 10 words) in tags."""
        start_tags = ""
        end_tags = ""

        if not self.matches:
            # Nothing to mark: report it and hand back the text unchanged
            # instead of failing on ``self.matches[0]``.
            self.matches_count = 0
            self._log_match_count()
            self.markuped_string = self.string_to_search
            return

        # Sets count to 1 because it only marks the first match.
        self.matches_count = 1
        self._log_match_count()

        matched_text = self.matches[0].group()
        if len(matched_text.split()) <= 10:
            start_xml = start_tags + matched_text + end_tags
            # A callable replacement inserts the text literally; a plain
            # string would have its backslashes and group references
            # interpreted by ``re``.
            self.string_to_search = self.regex_compiled.sub(
                lambda _match: start_xml, self.string_to_search, count=1)
        self.markuped_string = self.string_to_search

    def _markup_middle(self):
        """
        Wraps every match in tags by slicing the string directly.

        Does not use re.sub because it is faster to work on the string.
        Also it avoids looping two times to get the specific match.group()
        which caused some errors.
        """
        start_tags = "\n"
        end_tags = ""
        inserted_len = len(start_tags) + len(end_tags)
        # Match offsets refer to the original text; every insertion moves
        # the following offsets by the length of the inserted tags.
        index_shift = 0

        self._log_match_count()
        for match in self.matches:
            matched_text = match.group()
            index_start = match.start() + index_shift
            index_end = match.end() + index_shift

            # Handles cases where lots of text before the actual speaker is
            # matched.
            lines = matched_text.split("\n")
            if len(lines) >= 2:
                first_line = lines[0]
                remaining = "".join(lines[1:])
                if len(first_line.split()) <= 10:
                    speaker_text = first_line + remaining
                else:
                    speaker_text = remaining
                # NOTE(review): the replaced span is shortened by the number
                # of characters dropped from the match, so the leading
                # characters of the original match stay in front of the
                # start tag. Kept exactly as before -- confirm this is the
                # intended result when the first line is kept.
                index_start += len(matched_text) - len(speaker_text)
            else:
                speaker_text = matched_text

            self.string_to_search = (self.string_to_search[:index_start]
                                     + start_tags
                                     + speaker_text
                                     + end_tags
                                     + self.string_to_search[index_end:])
            index_shift += inserted_len
        self.markuped_string = self.string_to_search

    def _markup_last(self):
        """
        Matches the end of the session to add the last closing tag to the
        last speech for well-formed xml.

        One match: the match is wrapped if it is at most 15 words.
        No match: the end tag is appended to the end of the text.
        Several matches: only the last one is used.
        """
        end_tag = ""
        session_close_time_tag = ('')
        # Created end tags will be inserted into the protocol
        ending_count = len(self.matches)

        if ending_count == 1:
            self.logger.info("Last speech successfully tagged.")
            self._log_match_count()
            matched_text = self.matches[0].group()
            if len(matched_text.split()) <= 15:
                end_xml = end_tag + matched_text + session_close_time_tag
                # Callable replacement: insert the text literally, without
                # ``re`` interpreting backslashes in it.
                self.string_to_search = self.regex_compiled.sub(
                    lambda _match: end_xml, self.string_to_search, count=1)
            self.markuped_string = self.string_to_search
        elif ending_count == 0:
            self.logger.warning("No end of session found! Last tag "
                                "%s will be added to the end of the protocol."
                                " This might add some unrelated text to the "
                                "last speech.", end_tag)
            self._log_match_count()
            self.markuped_string = self.string_to_search + end_tag
        else:
            self._log_match_count()
            self.logger.warning("There are %s session endings. Ignoring the "
                                "endings before the last final ending of the "
                                " session.", ending_count)
            match = self.matches[-1]
            matched_text = match.group()
            end_xml = end_tag + matched_text + session_close_time_tag
            # Always takes the last line of a match avoiding lots of text
            # before the actual speaker.
            last_line = matched_text.split("\n")[-1]
            index_start = match.start() + len(matched_text) - len(last_line)
            index_end = match.end()
            # NOTE(review): only the last line of the match is cut out, but
            # the whole match is inserted again, so earlier lines of a
            # multi-line match end up twice in the text. Kept as before --
            # confirm whether ``last_line`` should be inserted instead.
            self.string_to_search = (self.string_to_search[:index_start]
                                     + end_xml
                                     + self.string_to_search[index_end:])
            self.markuped_string = self.string_to_search