162 lines
		
	
	
		
			7.5 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			162 lines
		
	
	
		
			7.5 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env python
 | 
						|
# -*- coding: utf-8 -*-
 | 
						|
from markup.EntityMarkup import EntityMarkup
 | 
						|
import re
 | 
						|
import logging
 | 
						|
 | 
						|
 | 
						|
class SpeakerMarkup(EntityMarkup):
    """
    Class for specific markup of different speakers identified by different
    regular expressions included in the config file.

    Workflow visible in this class: identify_speaker() collects the regex
    matches in the text, markup_speaker() wraps them in simple XML tags and
    stores the result in self.markuped_string.

    NOTE(review): self.regex_compiled is read but never assigned in this
    class; presumably EntityMarkup or the caller provides it -- verify.
    """

    def __init__(self, string, regex):
        """
        Stores the text to work on and the regular expression string.

        Args:
            string: Text in which the speakers are searched and marked up.
            regex: Regular expression as a string. In this class it is only
                used for log output.
        """
        # NOTE(review): the one-argument form of super() creates an unbound
        # super object, so EntityMarkup.__init__ does not appear to be run
        # by this call. Left unchanged because the signature of the parent
        # constructor is not visible from this file -- confirm before
        # switching to super().__init__(...).
        super(SpeakerMarkup).__init__()
        self.string_to_search = string
        self.regex_string = regex
        self.logger = logging.getLogger(__name__)

    def identify_speaker(self):
        """
        Gets the match objects of the speakers in the given text node.

        Stores the matches as a list in self.matches and their number in
        self.matches_count.
        """
        self.matches = list(re.finditer(self.regex_compiled,
                                        self.string_to_search))
        self.matches_count = len(self.matches)

    def markup_speaker(self, case="middle"):
        """
        This is where the first simple markup happens. It uses the matches
        found by identify_speaker() and replaces them with simple markup for
        further processing. The result is stored in self.markuped_string
        (and, except when no session end is found, in self.string_to_search).

        Args:
            case: Selects the kind of markup.
                "first" opens the first speech: only the first match is
                tagged with <rede><redner>...</redner>.
                "middle" closes the previous speech and opens a new one at
                every match.
                "last" closes the last speech at the end of the session.
                Any other value leaves the instance untouched.
        """
        if case == "first":
            self._markup_first()
        elif case == "middle":
            self._markup_middle()
        elif case == "last":
            self._markup_last()

    def _log_match_count(self):
        """Helper function for creating log file output."""
        count = self.matches_count
        if count == 0:
            self.logger.warning("0 matches for given expression:%s",
                                self.regex_string)
        elif count == 1:
            # Has to be tested before the plural branch, otherwise the
            # singular wording can never be reached.
            self.logger.info("%s match for given expression:%s",
                             count, self.regex_string)
        elif count > 1:
            self.logger.info("%s matches for given expression:%s",
                             count, self.regex_string)

    def _markup_first(self):
        """Tags the first match as the beginning of the first speech."""
        start_tags = "<rede><redner>"
        end_tags = "</redner>"

        if not self.matches:
            # Nothing to tag: report it and hand the text back unchanged
            # instead of failing on self.matches[0].
            self._log_match_count()
            self.markuped_string = self.string_to_search
            return

        # Sets count to 1 because it only marks the first match.
        self.matches_count = 1
        self._log_match_count()

        speaker = self.matches[0].group()
        # Matches with more than 10 words are not tagged.
        if len(speaker.split()) <= 10:
            start_xml = start_tags + speaker + end_tags
            # A callable replacement inserts the text literally. A plain
            # string would be treated as a template, so backslashes or group
            # references inside the matched text would be interpreted.
            self.string_to_search = self.regex_compiled.sub(
                lambda _match: start_xml, self.string_to_search, count=1)
        self.markuped_string = self.string_to_search

    def _markup_middle(self):
        """
        Closes the running speech and opens a new one at every match.

        Does not use re.sub but assembles the new text from slices of the
        old one, so each match object is only looked at once.
        """
        start_tags = "\n</rede><rede><redner>"
        end_tags = "</redner>"
        self._log_match_count()

        text = self.string_to_search
        pieces = []
        cursor = 0  # End of the part of text that is already copied.
        for match in self.matches:
            matched_text = match.group()
            first_line, line_break, remainder = matched_text.partition("\n")
            speaker = matched_text
            skipped = 0  # Characters of the match that stay untagged.
            if line_break:
                # Handles cases where lots of text before the actual speaker
                # is matched. Line breaks inside the speaker are dropped.
                remainder = remainder.replace("\n", "")
                if len(first_line.split()) <= 10:
                    speaker = first_line + remainder
                else:
                    # A long first line is not part of the speaker. It stays
                    # in front of the tags together with its line break.
                    speaker = remainder
                    skipped = len(first_line) + len(line_break)

            tag_from = match.start() + skipped
            pieces.append(text[cursor:tag_from])
            pieces.append(start_tags + speaker + end_tags)
            cursor = match.end()
        pieces.append(text[cursor:])

        self.string_to_search = "".join(pieces)
        self.markuped_string = self.string_to_search

    def _markup_last(self):
        """
        Matches the end of the session to add the last closing </rede> tag
        to the last speech for well-formed xml.
        """
        end_tag = "</rede>"
        session_close_time_tag = "<sitzungsende/>"
        endings_found = len(self.matches)

        # Created end tags will be inserted into the protocol
        if endings_found == 1:
            self.logger.info("Last speech successfully tagged.")
            self._log_match_count()
            ending = self.matches[0].group()
            # Matches with more than 15 words are not tagged.
            if len(ending.split()) <= 15:
                end_xml = end_tag + ending + session_close_time_tag
                # Callable replacement: the matched text must be inserted
                # literally, not interpreted as a replacement template.
                self.string_to_search = self.regex_compiled.sub(
                    lambda _match: end_xml, self.string_to_search, count=1)
            self.markuped_string = self.string_to_search

        elif endings_found == 0:
            self.logger.warning(("No end of session found! Last tag " + end_tag
                                 + " will be added to the end of the protocol."
                                 " This might add some unrelated text to the "
                                 "last speech."))
            self._log_match_count()
            self.markuped_string = self.string_to_search + end_tag

        else:
            self._log_match_count()
            self.logger.warning(("There are " + str(endings_found)
                                 + " session endings. Ignoring the endings"
                                 + " before the last final ending of the "
                                 + " session."))
            match = self.matches[-1]
            # Always takes the last line of a match avoiding lots of text
            # before the actual end of the session. Everything of the match
            # before that line stays where it is, so only the last line is
            # put between the tags.
            last_line = match.group().split("\n")[-1]
            line_start = match.end() - len(last_line)
            text = self.string_to_search
            self.string_to_search = (text[:line_start]
                                     + end_tag
                                     + last_line
                                     + session_close_time_tag
                                     + text[match.end():])
            self.markuped_string = self.string_to_search
 |