bundesdata_markup_nlp_software/bundesdata_markup_nlp/markup/SpeakerMarkup.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from markup.EntityMarkup import EntityMarkup
import re
import logging


class SpeakerMarkup(EntityMarkup):
    """
    Class for specific markup of different speakers identified by different
    regular expressions included in the config file.

    Attributes set by this class:
        string_to_search: text that is searched and, for most cases,
            rewritten in place by markup_speaker().
        regex_string: the expression as a plain string (used for logging).
        logger: module logger.
        matches / matches_count: filled by identify_speaker().
        markuped_string: result text, filled by markup_speaker().

    ``self.regex_compiled`` is read but never assigned in this class; it
    has to be a compiled pattern (``.sub`` is called on it) and is
    presumably provided by the parent class or the caller -- verify.
    """

    def __init__(self, string, regex):
        """
        Stores the text to work on and the expression used to find speakers.

        Args:
            string: text of the node that will be searched.
            regex: the regular expression as a string.
        """
        # NOTE(review): one-argument super() yields an unbound super object,
        # so this line does not appear to run EntityMarkup.__init__. It is
        # left unchanged because the parent constructor's signature is not
        # visible here -- confirm before switching to super().__init__().
        super(SpeakerMarkup).__init__()
        self.string_to_search = string
        self.regex_string = regex
        self.logger = logging.getLogger(__name__)

    def identify_speaker(self):
        """
        Collects all matches of the compiled expression in the text.

        Stores the match objects as a list in ``self.matches`` and their
        number in ``self.matches_count``.
        """
        self.matches = list(re.finditer(self.regex_compiled,
                                        self.string_to_search))
        self.matches_count = len(self.matches)

    def markup_speaker(self, case="middle"):
        """
        This is where the first simple markup happens. It uses the matches
        found by identify_speaker() and wraps them in simple tags for
        further processing. The result is stored in ``self.markuped_string``.

        Args:
            case: which kind of match is marked up.
                "first"  -- opens the first speech:
                            ``<rede><redner>`` match ``</redner>``.
                "middle" -- closes the previous speech and opens a new one
                            for every match.
                "last"   -- closes the last speech at the end of the session
                            and adds ``<sitzungsende/>``.
                Any other value leaves the instance untouched.
        """
        if case == "first":
            self._markup_first_speaker()
        elif case == "middle":
            self._markup_middle_speakers()
        elif case == "last":
            self._markup_session_end()

    def _log_match_count(self):
        """Helper function for creating log file output."""
        count = self.matches_count
        if count == 0:
            self.logger.warning("0 matches for given expression:%s",
                                self.regex_string)
        elif count == 1:
            # Has to be tested before the plural branch, otherwise the
            # singular message can never be emitted.
            self.logger.info("%s match for given expression:%s",
                             count, self.regex_string)
        elif count > 1:
            self.logger.info("%s matches for given expression:%s",
                             count, self.regex_string)

    def _markup_first_speaker(self):
        """Marks only the first match as the opening speaker of a speech."""
        start_tags = "<rede><redner>"
        end_tags = "</redner>"

        if not self.matches:
            # Nothing to mark: report the real count and hand the text back
            # unchanged instead of failing on self.matches[0].
            self._log_match_count()
            self.markuped_string = self.string_to_search
            return

        # Sets count to 1 because only the first match is marked.
        self.matches_count = 1
        self._log_match_count()

        speaker_text = self.matches[0].group()
        # Matches longer than ten words are left unmarked.
        if len(speaker_text.split()) <= 10:
            start_xml = start_tags + speaker_text + end_tags
            # A callable replacement inserts the text literally. Passed as
            # a plain string, backslashes in the matched text would be
            # parsed by re.sub as escapes or group references.
            self.string_to_search = self.regex_compiled.sub(
                lambda _match: start_xml, self.string_to_search, count=1)
        self.markuped_string = self.string_to_search

    def _markup_middle_speakers(self):
        """
        Wraps every match in closing/opening speech tags and speaker tags.

        Works on the string directly instead of re.sub so that the text of
        each individual match is available while the result is built.
        """
        start_tags = "\n</rede><rede><redner>"
        end_tags = "</redner>"
        self._log_match_count()

        source = self.string_to_search
        pieces = []
        cursor = 0  # position in source up to which text was already copied
        for match in self.matches:
            matched_text = match.group()
            lines = matched_text.split("\n")
            if len(lines) >= 2:
                # Handles cases where lots of text before the actual speaker
                # is matched: line breaks are removed, and a first line of
                # more than ten words is dropped from the tagged text.
                remainder = "".join(lines[1:])
                first_line = lines[0]
                if len(first_line.split()) <= 10:
                    speaker_text = first_line + remainder
                else:
                    speaker_text = remainder
            else:
                speaker_text = matched_text

            # As many leading characters of the original match as were
            # dropped from the tagged text stay in front of the tags.
            # NOTE(review): this lines up cleanly e.g. for a match that
            # starts with its only line break, or a two-line match whose
            # first line is dropped; for other shapes the kept characters
            # are also part of the tagged text -- confirm against the
            # configured expressions.
            keep_until = match.start() + len(matched_text) - len(speaker_text)
            pieces.append(source[cursor:keep_until])
            pieces.append(start_tags)
            pieces.append(speaker_text)
            pieces.append(end_tags)
            cursor = match.end()
        pieces.append(source[cursor:])

        self.string_to_search = "".join(pieces)
        self.markuped_string = self.string_to_search

    def _markup_session_end(self):
        """
        Matches the end of the session to add the last closing <rede> tag
        to the last speech for well-formed xml.
        """
        end_tag = "</rede>"
        session_close_time_tag = '<sitzungsende/>'
        endings_found = len(self.matches)

        if endings_found == 1:
            self.logger.info("Last speech successfully tagged.")
            self._log_match_count()
            ending_text = self.matches[0].group()
            # NOTE(review): an ending longer than 15 words is not touched,
            # so no closing </rede> is inserted in that situation --
            # confirm that this is intended.
            if len(ending_text.split()) <= 15:
                end_xml = end_tag + ending_text + session_close_time_tag
                # Callable replacement: insert the text literally (see
                # _markup_first_speaker).
                self.string_to_search = self.regex_compiled.sub(
                    lambda _match: end_xml, self.string_to_search, count=1)
            self.markuped_string = self.string_to_search

        elif endings_found == 0:
            self.logger.warning("No end of session found! Last tag %s"
                                " will be added to the end of the protocol."
                                " This might add some unrelated text to the "
                                "last speech.", end_tag)
            self._log_match_count()
            self.markuped_string = self.string_to_search + end_tag

        else:
            self._log_match_count()
            self.logger.warning("There are %s"
                                " session endings. Ignoring the endings"
                                " before the last final ending of the "
                                " session.", endings_found)
            final_match = self.matches[-1]
            # Always takes the last line of a match, avoiding lots of text
            # before the actual session ending. Only that line is wrapped;
            # everything before it stays where it is in the protocol, so
            # it must not be inserted a second time.
            last_line = final_match.group().split("\n")[-1]
            index_end = final_match.end()
            index_start = index_end - len(last_line)
            end_xml = end_tag + last_line + session_close_time_tag
            self.string_to_search = (self.string_to_search[:index_start]
                                     + end_xml
                                     + self.string_to_search[index_end:])
            self.markuped_string = self.string_to_search