Initial commit
Commit 4263e5f41e
.gitignore (vendored, executable file)
@@ -0,0 +1,2 @@
data/*
.idea/*
README.md (executable file)
@@ -0,0 +1,72 @@
# Master_thesis

Master Thesis Repository.

## Required packages and languages

- Python 3.7+
- Python packages are installed via requirements.txt. See Installation, step 7.

## Installation

1. Make sure the package `python3.7-dev` is installed. If not: `sudo apt-get install python3.7-dev`
2. Install _virtualenv_ via `pip install virtualenv`, or use your distribution's package manager.
3. Install JS Beautifier system-wide with `sudo npm -g install js-beautify`. (Optional! If not wanted, this step can be skipped. The markup step that needs this package can then be skipped as well; however, there will be no nicely formatted XML files.)
4. Create a virtual environment for the project with `virtualenv --python=python3.7 path/to/folder`
5. Activate the virtual environment with `source path/to/folder/bin/activate`
6. `cd path/to/repository`
7. Install the dependencies with `pip install -r requirements.txt`.

## Example script invocations:

### @Home

- `source ~/VirtualEnvs/bundesdata/bin/activate`
- `cd ~/Documents/Eigene\ geschriebene\ Programme/master_thesis/bundesdata/`

#### Development Data

**Metadata**

- `python markup/metastructure.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data/development_data_xml -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data`

**Speakers**

- `python markup/speakers.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data/xml_new_metadata_structure -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data`

#### Full data

**Metadata**

- `python markup/metastructure.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/protocols_raw_xml -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data`

**Speakers**

- `python markup/speakers.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/xml_new_metadata_structure -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data`

### @Uni

#### Development Data

- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
- `cd /home/stephan/Repos/master_thesis/bundesdata`

**Speakers**

- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/working_data/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data`

**Metadata**

- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/working_data/development_data_xml -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data`

#### Test Data

- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
- `cd /home/stephan/Repos/master_thesis/bundesdata`

**Speakers**

- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/working_data/test/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data/test`

**Metadata**

- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/working_data/test_data_xml -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data/test`

#### Full data

- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
- `cd /home/stephan/Repos/master_thesis/bundesdata`

**Speakers**

- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data`

**Metadata**

- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/protocols_raw_xml -f *.xml -o /home/stephan/Repos/master_thesis/data`
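The two markup steps can also be chained from a small driver script. A minimal sketch (not part of the commit), reusing the example paths from above; adjust them to your checkout:

```python
# Sketch: run metastructure.py and then speakers.py over the development data,
# using the documented -p / -f / -o flags. Paths are the examples from above.
import subprocess

DATA = "/home/stephan/Repos/master_thesis/data/working_data"

for script, subdir in [("markup/metastructure.py", "development_data_xml"),
                       ("markup/speakers.py", "xml_new_metadata_structure")]:
    subprocess.run(["python", script,
                    "-p", f"{DATA}/{subdir}",
                    "-f", "*.xml",
                    "-o", DATA],
                   check=True)
```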
bundesdata_markup_nlp/__init__.py (executable file, empty)

bundesdata_markup_nlp/bundesdata_markup.py (executable file)
@@ -0,0 +1,214 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from markup import metadata, speakers, speaker_names, speeches
from utility import update_config
from markup import beautify_markup
from utility import delete_folder
import argparse
import time
import configparser
from datetime import datetime
import logging
import os

"""
This is the main script handling the automatic markup of the protocols. Needs
some user input specified in parse_arguments().
"""


def parse_arguments():
    """
    Argument parser.
    """
    parser = argparse.ArgumentParser(description="Starts the markup process of \
                                     the XML protocols. Uses either the input \
                                     and output paths currently specified in \
                                     the config file or the paths set when \
                                     calling the script from the terminal with \
                                     the flag argument '-sp' or '--set_paths'. \
                                     Using this parameter writes the given \
                                     paths into the config file. \
                                     Some steps of the markup process can be \
                                     skipped with the corresponding parameters \
                                     if they have already been executed once \
                                     while using the -kt option.")
    parser.add_argument("-sp",
                        "--set_paths",
                        nargs=2,
                        help="User can set the input and output paths for the \
                        files created during the markup. The paths will be \
                        written to the config file.",
                        required=False,
                        type=str,
                        metavar=("input_path", "output_path"))
    parser.add_argument("-sm",
                        "--skip_metadata",
                        help="Skips the script creating metadata and the first \
                        XML structure.",
                        action="store_true",
                        required=False)
    parser.add_argument("-ss",
                        "--skip_simple_speakers",
                        help="Skips the script creating the first simple \
                        speaker markup.",
                        action="store_true",
                        required=False)
    parser.add_argument("-sn",
                        "--skip_name_markup",
                        help="Skips the script creating the name markup.",
                        action="store_true",
                        required=False)
    parser.add_argument("-ssp",
                        "--skip_speeches",
                        help="Skips the script creating markup inside of \
                        speeches.",
                        action="store_true",
                        required=False)
    parser.add_argument("-sb",
                        "--skip_beautify_xml",
                        help="Skips the script creating beautiful XML files.",
                        action="store_true",
                        required=False)
    parser.add_argument("-kt",
                        "--keep_tmp_files",
                        help="Keeps all temporary XML files being created \
                        during the entire markup process. Using this flag is \
                        needed when skipping steps of the entire markup during \
                        a rerun of the script. \
                        If this is not set, temporary files will always be \
                        deleted.",
                        action="store_true",
                        required=False)
    parser.add_argument("-fr",
                        "--fresh_run",
                        help="Deletes all temporary folders in the output \
                        folder and also deletes all paths saved in the config \
                        file before starting the markup process. The user has \
                        to set the paths again with -sp.",
                        action="store_true",
                        required=False)
    parser.add_argument("-la",
                        "--log_all",
                        help="If set, the program will log all information \
                        about the markup process (statistics etc.). Otherwise \
                        it only logs errors and warnings.",
                        action="store_true",
                        required=False)
    args = parser.parse_args()
    return args


def main():
    """
    Main function calling all other scripts for the automatic markup of the
    protocols.
    """
    args = parse_arguments()
    if(args.log_all is True):
        level = logging.INFO
    elif(args.log_all is False):
        level = logging.WARNING
    logging.basicConfig(filename="logs/bundesdata.log", level=level,
                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
                        datefmt='%Y/%m/%d %H:%M:%S',
                        filemode="w")
    logger = logging.getLogger(__name__)
    start_time = datetime.now()
    print("Start time of script is:", start_time)
    print("Info and status about the markup process can be found in:",
          "logs/bundesdata.log")
    logger.info("Start time of script is: " + str(start_time))

    # Deletes the output folder and all folders inside it.
    # Also removes all path options from the section "File paths".
    if(args.fresh_run is True):
        config = configparser.ConfigParser()
        config.read("config.ini")
        options = config.items("File paths")
        for option in options:
            if(option[0] == "output_folder"):
                try:
                    delete_folder.delete_folder(option[1])
                except FileNotFoundError:
                    pass
            else:
                config.remove_option("File paths", option[0])
        with open("config.ini", 'w') as out:
            config.write(out)

    # Sets paths and creates the output folder.
    if(args.set_paths):
        input_path = args.set_paths[0]
        output_path = os.path.join(args.set_paths[1], "output")
        if not os.path.exists(output_path):
            os.mkdir(output_path)
        config = configparser.ConfigParser()
        config.read("config.ini")
        update_config.update_config("config.ini", "File paths",
                                    "input_folder_xmls", input_path)
        update_config.update_config("config.ini", "File paths",
                                    "output_folder", output_path)

    if(args.skip_metadata is not True):
        print("Starting metadata extraction and markup.")
        metadata.get_metadata()
        print("Metadata creation and content splits finished.")
    elif(args.skip_metadata is True):
        print("Skipping script metadata.py.")

    time.sleep(1)
    if(args.skip_simple_speakers is not True):
        print("Starting first simple speeches and speaker markup.")
        speakers.get_speakers()
        print("Finished simple markup.")
    elif(args.skip_simple_speakers is True):
        print("Skipping script speakers.py.")

    time.sleep(1)
    if(args.skip_name_markup is not True):
        print("Starting complex markup of speaker names.")
        speaker_names.get_names()
        print("Finished complex name markup. (names etc.)")
    elif(args.skip_name_markup is True):
        print("Skipping script speaker_names.py.")

    time.sleep(1)
    if(args.skip_speeches is not True):
        print("Starting markup of comments etc. in speeches.")
        speeches.markup_speeches()
        print("Finished markup of comments etc. in speeches.")
    elif(args.skip_speeches is True):
        print("Skipping script speeches.py.")

    time.sleep(1)
    if(args.skip_beautify_xml is not True):
        print("Starting to prettify the XMLs.")
        beautify_markup.beautify_xml("markup")
        print("Prettified the XMLs.")
    elif(args.skip_beautify_xml is True):
        print("Skipping script beautify_markup.py.")

    if(args.keep_tmp_files is not True):
        config = configparser.ConfigParser()
        config.read("config.ini")
        folder_paths = []
        folder_paths.append(config["File paths"]["new_metadata"])
        folder_paths.append(config["File paths"]["new_simple_markup"])
        folder_paths.append(config["File paths"]["complex_markup"])
        folder_paths.append(config["File paths"]["clear_speech_markup"])
        for folder_path in folder_paths:
            delete_folder.delete_folder(folder_path)

    end_time = datetime.now()
    print("End time of script is:", str(end_time))
    logger.info("End time of script is: " + str(end_time))
    duration = end_time - start_time
    print("Duration of script is:", duration)
    logger.info("Script duration is: " + str(duration))


if __name__ == '__main__':
    main()
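A usage sketch for the flags defined above (not part of the commit): a rerun can only skip already finished steps if an earlier run kept its temporary files via -kt.

```python
# Sketch of the rerun pattern described in the help texts above.
# First run: set paths (example placeholders), keep temporary files, log all.
# Second run: skip the metadata step and reuse the kept temporary files.
import subprocess

subprocess.run(["python", "bundesdata_markup.py",
                "-sp", "path/to/input_xmls", "path/to/workdir",
                "-kt", "-la"], check=True)
subprocess.run(["python", "bundesdata_markup.py",
                "-sm", "-kt"], check=True)
```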
bundesdata_markup_nlp/bundesdata_nlp.py (executable file)
@@ -0,0 +1,178 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import configparser
import os
import logging
from utility.FileGetter import FileGetter
from utility import update_config
from utility import delete_folder
from markup import beautify_markup
from nlp import tokenize, lemmatization, n_grams
from datetime import datetime

"""
This script handles the tokenization, lemmatization and n-gram calculation of
the input protocols. Needs some user input specified in parse_arguments().
"""


def parse_arguments():
    """
    Argument parser.
    """
    parser = argparse.ArgumentParser(description="Starts the nlp analysis of \
                                     the newly created XML-protocols")
    parser.add_argument("-sp",
                        "--set_paths",
                        nargs=2,
                        help="User can set the input and output paths for the \
                        files created during the nlp process. The paths will be \
                        written to the config file.",
                        required=False,
                        type=str,
                        metavar=("input_path", "output_path"))
    parser.add_argument("-fr",
                        "--fresh_run",
                        help="Deletes all temporary folders and output folders \
                        created during a previous nlp run before this one \
                        starts.",
                        action="store_true",
                        required=False)
    parser.add_argument("-sb",
                        "--skip_beautify_xml",
                        help="Skips the script creating beautiful XML files.",
                        action="store_true",
                        required=False)
    parser.add_argument("-ns",
                        "--no_stop_words",
                        help="If this is used, the lemmatization or tokenization \
                        of the input protocols will exclude stop words.",
                        required=False,
                        action="store_true")
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument("-lm",
                       "--lemmatize",
                       help="Lemmatizes the XML protocols in the input directory \
                       and saves them into the output directory.",
                       action="store_true",
                       required=False)
    group.add_argument("-tn",
                       "--tokenize",
                       help="Tokenizes the XML protocols in the input directory \
                       and saves them into the output directory.",
                       action="store_true",
                       required=False)
    group.add_argument("-cn",
                       "--calculate_n_grams",
                       nargs=2,
                       help="Calculates n-grams for any tokenized or lemmatized \
                       XML protocol created by this script. \
                       feature_to_group_n_grams_by can be set to the following: \
                       'year', 'month_year', 'speaker' or 'speech'.",
                       required=False,
                       type=str,
                       metavar=("feature_to_group_n_grams_by", "input_type_name"))
    args = parser.parse_args()
    return args


def main():
    # logging and start time
    logging.basicConfig(filename="logs/bundesdata_nlp.log", level=logging.INFO,
                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
                        datefmt='%Y/%m/%d %H:%M:%S',
                        filemode="w")
    logger = logging.getLogger(__name__)
    start_time = datetime.now()
    print("Start time of script is:", start_time)
    print("Info and status about the nlp process can be found in:",
          "logs/bundesdata_nlp.log")
    logger.info("Start time of script is: " + str(start_time))
    # get arguments
    args = parse_arguments()
    # reads config
    config = configparser.ConfigParser()
    config.read("config.ini")
    # if fresh_run is true the directory nlp_output will be deleted
    if(args.fresh_run is True):
        config = configparser.ConfigParser()
        config.read("config.ini")
        options = config.items("File paths")
        for option in options:
            if(option[0] == "nlp_output"):
                try:
                    delete_folder.delete_folder(option[1])
                except FileNotFoundError:
                    pass
            else:
                config.remove_option("File paths", option[0])
        with open("config.ini", 'w') as out:
            config.write(out)

    # creates the output folder if it does not exist and writes its path to the config
    if(args.set_paths):
        output_path = os.path.join(args.set_paths[1], "nlp_output")
        if not os.path.exists(output_path):
            os.mkdir(output_path)
        update_config.update_config("config.ini", "File paths",
                                    "nlp_output", output_path)
    else:
        output_path = config["File paths"]["nlp_output"]
        if not os.path.exists(output_path):
            os.mkdir(output_path)
        update_config.update_config("config.ini", "File paths",
                                    "nlp_output", output_path)
    # gets the file path list of input files and writes the input folder path to the config
    if(args.set_paths):
        input_path = args.set_paths[0]
        update_config.update_config("config.ini", "File paths",
                                    "nlp_input", input_path)
    elif(args.calculate_n_grams):
        input_path = config["File paths"]["nlp_beuatiful_xml"]
    else:
        input_path = config["File paths"]["nlp_input"]
    files = FileGetter(input_path, "*.xml")
    files = files.get_files()
    # if statements deciding which script will be executed
    if(args.lemmatize is True and args.no_stop_words is True):
        print("Starting lemmatization excluding stop words.")
        lemmatization.lemmatization(files, True)
        print("Finished lemmatization excluding stop words.")
    elif(args.lemmatize is True and args.no_stop_words is False):
        print("Starting lemmatization including stop words.")
        lemmatization.lemmatization(files)
        print("Finished lemmatization including stop words.")

    if(args.tokenize is True and args.no_stop_words is True):
        print("Starting tokenization excluding stop words.")
        tokenize.tokenize(files, True)
        print("Finished tokenization excluding stop words.")
    elif(args.tokenize is True and args.no_stop_words is False):
        print("Starting tokenization including stop words.")
        tokenize.tokenize(files)
        print("Finished tokenization including stop words.")

    if(args.calculate_n_grams):
        print("Starting calculation of n-grams for input files.")
        n_grams.n_grams(files, args.calculate_n_grams[0], args.calculate_n_grams[1])
        print("Finished calculation of n-grams for input files.")

    if(args.skip_beautify_xml is not True
       and (args.lemmatize is True or args.tokenize is True)):
        print("Starting to prettify the XMLs.")
        beautify_markup.beautify_xml("nlp", True, 80)
        print("Prettified the XMLs.")
    elif(args.skip_beautify_xml is True):
        print("Skipping script beautify_markup.py.")

    end_time = datetime.now()
    print("End time of script is:", str(end_time))
    logger.info("End time of script is: " + str(end_time))
    duration = end_time - start_time
    print("Duration of script is:", duration)
    logger.info("Script duration is: " + str(duration))


if __name__ == '__main__':
    main()
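A usage sketch for the flags above (not part of the commit). The value "lemmatized" for input_type_name is an assumption, since the commit does not list the accepted values:

```python
# Sketch: lemmatize without stop words, then compute n-grams grouped by year.
# "path/to/..." are placeholders; "lemmatized" is an assumed input_type_name.
import subprocess

subprocess.run(["python", "bundesdata_nlp.py",
                "-sp", "path/to/beautiful_xml", "path/to/workdir",
                "-lm", "-ns"], check=True)
subprocess.run(["python", "bundesdata_nlp.py",
                "-cn", "year", "lemmatized"], check=True)
```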
bundesdata_markup_nlp/config.ini (executable file)
@@ -0,0 +1,47 @@
[Regular expressions time extraction]
session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)
session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))

[Regular expressions splits]
session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)
attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))

[Regular expressions speakers]
speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt

[Additional name features]
academic_titles = Dr. Dr. h. c. ; Dr. h. c.
parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.

[Regular expressions speeches]
comments = \B\([^\(\)]*\)\B ; kommentar
date_string = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]*|[\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]* ; metadata

[Multiline entities]
multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar

[File paths]
nlp_output = /home/stephan/Desktop/tmp_test/nlp_output
nlp_input = /home/stephan/Desktop/tmp_test/nlp_output/nlp_beuatiful_xml/
nlp_lemmatized_tokenized = /home/stephan/Desktop/tmp_test/nlp_output/lemmatized
tmp_path = /home/stephan/Desktop/tmp_test/nlp_output/lemmatized/tmp
nlp_beuatiful_xml = /home/stephan/Desktop/tmp_test/nlp_output/nlp_beuatiful_xml
input_folder_xmls = /home/stephan/Desktop/tmp_test/protocols/
output_folder = /home/stephan/Desktop/tmp_test/output
new_metadata = /home/stephan/Desktop/tmp_test/output/new_metadata
new_simple_markup = /home/stephan/Desktop/tmp_test/output/simple_xml
complex_markup = /home/stephan/Desktop/tmp_test/output/complex_markup
clear_speech_markup = /home/stephan/Desktop/tmp_test/output/clear_speech_markup
beautiful_xml = /home/stephan/Desktop/tmp_test/output/beautiful_xml
fixed_markup = /home/stephan/Repos/master_thesis/data/working_data/id_fixed/fixed_markup
bundesdata_markup_nlp/config_(backup).ini (executable file)
@@ -0,0 +1,46 @@
[Regular expressions time extraction]
session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)
session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))

[Regular expressions splits]
session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)
attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))

[Regular expressions speakers]
speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt

[Additional name features]
academic_titles = Dr. Dr. h. c. ; Dr. h. c.
parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.

[Regular expressions speeches]
comments = \B\([^\(\)]*\)\B ; kommentar
date_string = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]*|[\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]* ; metadata

[Multiline entities]
multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar

[File paths]
nlp_output = /home/stephan/Desktop/nlp_output
nlp_input = /home/stephan/Repos/master_thesis_data/data/outputs/outputs_markup/development_data/beautiful_xml/
nlp_lemmatized_tokenized = /home/stephan/Desktop/nlp_output/lemmatized
tmp_path = /home/stephan/Repos/master_thesis/data/working_data/output/clear_speech_markup/tmp
nlp_beuatiful_xml = /home/stephan/Desktop/nlp_output/nlp_beuatiful_xml
input_folder_xmls = /home/stephan/Repos/master_thesis/data/working_data/sub_set/
output_folder = /home/stephan/Repos/master_thesis/data/working_data/output
new_metadata = /home/stephan/Repos/master_thesis/data/working_data/output/new_metadata
new_simple_markup = /home/stephan/Repos/master_thesis/data/working_data/output/simple_xml
complex_markup = /home/stephan/Repos/master_thesis/data/working_data/output/complex_markup
clear_speech_markup = /home/stephan/Repos/master_thesis/data/working_data/output/clear_speech_markup
beautiful_xml = /home/stephan/Repos/master_thesis/data/working_data/output/beautiful_xml
bundesdata_markup_nlp/config_readme.md (executable file)
@@ -0,0 +1,105 @@
[Regular expressions time extraction]
# These regular expressions are used to extract the start and end time of a
# session. The regular expressions are kind of complex because they have to catch
# a lot of human errors. To catch those errors the expression is repeatedly
# "chained" by using the or statement with only minor differences between each
# expression. This is still the easiest way to catch as many times as possible.
# The expressions match the partial strings where the start or end time is mentioned.
# With different match groups the hours and minutes will then be extracted.

# START TIME: Matches the start time.
session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)

# END TIME: Matches the end time.
session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))


[Regular expressions splits]
# These expressions are used to split the protocols at the locations they
# match.
# All match groups are non-capturing except the group capturing the entire regex,
# so it can be inserted again later on. This is the main difference to the time extractions.
# These splits are needed to automatically separate the actual session content
# from the table of contents and the attachments.

# Split at the first president occurrence.
session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)

# Split at the end time of the session.
attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))


[Regular expressions speakers]
# These are the regular expressions for matching the speakers in the protocols.
# They consist of tuples with three values.
# The first element of the tuple is the regex.
# The second element is a case that tells if this regex should be used as a
# first, middle, or last element/match during the markup process.
# The third element describes, in German, the type of speech the speaker is
# giving, to use it as an attribute later on.
# The value tuple is divided with " ; " to convert it into a list later on.
# It is similar to CSV syntax. If needed, the user can add more key, value pairs following the same
# pattern to automatically identify even more speaker roles.

speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt


[Additional name features]
# In this section the user can add additional strings which are not part of the
# Stammdatenbank but are used inside the protocols.
academic_titles = Dr. Dr. h. c. ; Dr. h. c.
parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.


[Regular expressions speeches]
# These regular expressions are used to mark up some entities inside of the actual speeches.
# The value of any given key is a tuple with two values split by " ; " like in the section
# \[Regular expressions speakers\]. The first value is the regex and the second value is the tagname
# written as a string. This list of key, value pairs can also be extended by the user to identify
# even more entities inside of the speeches. Just add key, value pairs following the same pattern.
# These expressions are only used to identify entities which are present in one <p> without
# linebreaks.

comments = \B\([^\(\)]*\)\B ; kommentar
date_string_with_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]* ; metadata
date_string_without_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]* ; metadata


[Multiline entities]
# These regular expressions are used to identify entities in speeches which span multiple <p>
# elements. The value of any given key is a tuple with three values split by " ; " like in the
# section [Regular expressions speakers]. The first value is a regex describing what the start of the
# entity string looks like. The second value is a regex describing what the end of the entity string
# looks like. The third value is the tagname written as a normal string.
multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar


[File paths]
# This is where the paths for input and output folders are set. The input folder
# path should contain the XML-protocols that will be processed.
# The output folder path specifies the place where all the intermediate files
# and the final new XML protocols with the newly created automatic markup will
# be saved.

input_folder_xmls = /home/stephan/Repos/master_thesis/data/working_data/development_data_xml
output_folder = /home/stephan/Repos/master_thesis/data/working_data/

# These paths will be set while running the program.
nlp_output = /home/stephan/Desktop/nlp_output
nlp_input = /home/stephan/Desktop/protocols/
nlp_lemmatized_tokenized = /home/stephan/Desktop/nlp_output/lemmatized
tmp_path = /home/stephan/Desktop/nlp_output/lemmatized/tmp
nlp_beuatiful_xml = /home/stephan/Desktop/nlp_output/nlp_beuatiful_xml
input_folder_xmls = /home/stephan/Repos/master_thesis_data/inputs/excluded_periods/
output_folder = /home/stephan/Desktop/output
new_metadata = /home/stephan/Desktop/output/new_metadata
new_simple_markup = /home/stephan/Desktop/output/simple_xml
complex_markup = /home/stephan/Desktop/output/complex_markup
clear_speech_markup = /home/stephan/Desktop/output/clear_speech_markup
beautiful_xml = /home/stephan/Desktop/output/beautiful_xml
fixed_markup = /home/stephan/Repos/master_thesis/data/working_data/id_fixed/fixed_markup
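A minimal sketch (not part of the commit) of how one of the " ; "-separated tuples described above can be unpacked with configparser; interpolation is disabled so regex metacharacters pass through untouched:

```python
# Sketch: reading one " ; "-separated tuple from the speakers section.
import configparser

config = configparser.ConfigParser(interpolation=None)
config.read("config.ini")

raw = config["Regular expressions speakers"]["speaker_state_secretary"]
regex, position, role = [part.strip() for part in raw.split(" ; ")]
# regex    -> the pattern itself
# position -> "first", "middle" or "last"
# role     -> the German speech-type attribute, here "Staatssekretär"
```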
bundesdata_markup_nlp/markup/EntityMarkup.py (executable file)
@@ -0,0 +1,225 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from markup.MetadataMarkup import MetadataMarkup
from lxml import etree
from xml.etree import ElementTree
from xml.sax.saxutils import escape
import logging
import os
import re


class EntityMarkup(MetadataMarkup):
    """Class for getting an XML node in which entities will be marked.
    In practice this class and its methods can be used to get the text of a
    given node and mark every speaker in this text string.
    Also passes methods and fields to the more specific
    SimpleSpeakersMarkup."""

    def __init__(self, file_path, element_name=".//sitzungsverlauf"):
        super().__init__()
        self.file_path = file_path
        self.element_name = element_name
        self.xml_tree = None
        self.current_string = str()
        self.filename = os.path.basename(file_path)
        self.logger = logging.getLogger(__name__)

    def get_element_text(self):
        """
        Gets the strings of all elements matched by an element XPath. The
        element name will be passed when the class is instanced. Distinguishes
        between one string or several strings.
        """
        self.all_elements = self.xml_tree.iterfind(self.element_name)
        len_all_elements = len(list(self.all_elements))
        self.current_strings = []
        if(len_all_elements == 1):
            self.all_elements = self.xml_tree.iterfind(self.element_name)
            self.current_string = escape(list(self.all_elements)[0].text)
            self.current_strings.append(self.current_string)
        elif(len_all_elements > 1):
            self.current_strings = []
            self.all_elements = self.xml_tree.iterfind(self.element_name)
            for element in self.all_elements:
                string = escape(element.text)
                self.current_strings.append(string)
            self.all_elements = self.xml_tree.iterfind(self.element_name)

    def replace_string(self, replacement_string, element_name):
        """
        This function takes the newly manipulated XML string and overwrites
        the old string with it.
        """
        replacement_string = (
            "<" + element_name + ">"
            + replacement_string
            + "</" + element_name + ">"
        )
        for element in self.xml_tree.xpath("//%s" % element_name):
            element.getparent().remove(element)
        replacement_element = etree.fromstring(replacement_string)
        self.xml_tree.insert(1, replacement_element)

    def simple_check_xml(self, xml_string, file_name, save_valid, node=True):
        """
        Checks if a given XML element is well-formed XML. If it is checking a
        partial string it adds a root element. If node is False it is checking
        a document as a string.
        """
        try:
            if(node is True):
                folder_path = "logs/well-formed_strings/"
                file_path = os.path.join(folder_path, os.path.basename(file_name))
                xml_string = "<root>" + xml_string + "</root>"
                tree = etree.fromstring(xml_string)
                self.logger.info(("The node string is well-formed. Simple markup is"
                                  " correct. Node string can be found in "
                                  + folder_path))
                self.logger.info(tree)
                if(save_valid is True):
                    self.logger.info("Node string can be found in " + folder_path)
                    if not os.path.exists(folder_path):
                        os.mkdir(folder_path)
                    with open(file_path, "w") as text_file:
                        text_file.write(xml_string)
            else:
                folder_path = "logs/well-formed_files/"
                file_path = os.path.join(folder_path, os.path.basename(file_name))
                xml_string = xml_string
                tree = etree.fromstring(xml_string)
                self.logger.info("The XML file is well-formed.")
                self.logger.info(tree)
                if(save_valid is True):
                    self.logger.info("File can be found in " + folder_path)
                    if not os.path.exists(folder_path):
                        os.mkdir(folder_path)
                    with open(file_path, "w") as text_file:
                        text_file.write(xml_string.decode("utf-8"))
        except Exception as e:
            if(node is True):
                folder_path = "logs/not_well-formed_strings/"
                file_path = os.path.join(folder_path, os.path.basename(file_name))
                if not os.path.exists(folder_path):
                    os.mkdir(folder_path)
                with open(file_path, "w") as text_file:
                    text_file.write(xml_string)
                self.logger.error(("XML node string is not well-formed. XML can be"
                                   " found in " + folder_path))
                self.logger.error(e)
            else:
                folder_path = "logs/not_well-formed_files/"
                file_path = os.path.join(folder_path, os.path.basename(file_name))
                if not os.path.exists(folder_path):
                    os.mkdir(folder_path)
                with open(file_path, "w") as text_file:
                    text_file.write(xml_string.decode("utf-8"))
                self.logger.error(("XML file is not well-formed. XML can be"
                                   " found in " + folder_path))
                self.logger.error(e)
            return False

    def inject_element(self, current_element, regex, tagname,
                       strip_newlines=False):
        """
        Injects new XML elements into the selected element text. The new
        element will be created by using a regular expression which matches a
        partial string in the current_element text string. The match will be
        the new_element text string. The tagname sets the tagname of the
        new_element. Optionally, attributes can be set as well.
        """
        element_string = ElementTree.tostring(current_element, encoding="unicode", method="xml")
        match = re.search(regex, element_string)
        if(match):
            index_shift = 0
            if(strip_newlines is True):
                counter = match.group().count("\n")
                match_str = re.sub(r"\n", "", match.group())
            else:
                counter = 0
                match_str = match.group()
            index_start = match.start() + index_shift - counter
            index_end = match.end() + index_shift - counter
            new_element = etree.Element(tagname)
            new_element.text = match_str
            new_element_str = ElementTree.tostring(new_element, encoding="unicode", method="xml")
            element_string = (element_string[:index_start]
                              + new_element_str
                              + element_string[index_end:])
            index_shift += len(new_element_str) - len(match_str)
            replacement_element = etree.fromstring(element_string.encode("utf8"))
            current_element.getparent().replace(current_element, replacement_element)

    def markup_speech_lines(self, current_element):
        """
        Inserts markup in every speech that marks every line <p> with the
        attribute klasse="J". J is set for every line even if it is O. In the
        early protocols (periods 1 to 10) one line is most of the time a
        sentence. In the later periods one line is capped at around 80
        characters.
        """
        lines = current_element.xpath("text()")
        if(len(lines) > 0):
            lines = lines[0].splitlines()
            current_element.xpath(".//redner")[0].tail = ""
            for line in lines:
                part_element = etree.Element("p")
                part_element.set("klasse", "J")
                part_element.text = line
                current_element.append(part_element)

    def get_multiline_entities(self, elements, start_of_str, end_of_str,
                               tagname):
        """
        This function identifies multiline entities (i.e. Kommentare/comments)
        which are split over multiple elements that have been marked with the
        markup_speech_lines() function.
        Gets the text of those elements and joins it together into one string.
        The first element's text will be set to the newly created string,
        surrounded by new XML tags with the tagname set to the input tagname.
        All other elements with the rest of the string will be deleted.
        start_of_str should be a regex that describes the pattern of how the
        start of the supposed multiline entity looks. end_of_str describes the
        pattern of how the end of the supposed multiline entity looks.
        """
        self.multiline_text = []
        self.multiline_elements = []
        start_found = False
        end_found = False
        for element in elements:
            if(start_found is False and end_found is False
               and element.text is not None):
                start_match = re.search(start_of_str, element.text)
                if(start_match is not None):
                    self.multiline_text.append(start_match.group())
                    self.multiline_elements.append(element)
                    start_found = True
                    continue
            elif(start_found is True and end_found is False
                 and element.text is not None):
                end_match = re.search(end_of_str, element.text)
                if(end_match):
                    self.multiline_text.append(end_match.group())
                    self.multiline_elements.append(element)
                    end_found = True
                    continue
                else:
                    self.multiline_text.append(element.text)
                    self.multiline_elements.append(element)
                    continue
            elif(start_found is True and end_found is True):
                new_element_text = re.sub(r"- ", "", " ".join(self.multiline_text))  # joins the string parts and also removes hyphenation
                part_element = etree.Element("p")
                part_element.set("klasse", "J")
                comment_element = etree.Element(tagname)
                comment_element.text = new_element_text
                part_element.append(comment_element)
                self.multiline_elements[0].getparent().replace(self.multiline_elements[0], part_element)
                for element in self.multiline_elements[1:]:
                    element.getparent().remove(element)
                start_found = False
                end_found = False
                self.multiline_text = []
                self.multiline_elements = []
                continue
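A small sketch (not part of the commit) showing the multiline_comment start/end patterns from config.ini applied to two consecutive speech lines, the situation get_multiline_entities() scans for:

```python
# Sketch: the multiline_comment tuple from config.ini, split into its start
# and end patterns, matched against two consecutive <p> line texts.
import re

start_of_str = r"\B\([^\(\)]*"  # first half of the config tuple
end_of_str = r"[^\(\)]*\)\B"    # second half of the config tuple

line_1 = "(Beifall bei der CDU/CSU"  # comment opens, no closing parenthesis
line_2 = "sowie bei der SPD)"        # comment closes on the next line

assert re.search(start_of_str, line_1) is not None
assert re.search(end_of_str, line_2) is not None
```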
bundesdata_markup_nlp/markup/MdBData.py (executable file)
@@ -0,0 +1,22 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.XMLProtocol import XMLProtocol
import logging


class MdBData(XMLProtocol):
    """Class to handle operations on the Stammdatenbank."""

    def __init__(self):
        super(XMLProtocol, self).__init__()
        self.logger = logging.getLogger(__name__)

    def get_set(self, element_path, element_tree):
        """
        Creates a set from the input path on element_tree.
        """
        tmp_list = [element.text for element in
                    element_tree.iterfind(element_path) if element is not None]
        set_of_elements = set(tmp_list)
        return set_of_elements
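A self-contained sketch (not part of the commit) of the deduplication get_set() performs, using a tiny inline tree instead of the Stammdatenbank:

```python
# Sketch: collect the text of all XPath matches into a set, as get_set() does.
from lxml import etree

tree = etree.fromstring(
    "<mdbs>"
    "<mdb><nachname>Schmidt</nachname></mdb>"
    "<mdb><nachname>Schmidt</nachname></mdb>"
    "<mdb><nachname>Brandt</nachname></mdb>"
    "</mdbs>")

surnames = {element.text for element in tree.iterfind(".//nachname")}
# {'Schmidt', 'Brandt'} -- duplicates collapse, just like in get_set()
```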
bundesdata_markup_nlp/markup/MetadataMarkup.py (executable file)
@@ -0,0 +1,267 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.XMLProtocol import XMLProtocol
from utility import update_config
from lxml import etree
from datetime import datetime
from babel.dates import format_date
import os
import re
import logging
import configparser


class MetadataMarkup(XMLProtocol):
    """
    This class opens one XML protocol, extracts the included metadata and
    creates a new valid metadata head.
    """

    def __init__(self):
        super().__init__()
        self.plenarprotokoll_string = str()  # will be extracted with extract_metadata()
        self.wahlperiode = int()  # will be extracted with extract_metadata()
        self.sitzungsnr = int()  # will be extracted with extract_metadata()
        self.herausgeber = "Deutscher Bundestag"  # always the same in every protocol
        self.berichtart = "Stenografischer Bericht"  # always the same in every protocol
        self.sitzungstitel_string = ". Sitzung"  # always the same in every protocol
        self.ort = "Berlin"  # always the same in every protocol
        self.datum_ger_non_iso = str()  # will be extracted with extract_metadata()
        self.datum_iso = str()  # ISO date will be built from self.datum_ger_non_iso
        self.datum_string = str()  # will be built from self.datum_iso
        self.attachment = str()  # will be extracted from a split; will not work
        # every time, but will not break the XML
        self.logger = logging.getLogger(__name__)
    def extract_metadata(self, etree_element_object):
        """
        Extracts metadata from the given XML tags and writes them into the
        instance variables.
        """
        root = etree_element_object
        metadata_list = []
        for element in root.iter():
            if(element.tag != "TEXT"):
                metadata_list.append(element.text)
        metadata_list = metadata_list[1:]
        self.wahlperiode = metadata_list[0]
        self.plenarprotokoll_string = metadata_list[1].lower().title()
        self.sitzungsnr = metadata_list[2].split("/")[1]
        self.datum_ger_non_iso = metadata_list[3]
        self.logger.info("Metadata successfully extracted.")
        self.logger.info("Wahlperiode is: " + self.wahlperiode)
        self.logger.info("Plenarprotokoll is: " + self.plenarprotokoll_string)
        self.logger.info("Sitzungsnummer is: " + self.sitzungsnr)
        self.logger.info("German non-ISO date is: " + self.datum_ger_non_iso)
    def built_iso_date(self, ger_date):
        """
        Gets the German date and converts it to an ISO standard date.
        """
        self.datum_iso = datetime.strptime(ger_date, "%d.%m.%Y").date()
        self.logger.info("ISO date created: " + str(self.datum_iso))

    def built_date_string(self, iso_date):
        """
        Gets the ISO date and creates a full German date string from it.
        """
        date_string = format_date(iso_date, format="full", locale="de_DE")
        date_string = re.sub(r",", ", den", date_string)
        self.datum_string = date_string
        self.logger.info("Date string created: " + self.datum_string)
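    # Illustration (made-up date, not part of the original file): how the two
    # date helpers chain, with babel as the only extra dependency:
    #
    #     from datetime import datetime
    #     from babel.dates import format_date
    #     import re
    #
    #     iso = datetime.strptime("04.03.2015", "%d.%m.%Y").date()  # 2015-03-04
    #     full = format_date(iso, format="full", locale="de_DE")    # 'Mittwoch, 4. März 2015'
    #     full = re.sub(r",", ", den", full)                        # 'Mittwoch, den 4. März 2015'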
    def delete_old_metadata(self, etree_element_object):
        """
        Deletes old metadata tags and text. Renames the root tag.
        """
        for element in etree_element_object.iter():
            if(element.tag != "TEXT" and element.tag != "DOKUMENT"):
                element.getparent().remove(element)
            elif(element.tag == "DOKUMENT"):
                element.tag = "dbtplenarprotokoll"
            elif(element.tag == "TEXT"):
                self.full_content = element.text
                element.getparent().remove(element)
        self.logger.info("Old metadata deleted.")
    def insert_new_metadata(self, etree_element_object):
        """
        Inserts the extracted metadata and the split content into newly
        created, valid XML tags according to the official schema.
        """
        vorspann_element = etree.Element("vorspann")
        xml_string = """
        <kopfdaten>
        <plenarprotokoll-nummer>{} <wahlperiode>{}</wahlperiode>/<sitzungsnr>{}</sitzungsnr>
        (neu)</plenarprotokoll-nummer>
        <herausgeber>{}</herausgeber>
        <berichtart>{}</berichtart>
        <sitzungstitel><sitzungsnr>{}</sitzungsnr>. Sitzung</sitzungstitel>
        <veranstaltungsdaten><ort>{}</ort>, <datum date="{}">{}</datum></veranstaltungsdaten>
        </kopfdaten>"""\
            .format(self.plenarprotokoll_string, self.wahlperiode,
                    self.sitzungsnr, self.herausgeber, self.berichtart,
                    self.sitzungsnr, self.ort, self.datum_ger_non_iso,
                    self.datum_string)
        etree_from_str = etree.fromstring(xml_string)
        etree_element_object.insert(0, vorspann_element)
        vorspann_element.append(etree_from_str)
        toc_element = etree.Element("inhaltsverzeichnis")
        toc_element.text = self.toc
        vorspann_element.append(toc_element)
        content_element = etree.Element("sitzungsverlauf")
        content_element.text = self.president + self.content
        etree_element_object.insert(2, content_element)
        anlagen_element = etree.Element("anlagen")
        anlagen_element.text = self.attachment
        etree_element_object.insert(3, anlagen_element)
        rednerliste_element = etree.Element("rednerliste",
                                            sitzungsdatum=self.datum_ger_non_iso)
        etree_element_object.insert(4, rednerliste_element)
        self.xml_tree = etree_element_object
        self.logger.info("New metadata XML head inserted." + xml_string)
    def split_content(self, etree_element_object):
        """Splits the full content into table of contents, speeches and, in
        some cases, attachments."""
        config = configparser.ConfigParser()
        config.read("config.ini")

        session_start_split = config["Regular expressions splits"]["session_start_president_split"]
        regex_start = re.compile(session_start_split)
        tmp_list = regex_start.split(self.full_content, maxsplit=1)
        self.toc = tmp_list[0]
        self.president = tmp_list[1]
        self.content = tmp_list[2]

        attachment_split = config["Regular expressions splits"]["attachment_split"]
        regex_att = re.compile(attachment_split)
        tmp_list = regex_att.split(self.content)
        tmp_list = [element for element in tmp_list if element is not None]
        if(tmp_list[-1] == ""):  # if the split does not match anything, the last item is an empty string
            self.content = "".join(tmp_list[0:-1])
            self.attachment = "Keine Anlage extrahiert."
            self.logger.warning("There is no attachment.")
        else:
            self.content = "".join(tmp_list[0:-1])
            self.attachment = tmp_list[-1]
            self.logger.info("Attachment found.")
        self.logger.info("Content split at: " + str(regex_start))
        self.logger.info("Content split at: " + str(regex_att))
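    # Illustration (hypothetical pattern, not the configured one): the
    # three-way unpacking above works because re.split() keeps capturing
    # groups in its output:
    #
    #     import re
    #     full_content = "Inhalt ... Präsident Dr. Lammert: Die Sitzung ist eröffnet."
    #     regex_start = re.compile(r"(Präsident [^:]+:)")
    #     toc, president, content = regex_start.split(full_content, maxsplit=1)
    #     # toc == "Inhalt ... ", president == "Präsident Dr. Lammert:",
    #     # content == " Die Sitzung ist eröffnet."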
    def get_session_times(self):
        """Looks into the entire protocol content to extract the starting time
        and the last closing time. If only one of both or none are found, the
        missing time will be set to xx:xx."""
        config = configparser.ConfigParser()
        config.read("config.ini")
        regex_conf_values = config.items("Regular expressions time extraction")
        regex_conf_values = [regex[1] for regex in regex_conf_values]
        tmp_list = []
        identifier = 0
        start_time_found = True
        end_time_found = True

        for regex in regex_conf_values:
            identifier += 1
            regex = re.compile(regex)
            if(identifier == 1):
                # Always gets the first start time.
                matches = list(regex.finditer(self.full_content))
                if(len(matches) > 1):
                    match = matches[-1]
                elif(len(matches) == 0):
                    match = None
                else:
                    match = matches[0]
            elif(identifier == 2):
                # Always gets the last closing time.
                matches = list(regex.finditer(self.full_content))
                if(len(matches) > 1):
                    match = matches[-1]
                elif(len(matches) == 0):
                    match = None
                else:
                    match = matches[0]

            if(match is None and identifier == 1):
                self.logger.warning("No start time found for " + str(regex))
                start_time_found = False
            elif(match is None and identifier == 2):
                self.logger.warning("No end time found for " + str(regex))
                end_time_found = False
            elif(match):
                session_time = [group for group in match.groups()
                                if group is not None]
                session_time = ["0" + group if len(group) == 1 else group for
                                group in session_time]  # adds a leading 0 if the digit length is 1
                if(len(session_time) == 2):
                    tmp_list.append(":".join(session_time))
                elif(len(session_time) == 1):
                    tmp_list.append(session_time[0] + ":00")

        if(len(tmp_list) == 2):
            self.session_start_time = tmp_list[0]
            self.session_end_time = tmp_list[1]
            self.logger.info("Start time found: " + self.session_start_time)
            self.logger.info("End time found: " + self.session_end_time)
            self.logger.info("Successfully matched start and end times.")
        elif(len(tmp_list) == 1 and start_time_found is True and end_time_found
                is False):
            self.session_start_time = tmp_list[0]
            self.session_end_time = "xx:xx"
            self.logger.warning("Only start time found: "
                                + self.session_start_time)
            self.logger.warning("End time set to: "
                                + self.session_end_time)
        elif(len(tmp_list) == 1 and start_time_found is False and end_time_found
                is True):
            self.session_end_time = tmp_list[0]
            self.session_start_time = "xx:xx"
            self.logger.warning("Only end time found: "
                                + self.session_end_time)
            self.logger.warning("Start time set to: "
                                + self.session_start_time)
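    # Illustration (made-up captures): the group post-processing above turns
    # captures such as ("9", "5") from "9.05 Uhr" into zero-padded clock
    # strings:
    #
    #     groups = ("9", "5")
    #     session_time = ["0" + g if len(g) == 1 else g for g in groups if g is not None]
    #     ":".join(session_time)   # '09:05'
    #     # a single captured hour such as ("13",) becomes '13' + ':00' --> '13:00'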
    def write_to_attr(self, element, attr_key, attr_value):
        """
        Writes two strings as an attribute key-value pair to a given
        element.
        """
        elements = self.xml_tree.findall(element)
        if(elements == []):
            element = self.tree.getroot()
            elements.append(element)
        for element in elements:
            element.set(attr_key, attr_value)
        self.logger.info("Wrote attribute "
                         + attr_key
                         + "="
                         + "\""
                         + attr_value
                         + "\"")
    def save_to_file(self, output_path, file_path, subfolder, config_section,
                     config_key):
        """
        Writes the new markup to a new XML file. Takes the output path and
        creates a new folder there. Also updates the config file with the new
        path.
        """
        self.filename = os.path.basename(file_path)
        save_path = os.path.join(output_path, subfolder)
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        tree = etree.ElementTree(self.xml_tree)
        new_filename = self.filename
        save_file_path = os.path.join(save_path, new_filename)
        tree.write(save_file_path,
                   pretty_print=True,
                   xml_declaration=True,
                   encoding="utf8",
                   doctype="<!DOCTYPE dbtplenarprotokoll SYSTEM 'dbtplenarprotokoll_minimal.dtd'>")
        self.logger.info("New XML saved to: " + save_file_path)
        update_config.update_config("config.ini", config_section, config_key,
                                    save_path)
161
bundesdata_markup_nlp/markup/SpeakerMarkup.py
Executable file
@ -0,0 +1,161 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from markup.EntityMarkup import EntityMarkup
import re
import logging


class SpeakerMarkup(EntityMarkup):
    """
    Class for specific markup of different speakers identified by different
    regular expressions included in the config file.
    """

    def __init__(self, string, regex):
        super(SpeakerMarkup).__init__()
        self.string_to_search = string
        self.regex_string = regex
        self.logger = logging.getLogger(__name__)
    def identify_speaker(self):
        """
        Gets the match objects for the speakers in the given text node,
        counts them and puts them into a list.
        """
        self.matches = list(re.finditer(self.regex_compiled,
                                        self.string_to_search))
        self.matches_count = len(self.matches)
    def markup_speaker(self, case="middle"):
        """
        This is where the first simple markup happens. It takes the matches
        and replaces them with simple markup for further processing. The
        'first' case uses re.sub; the 'middle' and 'last' cases work on the
        string itself.
        """

        def markup_logging():
            """Helper function for creating log file output."""
            if(self.matches_count == 0):
                self.logger.warning("0 matches for given expression: "
                                    + self.regex_string)
            elif(self.matches_count == 1):
                self.logger.info(str(self.matches_count)
                                 + " match for given expression: "
                                 + self.regex_string)
            elif(self.matches_count > 1):
                self.logger.info(str(self.matches_count)
                                 + " matches for given expression: "
                                 + self.regex_string)
if(case == "first"):
|
||||||
|
# Uses re.sub because it is only for one match.
|
||||||
|
start_tags = "<rede><redner>"
|
||||||
|
end_tags = "</redner>"
|
||||||
|
self.matches_count = 1 # sets count to 1 because it only marks the first match
|
||||||
|
markup_logging()
|
||||||
|
first_match = self.matches[0]
|
||||||
|
start_xml = start_tags + first_match.group() + end_tags
|
||||||
|
if(len(first_match.group().split()) <= 10):
|
||||||
|
self.string_to_search = self.regex_compiled.sub(start_xml,
|
||||||
|
self.string_to_search,
|
||||||
|
count=1)
|
||||||
|
self.markuped_string = self.string_to_search
|
||||||
|
|
||||||
|
elif(case == "middle"):
|
||||||
|
"""
|
||||||
|
Does not use re.sub because it is faster to work on the string.
|
||||||
|
Also it avoids looping two times to get the specific match.group()
|
||||||
|
which caused some errors.
|
||||||
|
"""
|
||||||
|
index_shift = 0
|
||||||
|
start_tags = "\n</rede><rede><redner>"
|
||||||
|
end_tags = "</redner>"
|
||||||
|
markup_logging()
|
||||||
|
for match in self.matches:
|
||||||
|
index_start = match.start() + index_shift
|
||||||
|
index_end = match.end() + index_shift
|
||||||
|
whole_match_len = len(match.group())
|
||||||
|
# Handels cases where lots of text before the actual speaker is # matched
|
||||||
|
linebrks_in_match = len(match.group().split("\n"))
|
||||||
|
if(linebrks_in_match >= 2):
|
||||||
|
last_part_match = "".join(match.group().split("\n")[1:])
|
||||||
|
first_line_of_match = match.group().split("\n")[0]
|
||||||
|
if(len(first_line_of_match.split()) <= 10):
|
||||||
|
match = first_line_of_match + last_part_match
|
||||||
|
else:
|
||||||
|
match = last_part_match
|
||||||
|
|
||||||
|
delta_start_index = whole_match_len - len(match)
|
||||||
|
index_start = index_start + delta_start_index
|
||||||
|
|
||||||
|
self.string_to_search = (self.string_to_search[:index_start]
|
||||||
|
+ start_tags
|
||||||
|
+ match
|
||||||
|
+ end_tags
|
||||||
|
+ self.string_to_search[index_end:]
|
||||||
|
)
|
||||||
|
index_shift += len(start_tags) + len(end_tags)
|
||||||
|
|
||||||
|
else:
|
||||||
|
self.string_to_search = (self.string_to_search[:index_start]
|
||||||
|
+ start_tags
|
||||||
|
+ match.group()
|
||||||
|
+ end_tags
|
||||||
|
+ self.string_to_search[index_end:]
|
||||||
|
)
|
||||||
|
index_shift += len(start_tags) + len(end_tags)
|
||||||
|
|
||||||
|
self.markuped_string = self.string_to_search
|
||||||
|
|
||||||
|
elif(case == "last"):
|
||||||
|
index_shift = 0
|
||||||
|
"""
|
||||||
|
Matches the end of the session to add the last closing <rede> tag
|
||||||
|
to the last speech for well-formed xml. Uses re.sub because it is
|
||||||
|
only one operation.
|
||||||
|
"""
|
||||||
|
end_tag = "</rede>"
|
||||||
|
session_close_time_tag = ('<sitzungsende/>')
|
||||||
|
# Created end tags will be inserted into the protocol
|
||||||
|
if(len(self.matches) == 1):
|
||||||
|
self.logger.info("Last speech successfully tagged.")
|
||||||
|
markup_logging()
|
||||||
|
for match in self.matches:
|
||||||
|
end_xml = end_tag + match.group() + session_close_time_tag
|
||||||
|
if(len(match.group().split()) <= 15):
|
||||||
|
self.string_to_search = self.regex_compiled.sub(end_xml,
|
||||||
|
self.string_to_search,
|
||||||
|
count=1)
|
||||||
|
self.markuped_string = self.string_to_search
|
||||||
|
|
||||||
|
elif(len(self.matches) == 0):
|
||||||
|
self.logger.warning(("No end of session found! Last tag " + end_tag
|
||||||
|
+ " will be added to the end of the protocol."
|
||||||
|
" This might add some unrelated text to the "
|
||||||
|
"last speech."))
|
||||||
|
markup_logging()
|
||||||
|
self.markuped_string = self.string_to_search + end_tag
|
||||||
|
|
||||||
|
else:
|
||||||
|
markup_logging()
|
||||||
|
self.logger.warning(("There are " + str(len(self.matches))
|
||||||
|
+ " session endings. Ignoring the endings"
|
||||||
|
+ " before the last final ending of the "
|
||||||
|
+ " session."))
|
||||||
|
match = self.matches[-1]
|
||||||
|
end_xml = end_tag + match.group() + session_close_time_tag
|
||||||
|
whole_match_len = len(match.group())
|
||||||
|
index_start = match.start() + index_shift
|
||||||
|
index_end = match.end() + index_shift
|
||||||
|
last_line = match.group().split("\n")[-1] # Always takes the last line of a match avoiding lots of text before the actual speaker.
|
||||||
|
delta_start_index = whole_match_len - len(last_line)
|
||||||
|
index_start = index_start + delta_start_index
|
||||||
|
self.string_to_search = (self.string_to_search[:index_start]
|
||||||
|
+ end_xml
|
||||||
|
+ self.string_to_search[index_end:])
|
||||||
|
index_shift += len(end_tag)
|
||||||
|
self.markuped_string = self.string_to_search
|
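The 'middle' case relies on a cumulative index_shift so that offsets from finditer (computed on the untouched string) stay valid while tags are spliced in. A minimal sketch of the technique, assuming a trivial stand-in speaker pattern:

import re

text = "Dr. A:\nHello.\nDr. B:\nWorld."
pattern = re.compile(r"Dr\. \w+:")   # hypothetical speaker regex
shift = 0
for m in pattern.finditer(text):     # iterator keeps scanning the original string
    start, end = m.start() + shift, m.end() + shift
    tagged = "<redner>" + m.group() + "</redner>"
    text = text[:start] + tagged + text[end:]
    shift += len("<redner>") + len("</redner>")  # every splice moves later offsets
print(text)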
554
bundesdata_markup_nlp/markup/SpeakerNameMarkup.py
Executable file
@ -0,0 +1,554 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from markup.SpeakerMarkup import SpeakerMarkup
from xml.etree import ElementTree
from lxml import etree
from tqdm import tqdm
from itertools import combinations
import copy
import logging
import re
import os


class SpeakerNameMarkup(SpeakerMarkup):
    """
    This class handles the complex markup of the speakers in one given
    protocol. It creates the name tag with all needed information from the
    Stammdatenbank and has to cross-reference every speaker with said
    Stammdatenbank.
    """
    known_redner_dicts = dict()
    last_wahlperiode = int()

    def __init__(self, file_path, element_name=".//redner"):
        super(SpeakerNameMarkup).__init__()
        self.file_path = file_path
        self.filename = os.path.basename(self.file_path)[:-4]
        self.element_name = element_name
        self.redner_dict = dict()
        self.all_speakers = []
        self.logger = logging.getLogger(__name__)
    def cross_reference_markup(self, strings, feature_set_dict,
                               MdB_etree):
        """
        Checks if features like name, surname, academic title and city are
        present in the input string. Consists of a main function and helper
        functions. First the string is split into tokens. Every token is
        checked against sets of valid names, surnames, academic titles and
        fractions. If there is a match, a dictionary entry is set accordingly.
        Also uses the add_missing_MdB_feature helper function in a second step
        to add features which are not present in the string or have been
        identified wrongly.
        The function creates a dictionary containing all features of one
        speaker, to create a valid XML element from it later on.
        """
        def initiate_dict(keys, extra_keys):
            """
            Creates a dictionary with a set of keys and sets them to None.
            Some specific keys will be set to specific values.
            """
            for key in keys:
                redner_dict[key] = None
            for key in extra_keys:
                redner_dict[key] = None
            redner_dict["feature_complete"] = False
            redner_dict["original_string"] = string
            redner_dict["identified"] = False
            redner_dict["damalige_fraktion"] = None
        def get_names(keys, dict, token):
            """
            Checks if a token is in the vorname or nachname set. If it is,
            the dictionary values will be set accordingly. Avoids that the
            surname is overwritten by a first name which is also a valid
            surname.
            """
            for key in keys[0:2]:  # only for vorname, nachname in written order
                if(token in feature_set_dict[key][0] and redner_dict[key]
                        is None):
                    redner_dict[key] = token
                elif(token in feature_set_dict["nachname"][0]
                        and redner_dict["nachname"] is not None):
                    redner_dict["nachname"] = token
                else:
                    continue
        def get_feature(key, string, set):
            """
            Checks if a token is a valid feature (like a name affix, academic
            title, ortszusatz or namenszusatz) and adds it to the dictionary.
            Does not check for names.
            """
            for feature in set:
                if(key == "titel"):
                    regex = r"(\b{}\B)".format(re.escape(feature))  # could be "Dr." and "." is not a word boundary
                elif(key == "namenszusatz"):
                    regex = r"\b({})\b".format(re.escape(feature))  # no "." in the word, so word boundaries at start and end of the regex
                elif(key == "fraktion"):
                    regex = r"\B(\({}\))\B".format(re.escape(feature))  # always surrounded by parentheses, which also have to be matched to avoid matching e.g. "CDU" in "CDU/CSU"
                elif(key == "ortszusatz"):
                    regex = r"\B{}\B".format(re.escape(feature))  # always surrounded by parentheses
                else:
                    regex = r"(\b{}\b)".format(re.escape(feature))
                match = re.search(regex, string)
                if(match):
                    if(key == "fraktion"):
                        redner_dict[key] = match.group()[1:-1]  # removes ()
                        break
                    else:
                        redner_dict[key] = match.group()
                        break
                else:
                    redner_dict[key] = None
        def get_role(string):
            """Checks the redner string for a role. Identifies 'Bundesministerin
            für Familie, Senioren, Frauen und Jugend' etc."""
            if("Staatssekretär" in string or "Staatssekretärin" in string):
                regex = r"(Staatssekretär(in)?)"
                splits = re.split(regex, string, maxsplit=1)
                role_long = splits[1] + splits[-1]
                redner_dict["rolle_lang"] = role_long
                role_short = [word[0] for word in role_long.split()
                              if word[0].isupper()]
                role_short = splits[1] + " " + "".join(role_short)
                redner_dict["rolle_kurz"] = role_short
            elif("Bundesminister" in string or "Bundesministerin" in string):
                regex = r"(Bundesminister(in)?)"
                splits = re.split(regex, string, maxsplit=1)
                role_long = splits[1] + splits[-1]
                redner_dict["rolle_lang"] = role_long
                role_short = [word[0] for word in role_long.split()
                              if word[0].isupper()]
                role_short = splits[1] + " " + "".join(role_short)
                redner_dict["rolle_kurz"] = role_short
        def check_name(redner_dict):
            """
            Checks if vorname and nachname are the same. Sets vorname to None
            if true; vorname will be set later on with add_missing_MdB_feature.
            """
            if(redner_dict["nachname"] == redner_dict["vorname"]):
                redner_dict["vorname"] = None
        def get_party(redner_dict):
            """
            Creates a party key in the dictionary containing the party of the
            speaker. Party is not the same as fraction. This is mainly done
            because CDU/CSU is the fraction in the Bundestag but speakers can
            belong to either the CDU or the CSU. If the fraction is not
            CDU/CSU, party will be set to fraction. Also handles problems with
            GRÜNE.
            """
            if(redner_dict["fraktion"] != "CDU/CSU"
                    and redner_dict["fraktion"] != "CDU"
                    and redner_dict["fraktion"] != "CSU"):
                redner_dict["partei"] = redner_dict["fraktion"]
            elif(redner_dict["fraktion"] == "CDU"
                    or redner_dict["fraktion"] == "CSU"):
                redner_dict["partei"] = redner_dict["fraktion"]
                redner_dict["fraktion"] = "CDU/CSU"
            if(redner_dict["fraktion"] == "GRÜNE"):
                redner_dict["fraktion"] = "BÜNDNIS 90/DIE GRÜNEN"
        def check_party_and_fraction():
            """
            Checks if party and fraction have been set correctly. Will be used
            after add_missing_MdB_feature to correct some errors with CDU/CSU.
            """
            if(redner_dict["fraktion"] is not None
                    and (redner_dict["partei"] == "CDU"
                         or redner_dict["partei"] == "CSU")):
                redner_dict["fraktion"] = "CDU/CSU"

            if(redner_dict["partei"] is None
                    and redner_dict["fraktion"] is not None
                    and redner_dict["fraktion"] != "CDU"
                    and redner_dict["fraktion"] != "CSU"):
                redner_dict["partei"] = redner_dict["fraktion"]
        def get_match_in_str(key, string, regex):
            """
            Matches a regex in the current string and adds the match as the
            value for the given key in the dictionary.
            """
            match = re.search(regex, string)
            if(match):
                redner_dict[key] = match.group()
            else:
                redner_dict[key] = None
        def add_missing_MdB_feature(string, redner_dict, feature_set_dict,
                                    MdB_etree, conditions_key_list,
                                    feature_lookup, feature_to_add,
                                    logging_state=False, multi_ids=False):
            """
            This function tries to get missing features for one speaker. Input
            is a list of features (conditions_key_list) which are used as
            parameters in an XPath expression. The XPath is built dynamically
            from the list.
            If the XPath matches one unique entry, the feature (feature_to_add)
            will be set to the match of feature_lookup in the matched element.
            """
            ###
            # XPath creation from conditions_key_list
            ###
            xpath_parts = []
            conds = conditions_key_list
            len_conds = len(conds)
            if(len_conds == 1):
                for condition in conds:
                    xpath_part = ".//MDB[.//{}/text()='{}']" \
                        .format(feature_set_dict[condition][1],
                                redner_dict[condition])
                    xpath_parts.append(xpath_part)
                xpath = "".join(xpath_parts)
                if("None" in xpath):
                    xpath = None
            elif(len_conds == 2):
                xpath_first_part = ".//MDB[.//{}/text()='{}'" \
                    .format(feature_set_dict[conds[0]][1],
                            redner_dict[conds[0]])
                xpath_parts.insert(0, xpath_first_part)
                xpath_last_part = ".//{}/text()='{}']" \
                    .format(feature_set_dict[conds[-1]][1],
                            redner_dict[conds[-1]])
                xpath_parts.append(xpath_last_part)
                xpath = " and ".join(xpath_parts)
                if("None" in xpath):
                    xpath = None
            elif(len_conds > 2):
                xpath_first_part = ".//MDB[.//{}/text()='{}'" \
                    .format(feature_set_dict[conds[0]][1],
                            redner_dict[conds[0]])
                xpath_parts.insert(0, xpath_first_part)
                for condition in conds[1:-1]:
                    xpath_inner_part = ".//{}/text()='{}'" \
                        .format(feature_set_dict[condition][1],
                                redner_dict[condition])
                    xpath_parts.append(xpath_inner_part)
                xpath_last_part = ".//{}/text()='{}']" \
                    .format(feature_set_dict[conds[-1]][1],
                            redner_dict[conds[-1]])
                xpath_parts.append(xpath_last_part)
                xpath = " and ".join(xpath_parts)
                if("None" in xpath):  # sets the XPath to None if it uses a feature which is None
                    xpath = None
            xpath_parts = []  # empties the xpath_parts list
            try:  # tries every XPath
                matches = MdB_etree.xpath(xpath)
            except TypeError:  # handles XPaths that are None
                matches = []
            # If the XPath has one unique match, the new feature value will be set.
            if(len(matches) == 1):
                matches = matches[0]
                feature_lookup = ".//" + feature_lookup
                new_feature = matches.xpath(feature_lookup)[0].text
                self.logger.info((" There is one unique match"
                                  + " for this speaker: "
                                  + str(redner_dict)
                                  + " Extracted feature "
                                  + feature_lookup + ": "
                                  + str(new_feature)
                                  + " with: "
                                  + str(conds)))
                redner_dict[feature_to_add] = new_feature
                self.logger.info(("New speaker features are: "
                                  + str(redner_dict)))
            # Handles matches that are not unique; collects multiple ids if requested.
            elif(len(matches) > 1 and multi_ids is True):
                ids = matches
                for i, id in enumerate(ids):
                    key = "id" + str(i)
                    redner_dict[key] = id
            elif(len(matches) > 1):
                self.logger.warning((" There are "
                                     + str(len(matches))
                                     + " matches for this speaker: "
                                     + str(redner_dict)
                                     + " .Could not extract: "
                                     + feature_lookup
                                     + " Features used are: "
                                     + str(conds)))
            return matches
        def get_periode(MdB_etree):
            periode = self.xml_tree.xpath(".//wahlperiode")
            if(periode):
                redner_dict["wahlperiode"] = periode[0].text
                return periode[0].text
        ###
        # Start of the main function cross_reference_markup
        ###

        # Initiates an empty dict and gets keys for it
        redner_dict = dict()
        features = list(feature_set_dict.keys())

        # Counters to calculate how successful the identification of speakers is
        identified_speakers = 0
        unidentified_speakers = 0
        multiple_identified_speakers = 0

        # Cross-references every <redner> string
        for string in tqdm(strings, desc="Cross reference name markup for speakers in strings"):
            self.logger.info("\nStarting name markup process for new speaker:")
            # Sets values in redner_dict to None or a specific value
            initiate_dict(features, [feature for feature in features])
            tokens = string.replace(":", "").replace(",", "").split()  # strips ":" and "," because some tokens would otherwise be "name:" or contain a ","
            for token in tokens:
                get_names(features, feature_set_dict, token)
            self.logger.info("nachname is: " + str(redner_dict["nachname"]))
            feature_keys = [key for key in features if key not in ["vorname",
                                                                   "nachname"]]
            for f_key in feature_keys:
                get_feature(f_key, string, feature_set_dict[f_key][0])
            get_party(redner_dict)
            check_name(redner_dict)
            regex_p = r"^\w*(?:P|p)räsident\w*"
            get_match_in_str("präsident", string, regex_p)
            get_role(string)
            ###
            # Checks if the script is still running for the same current
            # periode. If this is not the case, known_redner_dicts will be
            # emptied.
            ###
            current_wahlperiode = get_periode(MdB_etree)
            if(current_wahlperiode != SpeakerNameMarkup.last_wahlperiode):
                SpeakerNameMarkup.known_redner_dicts = dict()
                SpeakerNameMarkup.last_wahlperiode = current_wahlperiode
            ###
            # Creates possible combinations of features which will be used in
            # add_missing_MdB_feature to identify missing features like
            # vorname or nachname.
            ###

            combination_features = [feature for feature in features if feature
                                    not in ["namenszusatz",
                                            "feature_complete",
                                            "id",
                                            "titel",
                                            "rolle_kurz",
                                            "rolle_lang",
                                            "original_string",
                                            "identified",
                                            "damalige_fraktion"]]
            subsets = []
            for length in range(0, 5):
                for subset in combinations(combination_features, length):
                    subsets.append(list(subset))
            subsets = subsets[1:]
            combination_features.remove("wahlperiode")
            combination_features.remove("nachname")

            ###
            # First while loop, trying to identify every feature for one
            # speaker using the combinations from above. Before calling
            # add_missing_MdB_feature there is a check whether the speaker has
            # already been identified before. If this is the case, the
            # features will be set to the already identified features. This
            # saves a lot of time.
            ###

            counter_feats = 0
while(redner_dict["feature_complete"] is False):
|
||||||
|
redner_dict["damalige_fraktion"] = redner_dict["fraktion"]
|
||||||
|
# print("Doing name markup for:", redner_dict)
|
||||||
|
# Checks if speaker has been already identified before.
|
||||||
|
if(string in SpeakerNameMarkup.known_redner_dicts):
|
||||||
|
# print("Speaker has already been identified once.")
|
||||||
|
redner_dict = SpeakerNameMarkup.known_redner_dicts[string].copy()
|
||||||
|
# print("Speaker features are set to:",
|
||||||
|
# SpeakerNameMarkup.known_redner_dicts[string])
|
||||||
|
redner_dict["identified"] = True
|
||||||
|
self.logger.info(("Speaker has alreeady been identified "
|
||||||
|
+ "once."))
|
||||||
|
self.logger.info(("Speaker features are set to: "
|
||||||
|
+ str(SpeakerNameMarkup.known_redner_dicts[string])))
|
||||||
|
if(SpeakerNameMarkup.known_redner_dicts[string]["feature_complete"] is not False):
|
||||||
|
identified_speakers += 1
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
for feature in combination_features:
|
||||||
|
for subset in subsets:
|
||||||
|
add_missing_MdB_feature(string,
|
||||||
|
redner_dict,
|
||||||
|
feature_set_dict,
|
||||||
|
MdB_etree,
|
||||||
|
subset,
|
||||||
|
feature_set_dict[feature][1],
|
||||||
|
feature)
|
||||||
|
check_party_and_fraction()
|
||||||
|
if(redner_dict["vorname"] is not None
|
||||||
|
and redner_dict["nachname"] is not None
|
||||||
|
and redner_dict["fraktion"] is not None
|
||||||
|
and redner_dict["partei"] is not None):
|
||||||
|
redner_dict["feature_complete"] = True
|
||||||
|
counter_feats += 1
|
||||||
|
if(counter_feats == len(combination_features)):
|
||||||
|
redner_dict["feature_complete"] = False
|
||||||
|
break
|
||||||
|
|
||||||
|
            ###
            # Second while loop; uses four features to identify the unique ID
            # for one speaker with add_missing_MdB_feature. Also tries to
            # identify speakers with fewer known features. In this case there
            # can be multiple possible ids for one speaker; these will be
            # saved in a special dictionary entry. Rare case.
            ###

            counter_ids = 0
            while(redner_dict["id"] is None):
                if(redner_dict["feature_complete"] is True):
                    add_missing_MdB_feature(string,
                                            redner_dict,
                                            feature_set_dict,
                                            MdB_etree,
                                            ["vorname", "nachname", "partei",
                                             "wahlperiode"],
                                            feature_set_dict["id"][1],
                                            "id")
                    key_original_string = redner_dict["original_string"]
                    SpeakerNameMarkup.known_redner_dicts.update(
                        {key_original_string: redner_dict.copy()})
                    redner_dict["identified"] = True
                    if(counter_ids == 1):
                        redner_dict["id"] = None
                        redner_dict["feature_complete"] = False
                        redner_dict["identified"] = False
                        self.logger.warning(("Unique ID could not be assigned. "
                                             + "Feature complete: True "
                                             + "Features are: "
                                             + str(redner_dict)))
                        SpeakerNameMarkup.known_redner_dicts.update(
                            {key_original_string: redner_dict.copy()})
                        unidentified_speakers += 1
                        identified_speakers -= 1  # because identified_speakers was incremented before
                        break
                    identified_speakers += 1
                elif(redner_dict["feature_complete"] is not True):
                    redner_dict["id"] = None
                    ids = add_missing_MdB_feature(string,
                                                  redner_dict,
                                                  feature_set_dict,
                                                  MdB_etree,
                                                  ["nachname", "partei",
                                                   "wahlperiode"],
                                                  feature_set_dict["id"][1],
                                                  "id", False, True)
                    if(ids is not None and len(ids) > 1):
                        redner_dict["identified"] = "Multiple"
                        multiple_identified_speakers += 1
                        identified_speakers -= 1
                        break
                    elif(ids is None):
                        self.logger.warning(("Unique ID could not be assigned. "
                                             + "Feature complete: False "
                                             + "Features are: "
                                             + str(redner_dict)))
                        redner_dict["identified"] = False
                        unidentified_speakers += 1
                        break
                counter_ids += 1
self.logger.info(("Number of identified speakers with valid id and"
|
||||||
|
+ " name markup is: "
|
||||||
|
+ str(identified_speakers)))
|
||||||
|
self.logger.info(("Number of unidentified speakers without valid"
|
||||||
|
+ " id and name markup is: "
|
||||||
|
+ str(unidentified_speakers)))
|
||||||
|
self.logger.info(("Number of speakers with possible multiple ids: "
|
||||||
|
+ str(multiple_identified_speakers)))
|
||||||
|
self.logger.info(("Number of all speaker entitiys in current"
|
||||||
|
+ " protocoll is: "
|
||||||
|
+ str(len(strings))))
|
||||||
|
redner_dict_final = copy.deepcopy(redner_dict)
|
||||||
|
self.redner_dict = redner_dict_final
|
||||||
|
self.all_speakers.append(self.redner_dict)
|
||||||
|
for key in features:
|
||||||
|
redner_dict[key] = None
|
||||||
|
|
||||||
|
# print("Speaker features after whole cross reference markup:",
|
||||||
|
# redner_dict_final)
|
||||||
|
self.logger.info(("Saved speakers (identfied and not identified): "
|
||||||
|
+ str(len(self.all_speakers))))
|
||||||
|
|
||||||
|
    def create_speaker_elements(self):
        """
        Creates a valid redner XML element for one redner_dict entry from the
        list self.all_speakers. Has to be done step by step because the
        dictionary is not sorted and the name sub-elements have to be in a
        specific order.
        """
        self.all_speaker_elements = []
        for redner_entry in tqdm(self.all_speakers, desc="Creating speaker element"):
            redner_element = etree.Element("redner")
            redner_element.set("id", str(redner_entry["id"]))
            name_element = etree.Element("name")
            titel_element = etree.Element("titel")
            titel_element.text = redner_entry["titel"]
            vorname_element = etree.Element("vorname")
            vorname_element.text = redner_entry["vorname"]
            namenszusatz_element = etree.Element("namenszusatz")
            namenszusatz_element.text = redner_entry["namenszusatz"]
            nachname_element = etree.Element("nachname")
            nachname_element.text = redner_entry["nachname"]
            damalige_fraktion_element = etree.Element("damalige_fraktion")
            damalige_fraktion_element.text = redner_entry["damalige_fraktion"]
            fraktion_element = etree.Element("fraktion")
            fraktion_element.text = redner_entry["fraktion"]
            partei_element = etree.Element("partei")
            partei_element.text = redner_entry["partei"]
            ortszusatz_element = etree.Element("ortszusatz")
            ortszusatz_element.text = redner_entry["ortszusatz"]
            rolle_lang_element = etree.Element("rolle_lang")
            rolle_lang_element.text = redner_entry["rolle_lang"]
            rolle_kurz_element = etree.Element("rolle_kurz")
            rolle_kurz_element.text = redner_entry["rolle_kurz"]
            original_string_element = etree.Element("original_string")
            original_string_element.text = redner_entry["original_string"]

            if(redner_entry["titel"] is not None):
                name_element.append(titel_element)
            name_element.append(vorname_element)
            if(redner_entry["namenszusatz"] is not None):
                name_element.append(namenszusatz_element)
            name_element.append(nachname_element)
            name_element.append(damalige_fraktion_element)
            name_element.append(fraktion_element)
            name_element.append(partei_element)
            if(redner_entry["ortszusatz"] is not None):
                name_element.append(ortszusatz_element)
            if(redner_entry["rolle_lang"] is not None):
                name_element.append(rolle_lang_element)
                name_element.append(rolle_kurz_element)
            name_element.append(original_string_element)
            name_element.tail = original_string_element.text
            redner_element.append(name_element)
            self.all_speaker_elements.append(redner_element)
            self.logger.info(("Speaker element is: "
                              + ElementTree.tostring(redner_element).decode("utf-8")))
    def set_speech_ids(self):
        """
        Sets a unique rede id for every rede element in one protocol. The id
        is a ten-digit integer preceded by the string ID, for example
        ID1809900000.
        The first two digits are the wahlperiode, the following three digits
        are the sitzungsnr (session number). The remaining digits count the
        speeches: the first speech is 00100, the second 00200, the eleventh
        01100 and so on. Example: ID1809901100 --> eleventh speech.
        The last two digits are reserved for corrections.
        """

        id_counter = 0
        speeches = self.xml_tree.xpath(".//sitzungsbeginn | .//rede")
        for speech in tqdm(speeches, desc="Creating speech ids"):
            id_counter_str = str(id_counter).zfill(5)
            id = "ID" + self.filename + id_counter_str
            speech.set("id", id)
            id_counter += 100
            self.logger.info(("Speech id is: " + id))
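The predicate chains that add_missing_MdB_feature assembles resolve to ordinary lxml XPath; a minimal sketch of the two-condition case, with made-up Stammdaten (the ID value is invented for illustration):

from lxml import etree

mdb_data = etree.fromstring(
    "<DOCUMENT><MDB><ID>11000001</ID><VORNAME>Erika</VORNAME>"
    "<NACHNAME>Mustermann</NACHNAME></MDB></DOCUMENT>")
xpath = (".//MDB[.//{}/text()='{}'".format("VORNAME", "Erika")
         + " and " + ".//{}/text()='{}']".format("NACHNAME", "Mustermann"))
matches = mdb_data.xpath(xpath)           # one unique <MDB> element
print(matches[0].xpath(".//ID")[0].text)  # 11000001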
0
bundesdata_markup_nlp/markup/__init__.py
Executable file
BIN
bundesdata_markup_nlp/markup/__pycache__/MdBData.cpython-37.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
bundesdata_markup_nlp/markup/__pycache__/__init__.cpython-37.pyc
Normal file
Binary file not shown.
BIN
bundesdata_markup_nlp/markup/__pycache__/metadata.cpython-37.pyc
Normal file
Binary file not shown.
BIN
bundesdata_markup_nlp/markup/__pycache__/speakers.cpython-37.pyc
Normal file
Binary file not shown.
BIN
bundesdata_markup_nlp/markup/__pycache__/speeches.cpython-37.pyc
Normal file
Binary file not shown.
49
bundesdata_markup_nlp/markup/beautify_markup.py
Executable file
@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.FileGetter import FileGetter
from utility.XMLProtocol import XMLProtocol
import configparser
from tqdm import tqdm


def beautify_xml(case, alter_lines=False, line_width=0):
    """
    Beautifies the XML protocols so that they are easily readable by humans.
    Uses .beautify_xml_part() and .beautify_xml() to be able to format lines
    for specific parts of an XML. alter_lines can be set to False or True.
    The line width that will be used if alter_lines is True can be set to any
    value between 0 and 160.
    """
    config = configparser.ConfigParser()
    config.read("config.ini")
    if(case == "markup"):
        output_path = config["File paths"]["output_folder"]
        input_path = config["File paths"]["clear_speech_markup"]
        key_name = "beautiful_xml"
    elif(case == "nlp"):
        output_path = config["File paths"]["nlp_output"]
        input_path = config["File paths"]["nlp_lemmatized_tokenized"]
        key_name = "nlp_beuatiful_xml"
    files = FileGetter(input_path, "*.xml")
    files = files.get_files()
    for file_path in tqdm(sorted(files), desc="First beautification steps"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        xml.beautify_xml_part(file_path, ".//vorspann")
        xml.replace_elements(".//vorspann", [xml.beautified_part])
        xml.beautify_xml_part(file_path, ".//sitzungsverlauf", alter_lines,
                              line_width)
        xml.replace_elements(".//sitzungsverlauf", [xml.beautified_part])
        xml.save_to_file(output_path, file_path, key_name,
                         "File paths", key_name)
    config.read("config.ini")
    beautiful_xmls_path = config["File paths"][key_name]
    files = FileGetter(beautiful_xmls_path, "*.xml")
    files = files.get_files()
    for file_path in tqdm(files, desc="Second beautification steps"):
        xml.beautify_xml(file_path, False)


if __name__ == '__main__':
    beautify_xml()
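A typical invocation, per the signature and docstring above ("markup" being the pre-NLP pass):

from markup.beautify_markup import beautify_xml

beautify_xml("markup", alter_lines=True, line_width=80)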
57
bundesdata_markup_nlp/markup/metadata.py
Executable file
@ -0,0 +1,57 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.FileGetter import FileGetter
from markup.MetadataMarkup import MetadataMarkup
from tqdm import tqdm
import os
import configparser
import logging


def get_metadata():
    """
    This script creates a valid metadata head and a first-level XML tag
    structure for all files in one directory with subdirectories. It needs
    the file paths of all files to consider; these will be extracted by using
    the FileGetter class.
    After that it extracts the given metadata for each file and writes it as
    valid XML according to the new official schema into a new file at the
    given output path.
    """
    logger = logging.getLogger(__name__)
    print("Running metadata creation for original XML protocols.")
    config = configparser.ConfigParser()
    config.read("config.ini")
    input_path = config["File paths"]["input_folder_xmls"]
    output_path = config["File paths"]["output_folder"]
    Files = FileGetter(input_path, "*.xml")
    file_list = Files.get_files()
    metadata = MetadataMarkup()
    for file in tqdm(sorted(file_list), desc="Metadata status:"):
        logger.info("\nCreating metadata for: " + str(os.path.basename(file)))
        root = metadata.read_protcol(file)
        metadata.extract_metadata(root)
        metadata.built_iso_date(metadata.datum_ger_non_iso)
        metadata.built_date_string(metadata.datum_iso)
        metadata.delete_old_metadata(root)
        metadata.split_content(root)
        metadata.insert_new_metadata(root)
        metadata.get_session_times()
        metadata.write_to_attr("dbtplenarprotokoll", "sitzung-datum",
                               metadata.datum_ger_non_iso)
        metadata.write_to_attr("dbtplenarprotokoll", "sitzung-start-uhrzeit",
                               metadata.session_start_time)
        metadata.write_to_attr("dbtplenarprotokoll", "sitzung-ende-uhrzeit",
                               metadata.session_end_time)
        metadata.write_to_attr("dbtplenarprotokoll", "sitzungs-nr",
                               metadata.sitzungsnr)
        metadata.write_to_attr("dbtplenarprotokoll", "wahlperiode",
                               metadata.wahlperiode)
        metadata.save_to_file(output_path, file, "new_metadata", "File paths", "new_metadata")
        logger.info("New metadata created for: " + str(os.path.basename(file)))
    print("Successfully extracted and wrote new metadata to XML protocols.")


if __name__ == '__main__':
    get_metadata()
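All of these scripts share config.ini. A minimal sketch of the sections and keys read above; the path values are placeholders, and only the keys that actually appear in the lookups are listed:

import configparser

config = configparser.ConfigParser()
config["File paths"] = {
    "input_folder_xmls": "/path/to/protocols",  # placeholder
    "output_folder": "/path/to/output",         # placeholder
}
config["Regular expressions splits"] = {
    "session_start_president_split": "...",     # left elided on purpose
    "attachment_split": "...",
}
with open("config.ini", "w") as f:
    config.write(f)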
122
bundesdata_markup_nlp/markup/speaker_names.py
Executable file
@ -0,0 +1,122 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from markup.SpeakerNameMarkup import SpeakerNameMarkup
from markup.MdBData import MdBData
from utility.FileGetter import FileGetter
from xml.etree import ElementTree
from tqdm import tqdm
import os
import configparser
import logging


def get_names():
    """
    This script gets the identified speaker elements and analyses their text
    to determine <vorname>, <nachname>, @id etc. for every speaker. It also
    creates a speech id for every speech.
    """
    ###
    # Setting paths from the config and starting logging
    ###
    logger = logging.getLogger(__name__)
    config = configparser.ConfigParser()
    config.read("config.ini")
    xml_path = config["File paths"]["new_simple_markup"]
    output_path = config["File paths"]["output_folder"]
    parent_path = os.path.dirname(os.getcwd())
    stammdatenbank_full_path = os.path.join(parent_path,
                                            "data/MdB_data/MdB_Stammdaten.xml")
    ###
    # Opens and reads the Stammdatenbank
    ###
    stammdatenbank = MdBData()
    stammdatenbank.read_xml(stammdatenbank_full_path)
    ###
    # Getting sets of different name/MdB features
    ###
    # getting first names
    first_names = stammdatenbank.get_set(".//VORNAME", stammdatenbank.xml_tree)
    first_names.discard(None)
    # getting last names
    last_names = stammdatenbank.get_set(".//NACHNAME", stammdatenbank.xml_tree)
    last_names.discard(None)
    # getting academic titles
    academic_titles = stammdatenbank.get_set(".//AKAD_TITEL",
                                             stammdatenbank.xml_tree)
    academic_titles_short = stammdatenbank.get_set(".//ANREDE_TITEL",
                                                   stammdatenbank.xml_tree)
    additional_academic_titles = [title for title in config["Additional name features"]["academic_titles"].split()]
    for title in additional_academic_titles:
        academic_titles.add(title)
    academic_titles = academic_titles.union(academic_titles_short)
    academic_titles.discard(None)
    # getting parties
    parties = stammdatenbank.get_set(".//PARTEI_KURZ", stammdatenbank.xml_tree)
    additional_parties = [party for party in config["Additional name features"]["parties"].split()]
    for party in additional_parties:
        parties.add(party)
    parties.discard(None)
    # getting name affixes
    name_affixes = stammdatenbank.get_set(".//PRAEFIX", stammdatenbank.xml_tree)
    name_affixes.discard(None)
    # getting cities
    cities = stammdatenbank.get_set(".//ORTSZUSATZ", stammdatenbank.xml_tree)
    cities.discard(None)
    # setting empty sets to later combine them with XML node names for XPaths
    party = set()
    periode = set()
    feature_complete = set()
    speaker_id = set()
    role_long = set()
    role_short = set()
    ###
    # Creating a dict with tuples of sets and corresponding XML node names
    ###
    sets = [(first_names, "VORNAME"), (last_names, "NACHNAME"),
            (academic_titles, "AKAD_TITEL"), (parties, "PARTEI_KURZ"),
            (name_affixes, "PRAEFIX"), (cities, "ORTSZUSATZ"),
            (party, "PARTEI_KURZ"), (periode, "WP"), (feature_complete, "None"),
            (speaker_id, "ID"), (role_long, "None"), (role_short, "None")]
    features = ["vorname", "nachname", "titel", "fraktion", "namenszusatz",
                "ortszusatz", "partei", "wahlperiode", "feature_complete",
                "id", "rolle_lang", "rolle_kurz"]
    feature_set_dict = dict(zip(features, sets))
    ###
    # Opening the XML protocols and starting the speaker markup for features
    ###
    files = FileGetter(xml_path, "*.xml")
    files = files.get_files()
    for file_path in tqdm(sorted(files),
                          desc="File status"):
        complex_speaker = SpeakerNameMarkup(file_path, ".//redner")
        complex_speaker.read_xml(file_path)
        complex_speaker.get_element_text()
        logger.info(("Doing cross reference markup for names to get redner ids."
                     + " For file: "
                     + os.path.basename(file_path)))
        complex_speaker.cross_reference_markup(complex_speaker.current_strings,
                                               feature_set_dict,
                                               stammdatenbank.xml_tree)
        complex_speaker.create_speaker_elements()
        complex_speaker.replace_elements(".//redner",
                                         complex_speaker.all_speaker_elements,
                                         True)
        xml_string = ElementTree.tostring(complex_speaker.xml_tree)
        well_formed = complex_speaker.simple_check_xml(xml_string, file_path,
                                                       False, False)
        if(well_formed is False):
            logger.error(("This XML file is not well-formed. Program stopped."
                          " Fix or remove this file and run the program again."
                          ))
            print("Program has stopped. See logs for more info.")
            break
        complex_speaker.set_speech_ids()
        complex_speaker.save_to_file(output_path, file_path, "complex_markup",
                                     "File paths", "complex_markup")


if __name__ == '__main__':
    get_names()
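feature_set_dict therefore maps each speaker feature to a (set_of_known_values, Stammdatenbank_node_name) tuple; a toy illustration with made-up names:

sets = [({"Max", "Erika"}, "VORNAME"), ({"Mustermann"}, "NACHNAME")]
features = ["vorname", "nachname"]
feature_set_dict = dict(zip(features, sets))
print(feature_set_dict["vorname"][0])  # {'Max', 'Erika'}  -- valid tokens
print(feature_set_dict["vorname"][1])  # VORNAME           -- XML node name for XPaths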
114
bundesdata_markup_nlp/markup/speakers.py
Executable file
@ -0,0 +1,114 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.FileGetter import FileGetter
from utility.XMLProtocol import XMLProtocol
from markup.EntityMarkup import EntityMarkup
from markup.SpeakerMarkup import SpeakerMarkup
from tqdm import tqdm
import configparser
import logging
import os


def get_speakers():
    """
    This script identifies speakers in an XML file with the new metadata
    structure created by metastructure.py and applies well-formed XML markup
    to them and their speeches. The markup tries to follow the official
    guideline from the Deutsche Bundesregierung but is more simplistic and
    deviates from it when it comes to marking up the president of a session.
    This decision was made to guarantee that every speaker's speech only
    contains what he or she is saying. Thus the markup follows its own minimal
    markup defined in the DTD 'minimal_markup.dtd', which mimics the official
    one as closely as possible. The full official markup cannot be applied to
    the XML protocols automatically. The script uses classes and subclasses
    from EntityMarkup.py.
    """
    logger = logging.getLogger(__name__)
    print("Running simple markup for first speaker identification.")
    config = configparser.ConfigParser()
    config.read("config.ini")
    regex_conf_triples = config.items("Regular expressions speakers")
    regex_conf_triples = [regex[1].split(" ; ") for regex in regex_conf_triples]
    input_path = config["File paths"]["new_metadata"]
    output_path = config["File paths"]["output_folder"]
    files = FileGetter(input_path, "*.xml")
    file_list = files.get_files()
    sum_matches = 0

    for file_path in tqdm(sorted(file_list), desc="Speaker markup status"):

        identified = EntityMarkup(file_path)
        logger.info("Doing simple markup for: " + str(os.path.basename(file_path)))
        logger.info("\nMarkup status for: " + str(os.path.basename(file_path)))
        with open(file_path, 'r') as f:
            xml_as_string = f.read()
        xml_as_bytes = xml_as_string.encode("utf-8")
        is_wellformed = identified.simple_check_xml(xml_as_bytes, file_path,
                                                    False, False)
        if(is_wellformed is False):
            logger.error(("This XML file is not well-formed. Program stopped."
                          " Fix or remove this file and run the program"
                          " again."))
            print("Program has stopped. See logs for more info.")
            break
        identified.read_xml(file_path)
        identified.get_element_text()
        string_for_markup = identified.current_string
        # Start of simple markup
        for regex_conf_triplet in regex_conf_triples:
            regex = regex_conf_triplet[0]
            case = regex_conf_triplet[1]
            speaker = SpeakerMarkup(string_for_markup, regex)
            speaker.compile_regex(regex)
            speaker.identify_speaker()
            speaker.markup_speaker(case)
            string_for_markup = speaker.markuped_string
            sum_matches += speaker.matches_count

        logger.info(str(sum_matches) + " total matches in the protocol.")
        sum_matches = 0
        speaker.simple_check_xml(string_for_markup, file_path, False)
        # Saving the simple marked-up string to XML
        speaker.read_xml(file_path)
        speaker.replace_string(string_for_markup, "sitzungsverlauf")
        speaker.save_to_file(output_path, file_path, "simple_xml", "File paths",
                             "new_simple_markup")

    print("Simple markup finished.")

    config.read("config.ini")
    new_simple_xml_path = config["File paths"]["new_simple_markup"]
    # Start of president replacement
    new_files = FileGetter(new_simple_xml_path, "*.xml")
    new_file_list = new_files.get_files()
    print("Replacing some XML elements in the protocols.")
    for file_path in tqdm(sorted(new_file_list), desc="Files replacement status"):
        logger.info("Replacing some XML elements for: "
                    + str(os.path.basename(file_path)))
        for regex_conf_triplet in regex_conf_triples:
            if(regex_conf_triplet[1] != "first"
               and regex_conf_triplet[1] != "last"):
                regex = regex_conf_triplet[0]
                speaker_rolle_value = regex_conf_triplet[2]
                replacements = XMLProtocol()
                replacements.read_xml(file_path)
                replacements.compile_regex(regex)
                replacements.expand_element(".//rede", "typ",
                                            speaker_rolle_value)
                replacements.save_to_file(output_path, file_path, "simple_xml",
                                          "File paths", "new_simple_markup")
        start_time_attr_value = replacements.xml_tree.get("sitzung-start-uhrzeit")
        replacements.replace_tag_attr(".//sitzungsverlauf/rede[1]",
                                      "sitzungsbeginn",
                                      "sitzung-start-uhrzeit",
                                      start_time_attr_value,
                                      False)
        end_time_attr_value = replacements.xml_tree.get("sitzung-ende-uhrzeit")
        replacements.expand_element(".//sitzungsende", "sitzung-ende-uhrzeit",
                                    end_time_attr_value, False)
        replacements.save_to_file(output_path, file_path, "simple_xml",
                                  "File paths", "new_simple_markup")


if __name__ == '__main__':
    get_speakers()
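The "Regular expressions speakers" section consumed above stores one `" ; "`-separated triple per key: the speaker regex, a position marker (the code special-cases `first` and `last`), and a rolle value that later becomes the `typ` attribute. A hedged sketch with invented patterns (the real regexes live in config.ini):

```python
import configparser

# Hypothetical entries; the patterns below are illustrative only.
demo = """
[Regular expressions speakers]
regex_1 = Präsident(?:in)? .+?: ; first ; präsident
regex_2 = Vizepräsident(?:in)? .+?: ; last ; vizepräsident
"""
config = configparser.ConfigParser()
config.read_string(demo)
triples = [value.split(" ; ") for _, value in
           config.items("Regular expressions speakers")]
print(triples[0])  # ['Präsident(?:in)? .+?:', 'first', 'präsident']
```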
76
bundesdata_markup_nlp/markup/speeches.py
Executable file
@ -0,0 +1,76 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.FileGetter import FileGetter
from markup.EntityMarkup import EntityMarkup
import configparser
from tqdm import tqdm
import logging


def markup_speeches():
    """
    Marks up different entities in the speech strings, for example comments.
    First it marks speech parts (<p>) line by line.
    """
    logger = logging.getLogger(__name__)
    config = configparser.ConfigParser()
    config.read("config.ini")
    complex_xmls = config["File paths"]["complex_markup"]
    output_path = config["File paths"]["output_folder"]
    regex_conf_pairs = config.items("Regular expressions speeches")
    regex_conf_pairs = [regex[1].split(" ; ") for regex in regex_conf_pairs]
    multiline_entities = config.items("Multiline entities")
    multiline_entities = [regex[1].split(" ; ") for regex in multiline_entities]
    files = FileGetter(complex_xmls, "*.xml")
    file_list = files.get_files()
    for file_path in tqdm(sorted(file_list), desc="File status speech markup"):
        entity = EntityMarkup(file_path)
        entity.read_xml(file_path)
        speeches = entity.xml_tree.xpath(".//rede")
        session_start = entity.xml_tree.xpath(".//sitzungsbeginn")[0]
        for speech in speeches:
            entity.markup_speech_lines(speech)
        entity.markup_speech_lines(session_start)

        session_lines = entity.xml_tree.xpath(".//p")
        for line in tqdm(session_lines, desc="Marking single line entities"):
            for pair in regex_conf_pairs:
                entity.inject_element(line, pair[0], pair[1])

        session_lines = entity.xml_tree.xpath(".//p")  # gets new altered session lines (<p>)
        for pair in tqdm(multiline_entities, desc="Marking multiline entities:"):
            entity.get_multiline_entities(session_lines, pair[0], pair[1], pair[2])
        # For logging
        all_entities = 0
        only_single_line_entities = 0
        for pair in regex_conf_pairs:
            element_path = ".//" + pair[1]
            nr_entities = len(entity.xml_tree.xpath(element_path))
            logger.info(("Number of identified " + pair[1] + " elements is: "
                         + str(nr_entities)
                         + " (single line)"))
            all_entities += nr_entities
            only_single_line_entities += nr_entities

        for pair in multiline_entities:
            element_path = ".//" + pair[2]
            nr_entities = len(entity.xml_tree.xpath(element_path))
            logger.info(("Number of identified " + pair[2] + " elements is: "
                         + str(nr_entities)
                         + " (multi line)"))
            all_entities += nr_entities

        logger.info(("Number of all identified single line entities: "
                     + str(only_single_line_entities)))

        logger.info(("Number of all identified entities is: " + str(all_entities)
                     + " Also includes multiline matches. The number can be"
                     + " higher than the true count if the multiline regexes"
                     + " match the same text as the single line entity"
                     + " regexes."))

        entity.save_to_file(output_path, file_path, "clear_speech_markup",
                            "File paths", "clear_speech_markup")


if __name__ == '__main__':
    markup_speeches()
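The two config sections read above follow the same `" ; "` convention: "Regular expressions speeches" holds regex/element-name pairs for `inject_element`, and "Multiline entities" holds start-regex/end-regex/element-name triples for `get_multiline_entities`. A hedged sketch with invented values (the real patterns live in config.ini):

```python
# Hypothetical config values, shown only to illustrate the field layout.
pair = r"\(Beifall.*?\) ; kommentar".split(" ; ")
regex, element_name = pair
# inject_element(line, regex, element_name) wraps each match inside a
# <p> line in a new element, here <kommentar>.

triple = r"\(Zuruf.* ; .*\)$ ; kommentar".split(" ; ")
start_regex, end_regex, element_name = triple
# get_multiline_entities presumably spans from a start match to the next
# end match across consecutive <p> lines and wraps them in one element.
```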
BIN
bundesdata_markup_nlp/nlp/__pycache__/n_grams.cpython-37.pyc
Normal file (binary file not shown)
BIN
bundesdata_markup_nlp/nlp/__pycache__/tokenize.cpython-37.pyc
Normal file (binary file not shown)
84
bundesdata_markup_nlp/nlp/lemmatization.py
Executable file
@ -0,0 +1,84 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import de_core_news_sm
import configparser
from utility.XMLProtocol import XMLProtocol
from lxml import etree
from tqdm import tqdm
import re


def lemmatization(files, no_stop_words=False):
    """
    Lemmatizes the speeches of the input XML protocols with the built-in spacy
    lookup-table function. Can include or exclude stop words.
    The lemmatized text will be written into a new element named
    <rede_lemmatisiert>. Always removes punctuation. Joins hyphenated strings
    before they are lemmatized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Lemmatization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if(part.text is not None):
                    tmp_list.append(re.sub(r"_", " ", str(part.text + "\n")))
                    """
                    Replaces "_" with " ". This is needed because a string
                    like "Treffsicherheit einer Schrotflinte;_Sie haben
                    nämlich kaum den Punkt getroffen" will not be lemmatized
                    correctly by spacy: "Schrotflinte;_Sie" would be
                    recognized as one token, which also messes up the sorted
                    ngram calculation. Also adds \n at the end of every line
                    to help identify hyphenated words.
                    """
                part.getparent().remove(part)
            new_text = "".join(tmp_list)
            new_text = re.sub(r"(?P<wordend>[a-zßäüö])(?P<replace>\-\n)(?P<wordstart>[a-zßäüö])",
                              r"\g<wordend>\g<wordstart>", new_text)
            """
            Joins hyphenated words back together:
            'Länderfinanz- ausgleich' --> 'Länderfinanzausgleich'.
            It is better to do this here because most of the comments and
            metadata have already been marked. Ignores strings like
            'Finanz-, Handels- und Sicherheitspolitik', but not when they
            happen at a line break; this is a rare occasion though.
            """
            new_text = re.sub(r"(?P<wordend>[a-zßäüö])(?P<replace>\-\n)(?P<wordstart>[A-ZÄÜÖ])",
                              r"\g<wordend>-\g<wordstart>", new_text)
            """
            Removes all line breaks again. This way compound names with a
            line break in between, like "Sütterlin-\nWaack", will be
            recognized as one string by spacy. --> Sütterlin-Waack
            """
            lemmatized_speech = etree.Element("rede_lemmatisiert")
            doc = nlp(new_text)
            if(no_stop_words is False):
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.pos_ != "PUNCT"
                                       and token.text != "_"])
                """
                Removes "_" from the text. It has to be removed because it
                is some kind of special character in spacy.
                """
                filename_sufix = "_lemmatized_with_stopwords.xml"
            elif(no_stop_words is True):
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.is_stop is False
                                       and token.pos_ != "PUNCT"
                                       and token.text != "_"])
                filename_sufix = "_lemmatized_without_stopwords.xml"
            lemmatized_speech.text = lemmatized
            speech.append(lemmatized_speech)
        xml.save_to_file(output_path, file_path, "lemmatized", "File paths",
                         "nlp_lemmatized_tokenized", filename_sufix)


if __name__ == '__main__':
    lemmatization()
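A standalone sketch of the two de-hyphenation substitutions documented above, runnable on its own:

```python
import re

text = "Länderfinanz-\nausgleich und Sütterlin-\nWaack"
# lowercase letter before and after the hyphen: drop hyphen and line break
text = re.sub(r"(?P<wordend>[a-zßäüö])\-\n(?P<wordstart>[a-zßäüö])",
              r"\g<wordend>\g<wordstart>", text)
# uppercase letter after the hyphen: keep the hyphen, drop the line break
text = re.sub(r"(?P<wordend>[a-zßäüö])\-\n(?P<wordstart>[A-ZÄÜÖ])",
              r"\g<wordend>-\g<wordstart>", text)
print(text)  # Länderfinanzausgleich und Sütterlin-Waack
```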
142
bundesdata_markup_nlp/nlp/n_grams.py
Executable file
@ -0,0 +1,142 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import configparser
import csv
import os
import gc
from utility.XMLProtocol import XMLProtocol
from collections import Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from itertools import groupby, chain
from operator import itemgetter
import locale
locale.setlocale(locale.LC_COLLATE, "C")  # Sets locale to portable "C" locale.


def n_grams(files, group_by_feature="year",
            input_type_name="lemmatized_without_stopwords"):
    """
    Calculates 1- to 5-grams for the given input protocols. Can handle either
    lemmatized or non-lemmatized files. Writes the ngrams to a tab separated
    csv file. One row includes the ngram, its match count, and the year, date,
    rede_id or redner_id. One file per unigram, bigram, trigram etc. per
    group key will be created. (There will be one file for unigrams starting
    with the letter 'A', one for unigrams starting with 'B', etc.)
    The third parameter is a string set by the user which will be added to
    the file names to help distinguish lemmatized and non-lemmatized ngrams.
    The more protocols are used as input, the more RAM the script needs.
    For all 4106 protocols, 32GB of RAM with a 32GB swap file was used!
    """
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    output_path = os.path.join(output_path, "n-grams")
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    for step in tqdm(range(1, 6), desc="Current ngram calculating"):
        N_GRAMS = []
        file_name_prefix = str(step) + "_grams"
        counter_vectorizer = CountVectorizer(ngram_range=(step, step),
                                             lowercase=False)
        for file_path in tqdm(sorted(files), desc="File status"):
            xml = XMLProtocol()
            xml.read_xml(file_path)
            feature_year = xml.xml_tree.xpath("@sitzung-datum")[0][-4:]
            feature_mont_year = xml.xml_tree.xpath("@sitzung-datum")[0][-7:]
            speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
            for speech in speeches:
                # gets id of current speech
                feature_rede_id = speech.xpath("@id")
                if(len(feature_rede_id) == 0):
                    feature_rede_id = "sitzungsbeginn"
                else:
                    feature_rede_id = feature_rede_id[0]
                # gets id of current speaker
                feature_redner_id = speech.xpath(".//redner/@id")[0]
                # gets speech text from tokenized or lemmatized protocol
                speech_text = speech.xpath("node()[2]")[0]  # gets second child of speech
                if(speech_text.text is not None):
                    tmp_str = speech_text.text

                    ngrams = counter_vectorizer.build_analyzer()
                    ngrams_list = ngrams(tmp_str)

                    if(group_by_feature == "year"):
                        pairs = [(pair,) + (feature_year,) for pair
                                 in ngrams_list]
                    elif(group_by_feature == "month_year"):
                        pairs = [(pair,) + (feature_mont_year,) for pair
                                 in ngrams_list]
                    elif(group_by_feature == "speaker"):
                        pairs = [(pair,) + (feature_redner_id,) for pair
                                 in ngrams_list]
                    elif(group_by_feature == "speech"):
                        pairs = [(pair,) + (feature_rede_id,) for pair
                                 in ngrams_list]
                    N_GRAMS.extend(pairs)
        speeches = None
        # puts the uppercased first letter of each ngram at the first position
        # in the line to sort by it; it will be deleted later on
        print("Start counting ngrams.")
        N_GRAMS = Counter(N_GRAMS)
        print("Finished counting ngrams.")
        print("Start sorting ngrams")
        N_GRAMS = [item[0][0][0].upper()
                   + "||"
                   + item[0][0]
                   + "||"
                   + str(item[0][1])
                   + "||"
                   + str(item[1])
                   for item in N_GRAMS.items()]
        N_GRAMS = sorted(N_GRAMS, key=locale.strxfrm)
        print("Finished sorting ngrams")
        # sorts all ngrams into groups: one group for each German uppercase
        # letter except ß and one group for every decimal from 0 to 9.
        # Other non-ASCII or non-decimal ngrams are sorted into groups of
        # their own; these groups are joined into one non-ASCII group later on.
        alphabetically = []
        tmp_list = []
        for letter, entries in tqdm(groupby(N_GRAMS, key=itemgetter(0)),
                                    desc="Grouping ngrams alphabetically"):
            if(letter):
                print(letter)
                for entry in entries:
                    tmp_list.append(entry)
                alphabetically.append(tmp_list)
                tmp_list = []
        N_GRAMS = None
        gc.collect()  # frees RAM
        key_list = ([i for i in range(10)]
                    + "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split()
                    + ["_Non_ASCII"])
        # groups all non-ASCII ngrams into one list to save them into one csv
        if(len(alphabetically) > 37):
            joined_tail = alphabetically[36:]
            joined_tail = chain.from_iterable(list(joined_tail))
            del alphabetically[36:]
            alphabetically.append(joined_tail)
        # save groups to individual files
        for group, key in tqdm(zip(alphabetically, key_list),
                               desc="Writing ngrams to files"):
            group_ngrams = [entry.split("||")[1:] for entry in group]
            file_name = (str(key)
                         + "_"
                         + file_name_prefix
                         + "_per_"
                         + group_by_feature
                         + "_"
                         + input_type_name
                         + ".csv")
            file_output_path = os.path.join(output_path, file_name)
            with open(file_output_path, "w", newline="", encoding="utf8") as file:
                writer = csv.writer(file, delimiter="\t")
                writer.writerows(group_ngrams)
        alphabetically = None


if __name__ == '__main__':
    n_grams()
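A minimal, self-contained demo of the counting and sort-key scheme above: the ngram's uppercased first character is prepended only so rows can be grouped alphabetically, and it is stripped again via `split("||")[1:]` before writing.

```python
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

analyzer = CountVectorizer(ngram_range=(2, 2), lowercase=False).build_analyzer()
# "1998" plays the role of the grouping feature (here: the year).
counts = Counter((ngram, "1998") for ngram in analyzer("der Bundestag der Bundestag"))
rows = [key[0][0].upper() + "||" + key[0] + "||" + key[1] + "||" + str(count)
        for key, count in counts.items()]
# e.g. ['D||der Bundestag||1998||2', 'B||Bundestag der||1998||1']
```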
78
bundesdata_markup_nlp/nlp/tokenize.py
Executable file
@ -0,0 +1,78 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import de_core_news_sm
import configparser
from utility.XMLProtocol import XMLProtocol
from lxml import etree
from tqdm import tqdm
import re


def tokenize(files, no_stop_words=False):
    """
    Tokenizes the speeches of the input XML protocols. Can include or exclude
    stop words. Tokenized speeches will be written into a new element
    <rede_tokenisiert>. Always removes punctuation. Joins hyphenated strings
    before they are tokenized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Tokenization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if(part.text is not None):
                    tmp_list.append(re.sub(r"_", " ", str(part.text + "\n")))
                    """
                    Replaces "_" with " ". This is needed because a string
                    like "Treffsicherheit einer Schrotflinte;_Sie haben
                    nämlich kaum den Punkt getroffen" will not be tokenized
                    correctly by spacy: "Schrotflinte;_Sie" would be
                    recognized as one token, which also messes up the sorted
                    ngram calculation. Also adds \n at the end of every line
                    to help identify hyphenated words.
                    """
                part.getparent().remove(part)
            new_text = "".join(tmp_list)
            new_text = re.sub(r"(?P<wordend>[a-zßäüö])(?P<replace>\-\n)(?P<wordstart>[a-zßäüö])",
                              r"\g<wordend>\g<wordstart>", new_text)
            """
            Joins hyphenated words back together:
            'Länderfinanz- ausgleich' --> 'Länderfinanzausgleich'.
            It is better to do this here because most of the comments and
            metadata have already been marked. Ignores strings like
            'Finanz-, Handels- und Sicherheitspolitik', but not when they
            happen at a line break; this is a rare occasion though.
            """
            new_text = re.sub(r"(?P<wordend>[a-zßäüö])(?P<replace>\-\n)(?P<wordstart>[A-ZÄÜÖ])",
                              r"\g<wordend>-\g<wordstart>", new_text)
            """
            Removes all line breaks again. This way compound names with a
            line break in between, like "Sütterlin-\nWaack", will be
            recognized as one string by spacy. --> Sütterlin-Waack
            """
            tokenized_speech = etree.Element("rede_tokenisiert")
            doc = nlp(new_text)
            if(no_stop_words is False):
                tokenized = " ".join([token.text for token in doc
                                      if token.pos_ != "PUNCT"])
                filename_sufix = "_tokenized_with_stopwords.xml"
            elif(no_stop_words is True):
                tokenized = " ".join([token.text for token in doc
                                      if token.is_stop is False
                                      and token.pos_ != "PUNCT"])
                filename_sufix = "_tokenized_without_stopwords.xml"
            tokenized_speech.text = tokenized
            speech.append(tokenized_speech)
        xml.save_to_file(output_path, file_path, "tokenized", "File paths",
                         "nlp_lemmatized_tokenized", filename_sufix)


if __name__ == '__main__':
    tokenize()
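`tokenize()` and `lemmatization()` are meant to be driven with a file list. A hedged usage sketch, assuming the modules are importable as `nlp.tokenize` / `nlp.lemmatization` from the `bundesdata_markup_nlp` folder and that the input path below exists:

```python
from utility.FileGetter import FileGetter
from nlp.tokenize import tokenize
from nlp.lemmatization import lemmatization

# Illustrative input folder; in the pipeline this comes from config.ini.
files = FileGetter("output/clear_speech_markup", "*.xml").get_files()
tokenize(files, no_stop_words=True)       # adds <rede_tokenisiert> elements
lemmatization(files, no_stop_words=True)  # adds <rede_lemmatisiert> elements
```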
0
bundesdata_markup_nlp/samples/__init__.py
Executable file
95
bundesdata_markup_nlp/samples/create_samples.py
Executable file
@ -0,0 +1,95 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import fnmatch
import argparse
import random
import shutil

"""
This is just a quick script to get randomized samples from the protocols.
"""


def parse_arguments():
    """Argument Parser"""
    parser = argparse.ArgumentParser(description="Creates samples from a \
                                     given directory with a given size. \
                                     The samples do not overlap.")
    parser.add_argument("-p",
                        "--path",
                        help="Path to data files to create sample from.",
                        required=True,
                        type=str,
                        metavar="")
    parser.add_argument("-s",
                        "--size",
                        help="Size of sample.",
                        required=True,
                        type=int,
                        metavar="")
    parser.add_argument("-n", "--number_of_samples",
                        help="How many samples should be created?",
                        required=True,
                        type=int,
                        metavar="")
    parser.add_argument("-t",
                        "--file_type",
                        help="What file types should be used as the base for \
                              the sample? Accepts wildcards.",
                        required=True,
                        type=str)
    args = parser.parse_args()
    return args


def get_files(path, file_type):
    """Creates a file list with full paths of all files in the given directory
    and its subdirectories and returns it."""
    list_of_files = []
    for path, subdirs, files in os.walk(path):
        for name in files:
            if fnmatch.fnmatch(name, file_type):
                list_of_files.append(os.path.join(path, name))
    return list_of_files


def get_files_to_copy(list_of_files, sample_size):
    """Gets random file paths from all file paths to create a sample out of
    those. File paths that have already been used are removed from the file
    list to create independent samples."""
    counter = 0
    sample_list = []
    while counter < sample_size:
        counter += 1
        random_index = random.randint(0, len(list_of_files)-1)
        sample_list.append(list_of_files[random_index])
        del list_of_files[random_index]
    return list_of_files, sample_list


def copy_files(path, sample_list, step_int):
    """Copies the given files to new directories."""
    sample_path = os.path.join(path, str(step_int))
    print(sample_path)
    os.mkdir(sample_path)
    for file in sample_list:
        shutil.copy2(file, sample_path)


def main():
    args = parse_arguments()
    path = args.path
    file_list = get_files(path, args.file_type)
    for step in range(1, args.number_of_samples + 1):
        # get_files_to_copy mutates file_list in place, so it is called once
        # per step and both return values are reused
        file_list, sample_list = get_files_to_copy(file_list, args.size)
        copy_files(path, sample_list, step)


if __name__ == '__main__':
    main()
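A hypothetical invocation (paths invented for illustration): `python samples/create_samples.py -p ../data/protocols -s 20 -n 2 -t "*.xml"` would create two non-overlapping 20-file samples in the subfolders `1` and `2` of the input path.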
35
bundesdata_markup_nlp/utility/FileGetter.py
Executable file
@ -0,0 +1,35 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import fnmatch

"""
This class is for getting the file paths of all files in a given directory.
Also gets files in subdirectories.
"""


class FileGetter(object):
    """
    Class for getting the file paths under a given path, which will be opened
    and/or further processed later on.
    """

    def __init__(self, path, file_type):
        super(FileGetter, self).__init__()
        self.path = path
        self.file_type = file_type

    def get_files(self):
        """
        Creates a file list with full paths of all files in the given
        directory and its subdirectories and returns it.
        """
        list_of_files = []
        for path, subdirs, files in os.walk(self.path):
            for name in files:
                if fnmatch.fnmatch(name, self.file_type):
                    list_of_files.append(os.path.join(path, name))
        self.list_of_files = list_of_files
        return list_of_files
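A minimal usage sketch (the path is illustrative):

```python
from utility.FileGetter import FileGetter

getter = FileGetter("/path/to/protocols", "*.xml")
xml_files = getter.get_files()  # full paths, including subdirectories
```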
209
bundesdata_markup_nlp/utility/XMLProtocol.py
Executable file
@ -0,0 +1,209 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility import delete_folder
from utility import update_config
from xml.etree import ElementTree
from os import path
from lxml import etree
import os
import logging
import re


class XMLProtocol(object):
    """Class for standard operations on/with the XML protocols. Has functions
    for reading, saving and manipulating an XML protocol. All other classes
    inherit from this one.
    """

    def __init__(self):
        super().__init__()
        self.logger = logging.getLogger(__name__)

    def read_protcol(self, file_path):
        """
        Takes a file path and parses the file as XML; returns the root
        element.
        """
        self.file_path = file_path
        self.filename = os.path.basename(self.file_path)
        parser = etree.XMLParser(remove_blank_text=True)
        self.tree = etree.parse(file_path, parser)  # for better xml indentation
        root = self.tree.getroot()
        self.logger.info("File successfully parsed as XML.")
        return root

    def read_xml(self, file_path):
        """Takes a file path and parses the file as XML."""
        parser = etree.XMLParser(encoding='utf-8', remove_blank_text=True)
        tree = etree.parse(file_path, parser)  # for better xml indentation
        self.xml_tree = tree.getroot()

    def save_to_file(self, output_path, file_path, subfolder, config_section,
                     config_key, filename_sufix=""):
        """
        Writes the new markup to a new XML file. Takes the output path and
        creates a new folder there. Also updates the config file with the new
        path.
        """
        if(filename_sufix == ""):
            self.filename = path.basename(file_path)
        elif(filename_sufix != ""):
            self.filename = path.basename(file_path)[:-4] + filename_sufix
        save_path = os.path.join(output_path, subfolder)
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        tree = etree.ElementTree(self.xml_tree)
        new_filename = self.filename
        save_file_path = os.path.join(save_path, new_filename)
        tree.write(save_file_path,
                   pretty_print=True,
                   xml_declaration=True,
                   encoding="utf8",
                   doctype="<!DOCTYPE dbtplenarprotokoll SYSTEM 'dbtplenarprotokoll_minimal.dtd'>")
        self.logger.info("New XML saved to: " + save_file_path)
        update_config.update_config("config.ini", config_section, config_key,
                                    save_path)

    def beautify_xml_part(self, file_path, xpath, alter_lines=False,
                          line_width=80):
        """
        Beautifies part (an element node) of an input XML.
        """
        tmp_path = os.path.join(os.path.dirname(file_path), "tmp")
        tree = etree.ElementTree(self.xml_tree)
        self.beautified_part = tree.find(xpath)
        self.beautified_part = ElementTree.tostring(self.beautified_part)
        self.beautified_part = etree.fromstring(self.beautified_part)
        self.beautified_part = etree.ElementTree(self.beautified_part)
        if not os.path.exists(tmp_path):
            os.mkdir(tmp_path)
        tmp_file_path = os.path.join(tmp_path, "tmp.xml")
        self.beautified_part.write(tmp_file_path,
                                   pretty_print=True,
                                   xml_declaration=True,
                                   encoding="utf8")
        if(alter_lines is True):
            os.system("html-beautify -r -q -w {} --no-preserve-newlines {}".format(line_width, tmp_file_path))
            self.beautified_part = etree.parse(tmp_file_path).getroot()
        elif(alter_lines is False):
            os.system("html-beautify -r -q {}".format(tmp_file_path))
            self.beautified_part = etree.parse(tmp_file_path).getroot()
        update_config.update_config("config.ini", "File paths", "tmp_path",
                                    tmp_path)
        delete_folder.delete_folder(tmp_path)

    def beautify_xml(self, file_path, alter_lines=False, line_width=80):
        if(alter_lines is True):
            os.system("html-beautify -r -q -w {} --no-preserve-newlines {}".format(line_width, file_path))
        elif(alter_lines is False):
            os.system("html-beautify -r -q {}".format(file_path))

    def expand_element(self, element_to_expand, expand_attr_key,
                       expand_attr_value, check_child=True):
        """
        This function takes an XPath expression for an XML element. The tag
        of this element will be expanded with the given expand_attr_key and
        expand_attr_value. Also needs a regex to determine whether the
        currently selected element is one that should be expanded. For this,
        the text of the first child of the current element is checked against
        the given regex. By default the child element text of the current
        element is checked for whether the regex matches the string or not.
        Set check_child to False to avoid this and just expand the current
        element.
        """
        elements = self.xml_tree.findall(element_to_expand)
        for element in elements:
            if(check_child is True):
                first_child = element.getchildren()[0]
                match = self.regex_compiled.search(first_child.text)
                if(match):
                    element.set(expand_attr_key, expand_attr_value)
            else:
                element.set(expand_attr_key, expand_attr_value)

    def replace_tag_name(self, element_to_replace, tag_name, check_child=True):
        """
        Replaces the tag name of a given element (as XPath) with a new tag
        name.
        """
        elements = self.xml_tree.findall(element_to_replace)
        for element in elements:
            if(check_child is True):
                first_child = element.getchildren()[0]
                match = self.regex_compiled.search(first_child.text)
                if(match):
                    element.tag = tag_name
            else:
                element.tag = tag_name

    def replace_tag_attr(self, element_to_replace, tag_name, attr_key,
                         attr_value, check_child=True):
        """
        Replaces the tag name of a given element (as XPath) with a new name
        and adds an attribute. Can also check whether the child of the current
        element contains some specific text, like in the expand_element
        function.
        """
        elements = self.xml_tree.findall(element_to_replace)
        for element in elements:
            if(check_child is True):
                first_child = element.getchildren()[0]
                match = self.regex_compiled.search(first_child.text)
                if(match):
                    element.tag = tag_name
                    element.set(attr_key, attr_value)
            else:
                element.tag = tag_name
                element.set(attr_key, attr_value)

    def replace_elements(self, elements_to_replace, replacment_elements,
                         keep_parent_text=False):
        """
        Replaces elements identified by XPath with new elements. Can either
        keep the text of the parent element or not.
        """
        elements = self.xml_tree.findall(elements_to_replace)
        parents_text_xpath = elements_to_replace + "/" + "parent::node()" + "/" + "text()"
        elements_text = self.xml_tree.xpath(parents_text_xpath)
        if(len(elements) == len(replacment_elements)):
            if(keep_parent_text is False):
                for element, replacement_element in zip(elements, replacment_elements):
                    element.getparent().replace(element, replacement_element)
            else:
                for element, replacement_element in zip(elements, replacment_elements):
                    element.getparent().replace(element, replacement_element)
                elements = self.xml_tree.findall(elements_to_replace)
                for element, text in zip(elements, elements_text):
                    element.tail = text
        else:
            self.logger.warning(("Elements mismatch. There are "
                                 + str(len(elements))
                                 + " elements that should be replaced."
                                 + " There are " + str(len(replacment_elements))
                                 + " present."
                                 + " No elements have been replaced."))

    def compile_regex(self, regex):
        """
        Takes the input regex string and compiles it for better performance
        and readability.
        """
        self.regex_string = regex
        self.regex_compiled = re.compile(self.regex_string, re.MULTILINE)

    def clean_text(self, regex, xpath, replacement_string=""):
        """
        Replaces regex matches with nothing by default, or with the
        replacement string, for an element matched by the XPath in the
        xml_tree. Works with match groups.
        """
        elements = self.xml_tree.xpath(xpath)
        for element in elements:
            replaced = re.sub(regex, replacement_string, element.text)
            element.text = replaced
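A hedged sketch of how the class methods above compose, assuming a config.ini with a "File paths" section and an existing protocol file (file name and regex are illustrative):

```python
from utility.XMLProtocol import XMLProtocol

xml = XMLProtocol()
xml.read_xml("protocol.xml")          # illustrative file name
xml.compile_regex(r"Präsident")       # consumed by the check_child logic
xml.expand_element(".//rede", "typ", "präsident", check_child=True)
xml.save_to_file("output", "protocol.xml", "simple_xml",
                 "File paths", "new_simple_markup")
```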
0
bundesdata_markup_nlp/utility/__init__.py
Executable file
Binary files not shown.
15
bundesdata_markup_nlp/utility/delete_folder.py
Executable file
@ -0,0 +1,15 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import shutil


def delete_folder(folder_path):
    """
    Deletes the folder identified by the input folder path string.
    """
    shutil.rmtree(folder_path)


if __name__ == '__main__':
    delete_folder()
22
bundesdata_markup_nlp/utility/move_ngrams.py
Executable file
@ -0,0 +1,22 @@
import os

"""
Helper script to move n-gram csvs to separate folders. Just copy this into the
folder containing the n-grams and execute it. Change n to the number of N in
N-grams.
"""
current_path = os.getcwd()
files = []
n = 5
for file in os.listdir(current_path):
    if file.endswith(".csv"):
        files.append(file)
files = sorted(files)

dir_list = ["1_grams", "2_grams", "3_grams", "4_grams", "5_grams"][:n]
for dir in dir_list:
    os.system("mkdir {}".format(dir))

# The sorted file names interleave the n-gram sizes per letter key
# (0_1_grams..., 0_2_grams..., ...), so every n-th file has the same size.
for step, dir in zip(range(0, n), dir_list):
    for file in files[step::n]:
        print(file)
        os.system("mv {} {}".format(file, dir))
21
bundesdata_markup_nlp/utility/update_config.py
Executable file
@ -0,0 +1,21 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import configparser


def update_config(file_name, section, key, value):
    """
    Updates the config file identified by file_name: sets the value of one
    key-value pair in a specific section.
    """
    config = configparser.ConfigParser()
    config.read(file_name)
    file = open(file_name, "w")
    config.set(section, key, value)
    config.write(file)
    file.close()


if __name__ == '__main__':
    update_config()
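For example, recording a newly created output folder (assuming config.ini already contains a "File paths" section; the value is illustrative):

```python
from utility.update_config import update_config

update_config("config.ini", "File paths", "complex_markup",
              "output/complex_markup")
```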
BIN
docs/BT-PP_DTD_kommentiert_20150519.pdf
Executable file (binary file not shown)
5
docs/metadaten.md
Executable file
@ -0,0 +1,5 @@
# Metadata

Source of the structure definition: https://www.bundestag.de/blob/577234/f9159cee3e045cbc37dcd6de6322fcdd/dbtplenarprotokoll_kommentiert-data.pdf
Downloaded on: 06.11.2018
7
requirements.txt
Executable file
@ -0,0 +1,7 @@
# Bundesdata
lxml==4.2.5
Babel==2.6.0
tqdm==4.28.1
spacy==2.0.18
https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.0.0/de_core_news_sm-2.0.0.tar.gz
scikit-learn[alldeps]==0.20.2