Initial commit

2019-02-21 19:29:44 +01:00
commit 4263e5f41e
52 changed files with 3024 additions and 0 deletions
@@ -0,0 +1,2 @@
+data/*
+.idea/*
@@ -0,0 +1,72 @@
+# Master_thesis
+Master Thesis Repository.
+
+## Benötigte Pakete und Sprachen
+
+- Python 3.7+
+- Python-Pakete werden mittels requirements.txt installiert. Siehe den letzten Schritt unter Installation.
+
+## Installation
+1. Stellen Sie sicher, dass das Paket `python3.7-dev` installiert ist. Wenn nicht: `sudo apt-get install python3.7-dev`
+2. Installieren Sie _virtualenv_ mittels `pip install virtualenv` oder über den Paketmanager der eigenen Distribution.
+3. Installieren Sie JS Beautifier systemweit mittels `sudo npm -g install js-beautify`. (Optional! Wenn nicht gewünscht, kann dieser Installationsschritt übersprungen werden. Der Schritt der Auszeichnung, der dieses Paket benötigt, kann ebenfalls übersprungen werden. Allerdings gibt es dann keine schön formatierten XML-Dateien.)
+4. Erstellen Sie eine virtuelle Umgebung für das Projekt mittels `virtualenv --python=python3.7 path/to/folder`
+5. Aktivieren Sie die virtuelle Umgebung mittels `source path/to/folder/bin/activate`
+6. Wechseln Sie in das Verzeichnis des Repositorys: `cd verzeichnis/des/repository`
+7. Installieren Sie die Abhängigkeiten mit `pip install -r requirements.txt`.
+
+## Scriptaufrufe Beispiele:
+
+### @Home
+- `source ~/VirtualEnvs/bundesdata/bin/activate`
+- `cd ~/Documents/Eigene\ geschriebene\ Programme/master_thesis/bundesdata/`
+
+#### Development Data
+
+**Metadata**
+- `python markup/metastructure.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data/development_data_xml -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data`
+
+**Speakers**
+- `python markup/speakers.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data/xml_new_metadata_structure -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data`
+
+#### Full data
+
+**Metadata**
+- `python markup/metastructure.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/protocols_raw_xml -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data`
+
+**Speakers**
+- `python markup/speakers.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/xml_new_metadata_structure -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data`
+
+### @Uni
+
+
+#### Development Data
+- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
+- `cd /home/stephan/Repos/master_thesis/bundesdata`
+
+**Speakers**
+- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/working_data/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data`
+
+**Metadata**
+- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/working_data/development_data_xml -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data`
+
+
+#### Test Data
+- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
+- `cd /home/stephan/Repos/master_thesis/bundesdata`
+
+**Speakers**
+- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/working_data/test/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data/test`
+
+**Metadata**
+- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/working_data/test_data_xml -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data/test`
+
+#### Full data
+- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
+- `cd /home/stephan/Repos/master_thesis/bundesdata`
+
+**Speakers**
+- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data`
+
+**Metadata**
+- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/protocols_raw_xml -f *.xml -o /home/stephan/Repos/master_thesis/data`
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from markup import metadata, speakers, speaker_names, speeches
+from utility import update_config
+from markup import beautify_markup
+from utility import delete_folder
+import argparse
+import time
+import configparser
+from datetime import datetime
+import logging
+import os
+
+"""
+This is the main script handling the automatic markup of the protocols. Needs
+some user input as specified in parse_arguments().
+"""
+
+
+def parse_arguments():
+    """
+    Builds the command line interface of the markup script and parses the
+    arguments given by the user.
+
+    The option -sp/--set_paths takes two string values (input_path and
+    output_path). Every other option (-sm, -ss, -sn, -ssp, -sb, -kt, -fr, -la)
+    is a boolean flag defined with action="store_true".
+
+    Returns the namespace object produced by parser.parse_args().
+    """
+    parser = argparse.ArgumentParser(description="Starts the markup process of \
+                                     the XML protocols. Uses either the input  \
+                                     and output paths currently specified in   \
+                                     the config file or the paths set when     \
+                                     calling the script from the terminal with \
+                                     the flag argument '-sp' or '--set_paths'. \
+                                     Using this parameter writes the given     \
+                                     paths into the config file.          \
+                                     Some steps of the markup process can be   \
+                                     skipped if they already have been executed\
+                                     once while useing the -kt option          \
+                                     by using the corresponding parameters.    \
+                                     ")
+    parser.add_argument("-sp",
+                        "--set_paths",
+                        nargs=2,
+                        help="User can set the input and output paths for the  \
+                        files created during the markup. The paths will be     \
+                        written to the config file.",
+                        required=False,
+                        type=str,
+                        metavar=("input_path", "output_path"))
+    parser.add_argument("-sm",
+                        "--skip_metadata",
+                        help="Skips the script creating metadata and first     \
+                        xml strucutre.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-ss",
+                        "--skip_simple_speakers",
+                        help="Skips the script creating the first simple       \
+                        speaker markup.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-sn",
+                        "--skip_name_markup",
+                        help="Skips the script creating the name markup.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-ssp",
+                        "--skip_speeches",
+                        help="Skips the script creating markup inside of       \
+                        speeches.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-sb",
+                        "--skip_beautify_xml",
+                        help="Skips the script creating beautiful xml files.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-kt",
+                        "--keep_tmp_files",
+                        help="Keeps all temporary xml files beeing created     \
+                        during the entire markup process. Using this flag is   \
+                        needed when skipping steps of the entire markup during \
+                        a rerun of the script.                                 \
+                        If this is not set temporary files will always be      \
+                        deleted.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-fr",
+                        "--fresh_run",
+                        help="Deltes all temporary folders in output folder    \
+                        also deletes all paths saved in the config file file   \
+                        before starting the markup process. The user has to set\
+                        the paths again with -sp.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-la",
+                        "--log_all",
+                        help="If set the programm will log all information     \
+                        about the markup process (statistics etc.). Otherwise  \
+                        it only logs errors and warnings.",
+                        action="store_true",
+                        required=False)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    """
+    Main function calling all other scripts for the automatic markup of the
+    protocols.
+    """
+    args = parse_arguments()
+    if(args.log_all is True):
+        level = logging.INFO
+    elif(args.log_all is False):
+        level = logging.WARNING
+    logging.basicConfig(filename="logs/bundesdata.log", level=level,
+                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
+                        datefmt='%Y/%m/%d %H:%M:%S',
+                        filemode="w")
+    logger = logging.getLogger(__name__)
+    start_time = datetime.now()
+    print("Start time of script is:", start_time)
+    print("Info and status about the markup process can be found in:",
+          "logs/bundesdata.log")
+    logger.info("Start time of script is: " + str(start_time))
+
+    # Deletes output folder and all folders inside that.
+    # Also removes all path options from the section "File paths"
+    if(args.fresh_run is True):
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+        options = config.items("File paths")
+        for option in options:
+            if(option[0] == "output_folder"):
+                try:
+                    delete_folder.delete_folder(option[1])
+                except FileNotFoundError:
+                    pass
+            else:
+                config.remove_option("File paths", option[0])
+        with open("config.ini", 'w') as out:
+            config.write(out)
+
+    # sets paths and creates output folder
+    if(args.set_paths):
+        input_path = args.set_paths[0]
+        output_path = os.path.join(args.set_paths[1], "output")
+        if not os.path.exists(output_path):
+            os.mkdir(output_path)
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+        update_config.update_config("config.ini", "File paths",
+                                    "input_folder_xmls", input_path)
+        update_config.update_config("config.ini", "File paths",
+                                    "output_folder", output_path)
+
+    if(args.skip_metadata is not True):
+        print("Starting metadata extraction and markup.")
+        metadata.get_metadata()
+        print("Metadata creation and content splits finished.")
+    elif(args.skip_metadata is True):
+        print("Skipping script metadata.py.")
+
+    time.sleep(1)
+    if(args.skip_simple_speakers is not True):
+        print("Starting first simple speeches and speaker markup.")
+        speakers.get_speakers()
+        print(("Finished simple markup."))
+    elif(args.skip_simple_speakers is True):
+        print("Skipping script speakers.py.")
+
+    time.sleep(1)
+    if(args.skip_name_markup is not True):
+        print("Starting complex markup of speaker names.")
+        speaker_names.get_names()
+        print("Finished complex name markup. (names etc.)")
+    elif(args.skip_name_markup is True):
+        print("Skipping script speaker_names.py.")
+
+    time.sleep(1)
+    if(args.skip_speeches is not True):
+        print("Starting markup of comments etc. in speeches.")
+        speeches.markup_speeches()
+        print("Finished markup of comments etc. in speeches.")
+    elif(args.skip_speeches is True):
+        print("Skipping script speeches.py.")
+
+    time.sleep(1)
+    if(args.skip_beautify_xml is not True):
+        print("Starting to prettyfie the xmls.")
+        beautify_markup.beautify_xml("markup")
+        print("Prettyfied the xmls.")
+    elif(args.skip_beautify_xml is True):
+        print("Skipping script beautify_markup.py.")
+
+    if(args.keep_tmp_files is not True):
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+        folder_paths = []
+        folder_paths.append(config["File paths"]["new_metadata"])
+        folder_paths.append(config["File paths"]["new_simple_markup"])
+        folder_paths.append(config["File paths"]["complex_markup"])
+        folder_paths.append(config["File paths"]["clear_speech_markup"])
+        for folder_path in folder_paths:
+            delete_folder.delete_folder(folder_path)
+
+    end_time = datetime.now()
+    print("End time of script is:", str(end_time))
+    logger.info("End time of script is: " + str(end_time))
+    duration = end_time - start_time
+    print("Duration of script is:", duration)
+    logger.info("Script duration is: " + str(duration))
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import configparser
+import os
+import logging
+from utility.FileGetter import FileGetter
+from utility import update_config
+from utility import delete_folder
+from markup import beautify_markup
+from nlp import tokenize, lemmatization, n_grams
+from datetime import datetime
+
+"""
+This script handles the tokenization, lemmatization and n-gram calculation of
+the input protocols. Needs some user input as specified in parse_arguments().
+"""
+
+def parse_arguments():
+    """
+    Builds the command line interface of the nlp script and parses the
+    arguments given by the user.
+
+    The option -sp/--set_paths takes two string values (input_path and
+    output_path). The options -fr, -sb and -ns are boolean flags defined with
+    action="store_true". The options -lm, -tn and -cn belong to one mutually
+    exclusive group; -cn/--calculate_n_grams takes two string values
+    (feature_to_group_n_grams_by and input_type_name).
+
+    Returns the namespace object produced by parser.parse_args().
+    """
+    parser = argparse.ArgumentParser(description="Starts the nlp analysis of   \
+                                     the newly created XML-protocols")
+    parser.add_argument("-sp",
+                        "--set_paths",
+                        nargs=2,
+                        help="User can set the input and output paths for the  \
+                        files created during the nlp process. The paths will be\
+                        written to the config file.",
+                        required=False,
+                        type=str,
+                        metavar=("input_path", "output_path"))
+    parser.add_argument("-fr",
+                        "--fresh_run",
+                        help="Deltes all temporary folders and output folders  \
+                        created during a previously nlp run before this one    \
+                        starts.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-sb",
+                        "--skip_beautify_xml",
+                        help="Skips the script creating beautiful xml files.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-ns",
+                        "--no_stop_words",
+                        help="If this is used the lemmatization or tokenization\
+                        of the input protocols will exculde stop words.",
+                        required=False,
+                        action="store_true")
+    # Only one of lemmatization, tokenization or n-gram calculation can be
+    # requested per call.
+    group = parser.add_mutually_exclusive_group(required=False)
+    group.add_argument("-lm",
+                       "--lemmatize",
+                       help="Lemmatizes the XML protocols in the input directory\
+                       and saves them into the output directory.",
+                       action="store_true",
+                       required=False)
+    group.add_argument("-tn",
+                       "--tokenize",
+                       help="Tokenizes the XML protocols in the input directory\
+                       and saves them into the output directory.",
+                       action="store_true",
+                       required=False)
+    group.add_argument("-cn",
+                       "--calculate_n_grams",
+                       nargs=2,
+                       help="Calculates n_grams for any tokenized or leammtized\
+                       XML protocol created by this script.                    \
+                       feature_to_group_n_grams_by can be set to the following:\
+                       'year','month_year', 'speaker' or 'speech'.",
+                       required=False,
+                       type=str,
+                       metavar=("feature_to_group_n_grams_by", "input_type_name"))
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    # logging and start time
+    logging.basicConfig(filename="logs/bundesdata_nlp.log", level=logging.INFO,
+                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
+                        datefmt='%Y/%m/%d %H:%M:%S',
+                        filemode="w")
+    logger = logging.getLogger(__name__)
+    start_time = datetime.now()
+    print("Start time of script is:", start_time)
+    print("Info and status about the nlp process can be found in:",
+          "logs/bundesdata_nlp.log")
+    logger.info("Start time of script is: " + str(start_time))
+    # get arguments
+    args = parse_arguments()
+    # reads config
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    # if fresh_run is true directory nlp_output will be deleted
+    if(args.fresh_run is True):
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+        options = config.items("File paths")
+        for option in options:
+            if(option[0] == "nlp_output"):
+                try:
+                    delete_folder.delete_folder(option[1])
+                except FileNotFoundError:
+                    pass
+            else:
+                config.remove_option("File paths", option[0])
+        with open("config.ini", 'w') as out:
+            config.write(out)
+
+    # create outputfolder if it does not exists and wirtes path to config
+    if(args.set_paths):
+        output_path = os.path.join(args.set_paths[1], "nlp_output")
+        if not os.path.exists(output_path):
+            os.mkdir(output_path)
+        update_config.update_config("config.ini", "File paths",
+                                    "nlp_output", output_path)
+    else:
+        output_path = config["File paths"]["nlp_output"]
+        if not os.path.exists(output_path):
+            os.mkdir(output_path)
+            update_config.update_config("config.ini", "File paths",
+                                        "nlp_output", output_path)
+    # gets file_path list of input files and wirtes inputfolder path to config
+    if(args.set_paths):
+        input_path = args.set_paths[0]
+        update_config.update_config("config.ini", "File paths",
+                                    "nlp_input", input_path)
+    elif(args.calculate_n_grams):
+        input_path = config["File paths"]["nlp_beuatiful_xml"]
+    else:
+        input_path = config["File paths"]["nlp_input"]
+    files = FileGetter(input_path, "*.xml")
+    files = files.get_files()
+    # if statements deciding which script will be executed
+    if(args.lemmatize is True and args.no_stop_words is True):
+        print("Starting lemmatization excluding stop words.")
+        lemmatization.lemmatization(files, True)
+        print("Finished lemmatization excluding stop words.")
+    elif(args.lemmatize is True and args.no_stop_words is False):
+        print("Starting lemmatization including stop words.")
+        lemmatization.lemmatization(files)
+        print("Finished lemmatization including stop words.")
+
+    if(args.tokenize is True and args.no_stop_words is True):
+        print("Starting tokenization excluding stop words.")
+        tokenize.tokenize(files, True)
+        print("Finished tokenization excluding stop words.")
+    elif(args.tokenize is True and args.no_stop_words is False):
+        print("Starting tokenization including stop words.")
+        tokenize.tokenize(files)
+        print("Finished tokenization including stop words.")
+
+    if(args.calculate_n_grams):
+        print("Starting calculation of n-grams for input files.")
+        n_grams.n_grams(files, args.calculate_n_grams[0], args.calculate_n_grams[1])
+        print("Finished calculation of n-grams for input files.")
+
+    if(args.skip_beautify_xml is not True and args.lemmatize is True
+       or args.tokenize is True):
+        print("Starting to prettyfy the xmls.")
+        beautify_markup.beautify_xml("nlp", True, 80)
+        print("Prettyfied the xmls.")
+    elif(args.skip_beautify_xml is True):
+        print("Skipping script beautify_markup.py.")
+
+    end_time = datetime.now()
+    print("End time of script is:", str(end_time))
+    logger.info("End time of script is: " + str(end_time))
+    duration = end_time - start_time
+    print("Duration of script is:", duration)
+    logger.info("Script duration is: " + str(duration))
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,47 @@
+[Regular expressions time extraction]
+session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)
+session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))
+
+[Regular expressions splits]
+session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)
+attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))
+
+[Regular expressions speakers]
+speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
+speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
+speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
+speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
+speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
+speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
+speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
+speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
+speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
+speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
+end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt
+
+[Additional name features]
+academic_titles = Dr. Dr. h. c. ; Dr. h. c.
+parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.
+
+[Regular expressions speeches]
+comments = \B\([^\(\)]*\)\B ; kommentar
+date_string = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember|Dezmber) \d{4}[\d\t ]*|[\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember|Dezmber) \d{4}[\d\t ]* ; metadata
+
+[Multiline entities]
+multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar
+
+[File paths]
+nlp_output = /home/stephan/Desktop/tmp_test/nlp_output
+nlp_input = /home/stephan/Desktop/tmp_test/nlp_output/nlp_beuatiful_xml/
+nlp_lemmatized_tokenized = /home/stephan/Desktop/tmp_test/nlp_output/lemmatized
+tmp_path = /home/stephan/Desktop/tmp_test/nlp_output/lemmatized/tmp
+nlp_beuatiful_xml = /home/stephan/Desktop/tmp_test/nlp_output/nlp_beuatiful_xml
+input_folder_xmls = /home/stephan/Desktop/tmp_test/protocols/
+output_folder = /home/stephan/Desktop/tmp_test/output
+new_metadata = /home/stephan/Desktop/tmp_test/output/new_metadata
+new_simple_markup = /home/stephan/Desktop/tmp_test/output/simple_xml
+complex_markup = /home/stephan/Desktop/tmp_test/output/complex_markup
+clear_speech_markup = /home/stephan/Desktop/tmp_test/output/clear_speech_markup
+beautiful_xml = /home/stephan/Desktop/tmp_test/output/beautiful_xml
+fixed_markup = /home/stephan/Repos/master_thesis/data/working_data/id_fixed/fixed_markup
+
@@ -0,0 +1,46 @@
+[Regular expressions time extraction]
+session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)
+session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))
+
+[Regular expressions splits]
+session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)
+attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))
+
+[Regular expressions speakers]
+speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
+speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
+speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
+speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
+speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
+speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
+speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
+speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
+speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
+speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
+end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt
+
+[Additional name features]
+academic_titles = Dr. Dr. h. c. ; Dr. h. c.
+parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.
+
+[Regular expressions speeches]
+comments = \B\([^\(\)]*\)\B ; kommentar
+date_string = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember|Dezmber) \d{4}[\d\t ]*|[\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember|Dezmber) \d{4}[\d\t ]* ; metadata
+
+[Multiline entities]
+multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar
+
+[File paths]
+nlp_output = /home/stephan/Desktop/nlp_output
+nlp_input = /home/stephan/Repos/master_thesis_data/data/outputs/outputs_markup/development_data/beautiful_xml/
+nlp_lemmatized_tokenized = /home/stephan/Desktop/nlp_output/lemmatized
+tmp_path = /home/stephan/Repos/master_thesis/data/working_data/output/clear_speech_markup/tmp
+nlp_beuatiful_xml = /home/stephan/Desktop/nlp_output/nlp_beuatiful_xml
+input_folder_xmls = /home/stephan/Repos/master_thesis/data/working_data/sub_set/
+output_folder = /home/stephan/Repos/master_thesis/data/working_data/output
+new_metadata = /home/stephan/Repos/master_thesis/data/working_data/output/new_metadata
+new_simple_markup = /home/stephan/Repos/master_thesis/data/working_data/output/simple_xml
+complex_markup = /home/stephan/Repos/master_thesis/data/working_data/output/complex_markup
+clear_speech_markup = /home/stephan/Repos/master_thesis/data/working_data/output/clear_speech_markup
+beautiful_xml = /home/stephan/Repos/master_thesis/data/working_data/output/beautiful_xml
+
@@ -0,0 +1,105 @@
+[Regular expressions time extraction]
+# These regular expressions are used to extract the start and ending time of one
+# session. The regular expressions are kind of complex because they have to catch
+# a lot of human errors. To catch those errors the expression is repeatedly
+# "chained" by using the or statement with only minor differences between each
+# expression. This is the easiest way though to catch as many times as possible.
+# The expressions match the partial strings where the start or end time is mentioned.
+# With different match groups the hours and minutes will then be extracted.
+
+# START TIME: Matches the start time. Three alternatives are chained with "|":
+# "Die Sitzung wird um ... eröffnet/eingeleitet", "Beginn: ... Uhr" and the
+# short form "Die Sitzung wird um ... Uhr eröffnet.". The capture groups hold
+# the hour and (optionally) the minutes.
+session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)
+
+# END TIME: Matches the end time.
+session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))
+
+
+[Regular expressions splits]
+# These expressions are used for splitting the protocols at the location where
+# they match.
+# All match groups are non-capturing except the group capturing the entire regex,
+# so that the matched text can be inserted again later on. This is the main
+# difference to the time extractions.
+# These splits are needed to automatically separate the actual session content
+# from the table of contents and the attachments.
+
+# Split at first president occurrence.
+session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)
+
+# Split at the end time of session.
+attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))
+
+
+[Regular expressions speakers]
+# These are the regular expressions for matching the speakers in the protocols.
+# They consist of tuples with three values.
+# First element of the tuple is the regex.
+# Second element is a case that tells if this regex should be used as a
+# First, middle, or last element/match during the markup process.
+# Third element describes the type of speech the speaker is holding in German, to use it as an attribute later on.
+# The value tuple is divided with " ; " to convert it into a list later on.
+# It is similar to csv syntax. If needed the user can add more key, value pairs following the same
+# pattern to automatically identify even more speaker roles.
+
+speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
+speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
+speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
+speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
+speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
+speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
+speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
+speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
+speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
+speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
+end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt
+
+[Additional name features]
+# In this section the user can add additional strings which are not part of the
+# Stammdatenbank but are used inside the protocols.
+academic_titles = Dr. Dr. h. c. ; Dr. h. c.
+parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.
+
+[Regular expressions speeches]
+# These regular expressions are used to mark up some entities inside of the actual speeches.
+# The value of any given key is a tuple with two values separated by " ; " like in the section
+# [Regular expressions speakers]. The first value is the regex and the second value is the tag name
+# written as a string. This list of key, value pairs can also be extended by the user to identify
+# even more entities inside of the speeches. Just add key, value pairs following the same pattern.
+# These expressions are only used to identify entities which are present in one <p> without
+# line breaks.
+
+comments = \B\([^\(\)]*\)\B ; kommentar
+# The month alternation must list all twelve German month names; "August" was
+# missing and "Dezember" was misspelled, so page headers from those months were
+# never recognised as metadata.
+date_string_with_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) \d{4}[\d\t ]* ; metadata
+date_string_without_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) \d{4}[\d\t ]* ; metadata
+
+[Multiline entities]
+# These regular expressions are used to identify entities in speeches which span multiple <p>
+# elements. The value of any given key is a tuple with three values separated by " ; " like in the
+# section [Regular expressions speakers]. The first value is a regex describing what the start of the
+# entity string looks like. The second value is a regex describing what the end of the entity string
+# looks like. The third value is the tag name written as a normal string.
+multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar
+
+[File paths]
+# This is where the paths for input and output folders are set. The input folder
+# path should contain the XML-protocols that will be processed.
+# The output folder path specifies the place where all the intermediate files
+# and the final new XML protocols with the newly created automatic markup will be
+# saved.
+# NOTE: every key may appear only once in this section. configparser's default
+# strict mode raises DuplicateOptionError on a repeated key, so
+# input_folder_xmls and output_folder are defined only here and not again in
+# the generated list below.
+
+input_folder_xmls = /home/stephan/Repos/master_thesis/data/working_data/development_data_xml
+output_folder = /home/stephan/Repos/master_thesis/data/working_data/
+
+# These paths will be set while running the program.
+nlp_output = /home/stephan/Desktop/nlp_output
+nlp_input = /home/stephan/Desktop/protocols/
+nlp_lemmatized_tokenized = /home/stephan/Desktop/nlp_output/lemmatized
+tmp_path = /home/stephan/Desktop/nlp_output/lemmatized/tmp
+nlp_beuatiful_xml = /home/stephan/Desktop/nlp_output/nlp_beuatiful_xml
+new_metadata = /home/stephan/Desktop/output/new_metadata
+new_simple_markup = /home/stephan/Desktop/output/simple_xml
+complex_markup = /home/stephan/Desktop/output/complex_markup
+clear_speech_markup = /home/stephan/Desktop/output/clear_speech_markup
+beautiful_xml = /home/stephan/Desktop/output/beautiful_xml
+fixed_markup = /home/stephan/Repos/master_thesis/data/working_data/id_fixed/fixed_markup
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from markup.MetadataMarkup import MetadataMarkup
+from lxml import etree
+from xml.etree import ElementTree
+from xml.sax.saxutils import escape
+import logging
+import os
+import re
+
+
+class EntityMarkup(MetadataMarkup):
+    """Class for getting an XML node in which entities will be marked.
+    In practice this class and its methods can be used to get the text of a
+    given node and to mark every speaker in this text string.
+    Also passes methods and fields to the more specific
+    SimpleSpeakersMarkup."""
+
+    def __init__(self, file_path, element_name=".//sitzungsverlauf"):
+        super().__init__()
+        self.file_path = file_path
+        self.element_name = element_name
+        self.xml_tree = None
+        self.current_string = str()
+        self.filename = os.path.basename(file_path)
+        self.logger = logging.getLogger(__name__)
+
+    def get_element_text(self):
+        """
+        Gets the strings of all elements matched by an element x-path. Element
+        name will be passed when the class is istanced. Distunguishes between
+        one string or several strings.
+        """
+        self.all_elements = self.xml_tree.iterfind(self.element_name)
+        len_all_elements = len(list(self.all_elements))
+        self.current_strings = []
+        if(len_all_elements == 1):
+            self.all_elements = self.xml_tree.iterfind(self.element_name)
+            self.current_string = escape(list(self.all_elements)[0].text)
+            self.current_strings.append(self.current_string)
+        elif(len_all_elements > 1):
+            self.current_strings = []
+            self.all_elements = self.xml_tree.iterfind(self.element_name)
+            for element in self.all_elements:
+                string = escape(element.text)
+                self.current_strings.append(string)
+        self.all_elements = self.xml_tree.iterfind(self.element_name)
+
+    def replace_string(self, replacement_string, element_name):
+        """
+        This function takes the newly manipulated xml string and overwrites
+        the old string with it.
+        """
+        replacement_string = (
+                              "<" + element_name + ">"
+                              + replacement_string
+                              + "</" + element_name + ">"
+                              )
+        for element in self.xml_tree.xpath("//%s" % element_name):
+            element.getparent().remove(element)
+        replacement_element = etree.fromstring(replacement_string)
+        self.xml_tree.insert(1, replacement_element)
+
+    def simple_check_xml(self, xml_string, file_name, save_valid, node=True):
+        """
+        Checks if a given xml element is well-formed xml. If it is checking a
+        partial string it adds a root element. If node is False it is checking a
+        document as a string.
+        """
+        try:
+            if(node is True):
+                folder_path = "logs/well-formed_strings/"
+                file_path = os.path.join(folder_path, os.path.basename(file_name))
+                xml_string = "<root>" + xml_string + "</root>"
+                tree = etree.fromstring(xml_string)
+                self.logger.info(("The node string is well-formed. Simple markup is"
+                                  " correct. Node string can be found in "
+                                  + folder_path))
+                self.logger.info(tree)
+                if(save_valid is True):
+                    self.logger.info("Node string can be found in" + folder_path)
+                    if not os.path.exists(folder_path):
+                        os.mkdir(folder_path)
+                    with open(file_path, "w") as text_file:
+                        text_file.write(xml_string)
+            else:
+                folder_path = "logs/well-formed_files/"
+                file_path = os.path.join(folder_path, os.path.basename(file_name))
+                xml_string = xml_string
+                tree = etree.fromstring(xml_string)
+                self.logger.info("The XML file is well-formed.")
+                self.logger.info(tree)
+                if(save_valid is True):
+                    self.logger.info("File can be found in" + folder_path)
+                    if not os.path.exists(folder_path):
+                        os.mkdir(folder_path)
+                    with open(file_path, "w") as text_file:
+                        text_file.write(xml_string.decode("utf-8"))
+        except Exception as e:
+            if(node is True):
+                folder_path = "logs/not_well-formed_strings/"
+                file_path = os.path.join(folder_path, os.path.basename(file_name))
+                if not os.path.exists(folder_path):
+                    os.mkdir(folder_path)
+                with open(file_path, "w") as text_file:
+                    text_file.write(xml_string)
+                self.logger.error(("XML node string is not well-formed. XML can be"
+                                   " found in " + folder_path))
+                self.logger.error(e)
+            else:
+                folder_path = "logs/not_well-formed_files/"
+                file_path = os.path.join(folder_path, os.path.basename(file_name))
+                if not os.path.exists(folder_path):
+                    os.mkdir(folder_path)
+                with open(file_path, "w") as text_file:
+                    text_file.write(xml_string.decode("utf-8"))
+                self.logger.error(("XML file is not well-formed. XML can be"
+                                   " found in " + folder_path))
+                self.logger.error(e)
+                return False
+
+    def inject_element(self, current_element, regex, tagname,
+                       strip_newlines=False):
+        """
+        Injects new xml elements into the selected element text. The new element
+        will be created by using a regular expression which matches a partial
+        string in the current_element text string. The match will be the
+        new_element text string. The tagname sets the tagname of the
+        new_element. Optionally Attributes can be set aswell.
+        """
+        element_string = ElementTree.tostring(current_element, encoding="unicode", method="xml")
+        match = re.search(regex, element_string)
+        if(match):
+            index_shift = 0
+            if(strip_newlines is True):
+                counter = match.group().count("\n")
+                match_str = re.sub(r"\n", "", match.group())
+            else:
+                counter = 0
+                match_str = match.group()
+            index_start = match.start() + index_shift - counter
+            index_end = match.end() + index_shift - counter
+            new_element = etree.Element(tagname)
+            new_element.text = match_str
+            new_element_str = ElementTree.tostring(new_element, encoding="unicode", method="xml")
+            element_string = (element_string[:index_start]
+                              + new_element_str
+                              + element_string[index_end:])
+            index_shift += len(new_element_str) - len(match_str)
+            replacement_element = etree.fromstring(element_string.encode("utf8"))
+            current_element.getparent().replace(current_element, replacement_element)
+
+    def markup_speech_lines(self, current_element):
+        """
+        Inserts markup in every speech that marks every line <p> with
+        attribute klasse="J". J is set for every line even if it is O. In the
+        early protocols (period 1. to 10.) One line is most of the time a
+        sentence. In the later periods one line is capped at around 80
+        characters.
+        """
+        lines = current_element.xpath("text()")
+        if(len(lines) > 0):
+            lines = lines[0].splitlines()
+        current_element.xpath(".//redner")[0].tail = ""
+        for line in lines:
+            part_element = etree.Element("p")
+            part_element.set("klasse", "J")
+            part_element.text = line
+            current_element.append(part_element)
+
+    def get_multiline_entities(self, elements, start_of_str, end_of_str,
+                               tagname):
+        """
+        This function identifies multiline entities (i.e. Kommentare/Comments)
+        wich are split over multiple elements which have been marked with the
+        markup_speech_lines() function.
+        Gets the text of those and joins them together into one
+        string. The first elements text will be set to the newly created string
+        surrounded by new xml tags with tagname set to input tagname.
+        All other elements with the rest of the string will be deleted.
+        start_of_str should be a regex that describes the pattern how the start
+        of the supposed multiline entity looks like. end_of_str describes the
+        pattern how the end of the supposed multiline entity looks like.
+        """
+        self.multiline_text = []
+        self.multiline_elements = []
+        start_found = False
+        end_found = False
+        for element in elements:
+            if(start_found is False and end_found is False
+               and element.text is not None):
+                start_match = re.search(start_of_str, element.text)
+                if(start_match is not None):
+                    self.multiline_text.append(start_match.group())
+                    self.multiline_elements.append(element)
+                    start_found = True
+                    continue
+            elif(start_found is True and end_found is False
+                 and element.text is not None):
+                end_match = re.search(end_of_str, element.text)
+                if(end_match):
+                    self.multiline_text.append(end_match.group())
+                    self.multiline_elements.append(element)
+                    end_found = True
+                    continue
+                else:
+                    self.multiline_text.append(element.text)
+                    self.multiline_elements.append(element)
+                    continue
+            elif(start_found is True and end_found is True):
+                new_element_text = re.sub(r"- ", "", " ".join(self.multiline_text)) # joins the sting parts and also removes hyphenation
+                part_element = etree.Element("p")
+                part_element.set("klasse", "J")
+                comment_element = etree.Element(tagname)
+                comment_element.text = new_element_text
+                part_element.append(comment_element)
+                self.multiline_elements[0].getparent().replace(self.multiline_elements[0], part_element)
+                for element in self.multiline_elements[1:]:
+                    element.getparent().remove(element)
+                start_found = False
+                end_found = False
+                self.multiline_text = []
+                self.multiline_elements = []
+                continue
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from utility.XMLProtocol import XMLProtocol
+import logging
+
+
+class MdBData(XMLProtocol):
+    """Class to handel operations on the Stammdatenbank."""
+
+    def __init__(self):
+        super(XMLProtocol, self).__init__()
+        self.logger = logging.getLogger(__name__)
+
+    def get_set(self, element_path, element_tree):
+        """
+        Creates Sets from input path on element_tree.
+        """
+        tmp_list = [element.text for element in
+                    element_tree.iterfind(element_path) if element is not None]
+        set_of_elements = set(tmp_list)
+        return set_of_elements
@@ -0,0 +1,267 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from utility.XMLProtocol import XMLProtocol
+from utility import update_config
+from lxml import etree
+from datetime import datetime
+from babel.dates import format_date
+import os
+import re
+import logging
+import configparser
+
+
+class MetadataMarkup(XMLProtocol):
+    """
+    This class is for opening one XML protocol, extracting the included
+    metadata and creating a new valid metadata head.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.plenarprotokoll_string = str()  # will be extracted with extract_metadata()
+        self.wahlperiode = int()  # will be extracted with extract_metadata()
+        self.sitzungsnr = int()  # will be extracted with extract_metadata()
+        self.herausgeber = "Deutscher Bundestag"  # Always the same in every protocoll
+        self.berichtart = "Steongrafischer Bericht"  # Always the same in every protocoll
+        self.sitzungstitel_string = ". Sitzung"  # Always the same in every protocoll
+        self.ort = "Berlin"  # Always the same in every protocoll
+        self.datum_ger_non_iso = str()  # will be extracted with extract_metadata()
+        self.datum_iso = str()  # ISO-date will be built from self.datum_ger_non_iso
+        self.datum_string = str()  # will be built from self.datum_iso
+        self.attachment = str()  # will be extracted from a split. Will not work
+        # all the time. But will not break the XML.
+        self.logger = logging.getLogger(__name__)
+
+    def extract_metadata(self, etree_element_object):
+        """
+        Extracts metadata from the given XML-tags and wirtes them into the
+        instance variables
+        """
+        root = etree_element_object
+        metadata_list = []
+        for element in root.iter():
+            if(element.tag != "TEXT"):
+                metadata_list.append(element.text)
+        metadata_list = metadata_list[1:]
+        self.wahlperiode = metadata_list[0]
+        self.plenarprotokoll_string = metadata_list[1].lower().title()
+        self.sitzungsnr = metadata_list[2].split("/")[1]
+        self.datum_ger_non_iso = metadata_list[3]
+        self.logger.info("Metadata successfully extracted.")
+        self.logger.info("Wahlperiode is:" + self.wahlperiode)
+        self.logger.info("Plenarprotokoll is:" + self.plenarprotokoll_string)
+        self.logger.info("Sitzungsnummer is:" + self.sitzungsnr)
+        self.logger.info("German non ISO date is:" + self.datum_ger_non_iso)
+
+    def built_iso_date(self, ger_date):
+        """
+        Gets the german date and converts it to an ISO standard date.
+        """
+        self.datum_iso = datetime.strptime(ger_date, "%d.%m.%Y").date()
+        self.logger.info("ISO date created:" + str(self.datum_iso))
+
+    def built_date_string(self, iso_date):
+        """
+        Gets the ISO date and creates from it an german full string date.
+        """
+        date_string = format_date(iso_date, format="full", locale="de_DE")
+        date_string = re.sub(r",", ", den", date_string)
+        self.datum_string = date_string
+        self.logger.info("Date string created:" + self.datum_string)
+
+    def delete_old_metadata(self, etree_element_object):
+        """
+        Deletes old metadata tags and text. Renames root tag.
+        """
+        for element in etree_element_object.iter():
+            if(element.tag != "TEXT" and element.tag != "DOKUMENT"):
+                element.getparent().remove(element)
+            elif(element.tag == "DOKUMENT"):
+                element.tag = "dbtplenarprotokoll"
+            elif(element.tag == "TEXT"):
+                self.full_content = element.text
+                element.getparent().remove(element)
+        self.logger.info("Old metadata deleted.")
+
+    def insert_new_metadata(self, etree_element_object):
+        """
+        Inserts the extracted metadata and the split content into newly
+        created xml tags according to the official schema.
+
+        Builds <vorspann> (with <kopfdaten> and <inhaltsverzeichnis>),
+        <sitzungsverlauf>, <anlagen> and <rednerliste> and inserts them into
+        etree_element_object, which is stored in self.xml_tree afterwards.
+        Reads self.toc, self.president, self.content and self.attachment,
+        which split_content() assigns.
+        """
+        vorspann_element = etree.Element("vorspann")
+        # The template is filled positionally; self.sitzungsnr is used twice.
+        xml_string = """
+    <kopfdaten>
+        <plenarprotokoll-nummer>{} <wahlperiode>{}</wahlperiode>/<sitzungsnr>{}</sitzungsnr>
+        (neu)</plenarprotokoll-nummer>
+        <herausgeber>{}</herausgeber>
+        <berichtart>{}</berichtart>
+        <sitzungstitel><sitzungsnr>{}</sitzungsnr>. Sitzung</sitzungstitel>
+        <veranstaltungsdaten><ort>{}</ort>, <datum date="{}">{}</datum></veranstaltungsdaten>
+    </kopfdaten>"""\
+            .format(self.plenarprotokoll_string, self.wahlperiode,
+                    self.sitzungsnr, self.herausgeber, self.berichtart,
+                    self.sitzungsnr, self.ort, self.datum_ger_non_iso,
+                    self.datum_string)
+        etree_from_str = etree.fromstring(xml_string)
+        etree_element_object.insert(0, vorspann_element)
+        vorspann_element.append(etree_from_str)
+        toc_element = etree.Element("inhaltsverzeichnis")
+        toc_element.text = self.toc
+        vorspann_element.append(toc_element)
+        content_element = etree.Element("sitzungsverlauf")
+        # The split removed the first president line from the content, so it
+        # is put in front again.
+        content_element.text = self.president + self.content
+        # NOTE(review): index 1 is never used; presumably the inserts simply
+        # append in this order - confirm.
+        etree_element_object.insert(2, content_element)
+        anlagen_element = etree.Element("anlagen")
+        anlagen_element. text = self.attachment
+        etree_element_object.insert(3, anlagen_element)
+        rednerliste_element = etree.Element("rednerliste",
+                                            sitzungsdatum=self.datum_ger_non_iso)
+        etree_element_object.insert(4, rednerliste_element)
+        self.xml_tree = etree_element_object
+        self.logger.info("New metadata XML-head inserted." + xml_string)
+
+    def split_content(self, etree_element_object):
+        """Splits the full content to: table of content, speeches and in some
+        cases attachments."""
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+
+        session_start_split = config["Regular expressions splits"]["session_start_president_split"]
+        regex_start = re.compile(session_start_split)
+        tmp_list = regex_start.split(self.full_content, maxsplit=1)
+        self.toc = tmp_list[0]
+        self.president = tmp_list[1]
+        self.content = tmp_list[2]
+
+        attachment_split = config["Regular expressions splits"]["attachment_split"]
+        regex_att = re.compile(attachment_split)
+        tmp_list = regex_att.split(self.content)
+        tmp_list = [element for element in tmp_list if element is not None]
+        if(tmp_list[-1] == ""):  # if the split does not match anything last item is empty string.
+            self.content = "".join(tmp_list[0:-1])
+            self.attachment = "Keine Anlage extrahiert."
+            self.logger.warning(("There is no attachment."))
+        else:
+            self.content = "".join(tmp_list[0:-1])
+            self.attachment = tmp_list[-1]
+            self.logger.info("Attachment found.")
+        self.logger.info("Contet splitted at:" + str(regex_start))
+        self.logger.info("Contet splitted at:" + str(regex_att))
+
+    def get_session_times(self):
+        """This function looks into the entire protocoll content to extract the
+        last closing time and the starting time. If only one of both or none are
+        found, the missing time will be set to xx:xx."""
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+        regex_conf_values = config.items("Regular expressions time extraction")
+        regex_conf_values = [regex[1] for regex in regex_conf_values]
+        tmp_list = []
+        identifier = 0
+        start_time_found = True
+        end_time_found = True
+
+        for regex in (regex_conf_values):
+            identifier += 1
+            regex = re.compile(regex)
+            if(identifier == 1):
+                # Always gets first start time.
+                matches = list(regex.finditer(self.full_content))
+                if(len(matches) > 1):
+                    match = matches[-1]
+                elif(len(matches) == 0):
+                    match = None
+                else:
+                    match = matches[0]
+            elif(identifier == 2):
+                # Always gets last closing time
+                matches = list(regex.finditer(self.full_content))
+                if(len(matches) > 1):
+                    match = matches[-1]
+                elif(len(matches) == 0):
+                    match = None
+                else:
+                    match = matches[0]
+
+            if(match is None and identifier == 1):
+                self.logger.warning("No start time found for " + str(regex))
+                start_time_found = False
+            elif(match is None and identifier == 2):
+                self.logger.warning("No end time found for " + str(regex))
+                end_time_found = False
+            elif(match):
+                session_time = [group for group in match.groups()
+                                if group is not None]
+                session_time = ["0" + group if len(group) == 1 else group for
+                                group in session_time]  # Adds a 0 in front if digit len is 1
+                if(len(session_time) == 2):
+                    tmp_list.append(":".join(session_time))
+                elif(len(session_time) == 1):
+                    tmp_list.append(session_time[0] + ":00")
+
+        if(len(tmp_list) == 2):
+            self.session_start_time = tmp_list[0]
+            self.session_end_time = tmp_list[1]
+            self.logger.info("Start time found: " + self.session_start_time)
+            self.logger.info("End time found: " + self.session_end_time)
+            self.logger.info("Successfully matched start and end times.")
+        elif(len(tmp_list) == 1 and start_time_found is True and end_time_found
+             is False):
+            self.session_start_time = tmp_list[0]
+            self.session_end_time = "xx:xx"
+            self.logger.warning("Only start time found: "
+                                + self.session_start_time)
+            self.logger.warning("End time set to: "
+                                + self.session_end_time)
+        elif(len(tmp_list) == 1 and start_time_found is False and end_time_found
+             is True):
+            self.session_end_time = tmp_list[0]
+            self.session_start_time = "xx:xx"
+            self.logger.warning("Only end time found: "
+                                + self.session_end_time)
+            self.logger.warning("Start time set to: "
+                                + self.session_start_time)
+
+    def write_to_attr(self, element, attr_key, attr_value):
+        """
+        Writes two strings as a an attribute key value pair to a given
+        element.
+        """
+        elements = self.xml_tree.findall(element)
+        if(elements == []):
+            element = self.tree.getroot()
+            elements.append(element)
+        for element in elements:
+            element.set(attr_key, attr_value)
+            self.xml_tree = self.xml_tree
+            self.logger.info("Wrote attribute "
+                             + attr_key
+                             + "="
+                             + "\""
+                             + attr_value
+                             + "\"")
+
+    def save_to_file(self, output_path, file_path, subfolder, config_section,
+                     config_key):
+        """
+        Writes the new markup to a new xml file. Takes the output path and
+        creates a new folder there. Also updates the config file with the new
+        path.
+        """
+        self.filename = os.path.basename(file_path)
+        save_path = os.path.join(output_path, subfolder)
+        if not os.path.exists(save_path):
+            os.mkdir(save_path)
+        tree = etree.ElementTree(self.xml_tree)
+        new_filename = self.filename
+        save_file_path = os.path.join(save_path, new_filename)
+        tree.write(save_file_path,
+                   pretty_print=True,
+                   xml_declaration=True,
+                   encoding="utf8",
+                   doctype="<!DOCTYPE dbtplenarprotokoll SYSTEM 'dbtplenarprotokoll_minimal.dtd\'>")
+        self.logger.info("New XML saved to:" + save_file_path)
+        update_config.update_config("config.ini", config_section, config_key,
+                                    save_path)
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from markup.EntityMarkup import EntityMarkup
+import re
+import logging
+
+
+class SpeakerMarkup(EntityMarkup):
+    """
+    Class for specific markup of different speakers identified by different
+    regular expressions included in the config file.
+    """
+
+    def __init__(self, string, regex):
+        super(SpeakerMarkup).__init__()
+        self.string_to_search = string
+        self.regex_string = regex
+        self.logger = logging.getLogger(__name__)
+
+    def identify_speaker(self):
+        """
+        Gets match objects from the speakers in the given text node. Also
+        calculates length of it and puts the matches in a list.
+        """
+        self.matches = re.finditer(self.regex_compiled, self.string_to_search)
+        tmp_list = []
+        for match in self.matches:
+            tmp_list.append(match)
+        self.matches_count = len(tmp_list)
+        self.matches = tmp_list
+
+    def markup_speaker(self, case="middle"):
+        """
+        This is where the first simple markup happens. It uses the matches
+        and replaces them with simple markup for further processing. The
+        'first' markup uses re.sub. The second and third one work on string
+        basis.
+        """
+
+        def markup_logging():
+            """Helper function for creating log file output."""
+            if(self.matches_count == 0):
+                self.logger.warning("0 matches for given expression:"
+                                    + self.regex_string)
+            elif(self.matches_count > 0):
+                self.logger.info(str(self.matches_count)
+                                 + " matches for given expression:"
+                                 + self.regex_string)
+            elif(self.matches_count == 1):
+                self.logger.info(str(self.matches_count)
+                                 + " match for given expression:"
+                                 + self.regex_string)
+
+        if(case == "first"):
+            # Uses re.sub because it is only for one match.
+            start_tags = "<rede><redner>"
+            end_tags = "</redner>"
+            self.matches_count = 1  # sets count to 1 because it only marks the first match
+            markup_logging()
+            first_match = self.matches[0]
+            start_xml = start_tags + first_match.group() + end_tags
+            if(len(first_match.group().split()) <= 10):
+                self.string_to_search = self.regex_compiled.sub(start_xml,
+                                                                self.string_to_search,
+                                                                count=1)
+            self.markuped_string = self.string_to_search
+
+        elif(case == "middle"):
+            """
+            Does not use re.sub because it is faster to work on the string.
+            Also it avoids looping two times to get the specific match.group()
+            which caused some errors.
+            """
+            index_shift = 0
+            start_tags = "\n</rede><rede><redner>"
+            end_tags = "</redner>"
+            markup_logging()
+            for match in self.matches:
+                index_start = match.start() + index_shift
+                index_end = match.end() + index_shift
+                whole_match_len = len(match.group())
+                # Handels cases where lots of text before the actual speaker is # matched
+                linebrks_in_match = len(match.group().split("\n"))
+                if(linebrks_in_match >= 2):
+                    last_part_match = "".join(match.group().split("\n")[1:])
+                    first_line_of_match = match.group().split("\n")[0]
+                    if(len(first_line_of_match.split()) <= 10):
+                        match = first_line_of_match + last_part_match
+                    else:
+                        match = last_part_match
+
+                    delta_start_index = whole_match_len - len(match)
+                    index_start = index_start + delta_start_index
+
+                    self.string_to_search = (self.string_to_search[:index_start]
+                                             + start_tags
+                                             + match
+                                             + end_tags
+                                             + self.string_to_search[index_end:]
+                                             )
+                    index_shift += len(start_tags) + len(end_tags)
+
+                else:
+                    self.string_to_search = (self.string_to_search[:index_start]
+                                             + start_tags
+                                             + match.group()
+                                             + end_tags
+                                             + self.string_to_search[index_end:]
+                                             )
+                    index_shift += len(start_tags) + len(end_tags)
+
+            self.markuped_string = self.string_to_search
+
+        elif(case == "last"):
+            index_shift = 0
+            """
+            Matches the end of the session to add the last closing <rede> tag
+            to the last speech for well-formed xml. Uses re.sub because it is
+            only one operation.
+            """
+            end_tag = "</rede>"
+            session_close_time_tag = ('<sitzungsende/>')
+            # Created end tags will be inserted into the protocol
+            if(len(self.matches) == 1):
+                self.logger.info("Last speech successfully tagged.")
+                markup_logging()
+                for match in self.matches:
+                    end_xml = end_tag + match.group() + session_close_time_tag
+                    if(len(match.group().split()) <= 15):
+                        self.string_to_search = self.regex_compiled.sub(end_xml,
+                                                                        self.string_to_search,
+                                                                        count=1)
+                self.markuped_string = self.string_to_search
+
+            elif(len(self.matches) == 0):
+                self.logger.warning(("No end of session found! Last tag " + end_tag
+                                     + " will be added to the end of the protocol."
+                                     " This might add some unrelated text to the "
+                                     "last speech."))
+                markup_logging()
+                self.markuped_string = self.string_to_search + end_tag
+
+            else:
+                markup_logging()
+                self.logger.warning(("There are " + str(len(self.matches))
+                                     + " session endings. Ignoring the endings"
+                                     + " before the last final ending of the "
+                                     + " session."))
+                match = self.matches[-1]
+                end_xml = end_tag + match.group() + session_close_time_tag
+                whole_match_len = len(match.group())
+                index_start = match.start() + index_shift
+                index_end = match.end() + index_shift
+                last_line = match.group().split("\n")[-1]  # Always takes the last line of a match avoiding lots of text before the actual speaker.
+                delta_start_index = whole_match_len - len(last_line)
+                index_start = index_start + delta_start_index
+                self.string_to_search = (self.string_to_search[:index_start]
+                                         + end_xml
+                                         + self.string_to_search[index_end:])
+                index_shift += len(end_tag)
+                self.markuped_string = self.string_to_search
@@ -0,0 +1,554 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from markup.SpeakerMarkup import SpeakerMarkup
+from xml.etree import ElementTree
+from lxml import etree
+from tqdm import tqdm
+from itertools import combinations
+import copy
+import logging
+import re
+import os
+
+
+class SpeakerNameMarkup(SpeakerMarkup):
+    """
+    This class is for the complex markup of the speakers in one given protocol.
+    Creates the name tag with all needed information from the Stammdatenbank.
+    Has to cross reference the speaker with said Stammdatenbank.
+    """
+    known_redner_dicts = dict()
+    last_wahlperiode = int()
+
+    def __init__(self, file_path, element_name=".//redner"):
+        super(SpeakerNameMarkup).__init__()
+        self.file_path = file_path
+        self.filename = os.path.basename(self.file_path)[:-4]
+        self.element_name = element_name
+        self.redner_dict = dict()
+        self.all_speakers = []
+        self.logger = logging.getLogger(__name__)
+
+    def cross_reference_markup(self, strings, feature_set_dict,
+                               MdB_etree):
+        """
+        Checks if features like name, surename academic title and city are
+        present in the input string. Consists of main function and helper
+        functions. First the string will be split in tokens. Every token will
+        be checked a gainst sets of valid names, surnames, academic titles and
+        fractions. If there is a match a dictionary entriy will be set
+        accordingly.
+        Also uses the add_missing_MdB_feature helper function in a second step
+        to add features which are not present in the string or have been
+        identified wrongly.
+        The function crates a dictionary containing all features of one speaker
+        to crate a valid XML element from it later on.
+        """
+
+        def initiate_dict(keys, extra_keys):
+            """
+            Creates a dictionarie with a set of keys and sets them to None.
+            Some specific key values will be set to specific values.
+            """
+            for key in keys:
+                redner_dict[key] = None
+            for key in extra_keys:
+                redner_dict[key] = None
+            redner_dict["feature_complete"] = False
+            redner_dict["original_string"] = string
+            redner_dict["identified"] = False
+            redner_dict["damalige_fraktion"] = None
+
+        def get_names(keys, dict, token):
+            """
+            Checks if token is in set vorname or nachname. If it is dictionary
+            values will be set accordingly. Avoids that surname will be
+            overwirtten by a name wich is also a valid surname.
+            """
+            for key in keys[0:2]:  # Only for vorname, nachname in written order
+                if(token in feature_set_dict[key][0] and redner_dict[key]
+                   is None):
+                    redner_dict[key] = token
+                elif(token in feature_set_dict["nachname"][0]
+                     and redner_dict["nachname"] is not None):
+                    redner_dict["nachname"] = token
+                else:
+                    continue
+
+        def get_feature(key, string, set):
+            """
+            Checks if a token is a valid feature (like name affix or academic
+            title, ortszusatz or namenszusatz) and adds it to the dictionary.
+            Does not check for names.
+            """
+            for feature in set:
+                if(key == "titel"):
+                    regex = r"(\b{}\B)".format(re.escape(feature))  # could be Dr. and . is not a word boundary.
+                elif(key is "namenszusatz"):
+                    regex = r"\b({})\b".format(re.escape(feature))  # No . in word so word boundary at start and end of regex.
+                elif(key is "fraktion"):
+                        regex = r"\B(\({}\))\B".format(re.escape(feature))  # always surrounded by parentheses, but also has to match them to avoid matching i. e. "CDU" in "CDU/CSU"
+                elif(key is "ortszusatz"):
+                    regex = r"\B{}\B".format(re.escape(feature))  # always surrounded by parentheses
+                else:
+                    regex = r"(\b{}\b)".format(re.escape(feature))
+                match = re.search(regex, string)
+                if(match):
+                    if(key == "fraktion"):
+                        redner_dict[key] = match.group()[1:-1]  # removes ()
+                        break
+                    else:
+                        redner_dict[key] = match.group()
+                        break
+                else:
+                    redner_dict[key] = None
+
+        def get_role(string):
+            """Checks redner string for role. Identifies 'Bundesministerin für
+            Familie, Senioren, Frauen und Jugend' etc."""
+            if("Staatssekretär" in string or "Staatssekretärin" in string):
+                regex = r"(Staatssekretär(in)?)"
+                splits = re.split(regex, string, maxsplit=1)
+                role_long = splits[1] + splits[-1]
+                redner_dict["rolle_lang"] = role_long
+                role_short = [word[0] for word in role_long.split()
+                              if word[0].isupper()]
+                role_short = splits[1] + " " + "".join(role_short)
+                redner_dict["rolle_kurz"] = role_short
+            elif("Bundesminister" in string or "Bundesministerin" in string):
+                regex = r"(Bundesminister(in)?)"
+                splits = re.split(regex, string, maxsplit=1)
+                role_long = splits[1] + splits[-1]
+                redner_dict["rolle_lang"] = role_long
+                role_short = [word[0] for word in role_long.split()
+                              if word[0].isupper()]
+                role_short = splits[1] + " " + "".join(role_short)
+                redner_dict["rolle_kurz"] = role_short
+
+        def check_name(redner_dict):
+            """
+            Checks if vorname and nachname are the same. Sets vorname to None if
+            True. Vorname will be set later on with add_missing_MdB_feature.
+            """
+            if(redner_dict["nachname"] == redner_dict["vorname"]):
+                redner_dict["vorname"] = None
+
+        def get_party(redner_dict):
+            """
+            Creates a party key in the dictionary containing the party of the
+            speaker. Party is not the same as fraction. This is mainly done
+            because CDU/CSU is the fraction in the bundestag but speakers can
+            belong to either the CDU or CSU. If the fraction is not CDU/CSU
+            party will be set to fraction. Also handels problems with GRÜNE.
+            """
+            if(redner_dict["fraktion"] != "CDU/CSU"
+               and redner_dict["fraktion"] != "CDU"
+               and redner_dict["fraktion"] != "CSU"):
+                redner_dict["partei"] = redner_dict["fraktion"]
+            elif(redner_dict["fraktion"] == "CDU"
+                 or redner_dict["fraktion"] == "CSU"):
+                redner_dict["partei"] = redner_dict["fraktion"]
+                redner_dict["fraktion"] = "CDU/CSU"
+            if(redner_dict["fraktion"] == "GRÜNE"):
+                redner_dict["fraktion"] = "BÜNDNIS 90/DIE GRÜNEN"
+
+        def check_party_and_fraction():
+            """
+            Checks if party and fraction have been set correctly. Will be used
+            after add_missing_MdB_feature. To correct some errors with CDU/CSU.
+            """
+            if(redner_dict["fraktion"] is not None
+               and redner_dict["partei"] == "CDU"
+               or redner_dict["partei"] == "CSU"):
+                redner_dict["fraktion"] = "CDU/CSU"
+
+            if(redner_dict["partei"] is None
+               and redner_dict["fraktion"] is not None
+               and redner_dict["fraktion"] != "CDU"
+               and redner_dict["fraktion"] != "CSU"):
+                redner_dict["partei"] = redner_dict["fraktion"]
+
+        def get_match_in_str(key, string, regex):
+            """
+            Matches a regex in the current string and adds it as a value to the
+            given key into the dictionary.
+            """
+            match = re.search(regex, string)
+            if(match):
+                redner_dict[key] = match.group()
+            else:
+                redner_dict[key] = None
+
+        def add_missing_MdB_feature(string, redner_dict, feature_set_dict,
+                                    MdB_etree, conditions_key_list,
+                                    feature_lookup, feature_to_add,
+                                    logging_state=False, multi_ids=False):
+            """
+            This function trys to get missing features for on speaker. Input is
+            a list of features(conditions_key_list) which are used as parameters
+            in an xpath expression. The Xpath is built dynamically from the
+            list.
+            If the Xpath matches one unique entry the feature(feature_to_add)
+            will be set to the match of feature_lookup in the matched element.
+            """
+            ###
+            # Xpath creation from conditions_key_list
+            ###
+            xpath_parts = []
+            conds = conditions_key_list
+            len_conds = len(conds)
+            if(len_conds == 1):
+                for condition in conds:
+                    xpath_part = ".//MDB[.//{}/text()='{}']"                   \
+                                  .format(feature_set_dict[condition][1],
+                                          redner_dict[condition])
+                    xpath_parts.append(xpath_part)
+                xpath = "".join(xpath_parts)
+                if("None" in xpath):
+                    xpath = None
+            elif(len_conds == 2):
+                xpath_first_part = ".//MDB[.//{}/text()='{}'"                  \
+                                    .format(feature_set_dict[conds[0]][1],
+                                            redner_dict[conds[0]])
+                xpath_parts.insert(0, xpath_first_part)
+                xpath_last_part = ".//{}/text()='{}']"                         \
+                                   .format(feature_set_dict[conds[-1]][1],
+                                           redner_dict[conds[-1]])
+                xpath_parts.append(xpath_last_part)
+                xpath = " and ".join(xpath_parts)
+                if("None" in xpath):
+                    xpath = None
+            elif(len_conds > 2):
+                xpath_first_part = ".//MDB[.//{}/text()='{}'"                  \
+                                    .format(feature_set_dict[conds[0]][1],
+                                            redner_dict[conds[0]])
+                xpath_parts.insert(0, xpath_first_part)
+                for condition in conds[1:-1]:
+                    xpath_inner_part = ".//{}/text()='{}'"                     \
+                                        .format(feature_set_dict[condition][1],
+                                                redner_dict[condition])
+                    xpath_parts.append(xpath_inner_part)
+                xpath_last_part = ".//{}/text()='{}']"                         \
+                                   .format(feature_set_dict[conds[-1]][1],
+                                           redner_dict[conds[-1]])
+                xpath_parts.append(xpath_last_part)
+                xpath = " and ".join(xpath_parts)
+                if("None" in xpath):  # sets xpaths to None if it uses a feature which is None
+                    xpath = None
+            xpath_parts = []  # empties xpath_parts list
+            try:  # tries every xpath
+                matches = MdB_etree.xpath(xpath)
+            except TypeError:  # handles xpaths that are None
+                matches = []
+            # If xpath has unique match new feature value will be set to given feature
+            if(len(matches) == 1):
+                matches = matches[0]
+                feature_lookup = ".//" + feature_lookup
+                new_feature = matches.xpath(feature_lookup)[0].text
+                self.logger.info((" There is one unique match "
+                                  + " for this speaker: "
+                                  + str(redner_dict)
+                                  + " Extracted feature "
+                                  + feature_lookup + ": "
+                                  + str(new_feature)
+                                  + " with: "
+                                  + str(conds)))
+                redner_dict[feature_to_add] = new_feature
+                self.logger.info(("New speaker features are: "
+                                  + str(redner_dict)))
+            # Handels mathches tha are not unique for logging and mutli id
+            elif(len(matches) > 1):
+                self.logger.warning((" There are "
+                                     + str(len(matches))
+                                     + " matches for this speaker: "
+                                     + str(redner_dict)
+                                     + " .Could not extract: "
+                                     + feature_lookup
+                                     + " Features used are: "
+                                     + str(conds)))
+            elif(len(matches) > 1 and multi_ids is True):
+                ids = matches
+                for id, i in ids, enumerate(ids):
+                    key = "id" + i
+                    redner_dict[key] = id
+                return matches
+
+        def get_periode(MdB_etree):
+            periode = self.xml_tree.xpath(".//wahlperiode")
+            if(periode):
+                redner_dict["wahlperiode"] = periode[0].text
+                return periode[0].text
+
+    ###
+    # Start of main function cross_reference_markup
+    ###
+
+        # Initiates empty dict and gets keys for it
+        redner_dict = dict()
+        features = list(feature_set_dict.keys())
+
+        # Counters to calculate how successful the identification of speakers is
+        identified_speakers = 0
+        unidentified_speakers = 0
+        multiple_identified_speakers = 0
+
+        # Cross references every <redner> string
+        for string in tqdm(strings, desc="Cross reference name markup for speakers in strings"):
+            self.logger.info("\nStarting name markup process for new speaker:")
+            # Sets values in redner_dict to None or specific value
+            initiate_dict(features, [feature for feature in features])
+            tokens = string.replace(":", "").replace(",", "").split()  # replaces ":" and "," with nothing because some names would be "name:" and some names would contain a ","
+            for token in tokens:
+                get_names(features, feature_set_dict, token)
+            self.logger.info("nachname is: " + str(redner_dict["nachname"]))
+            feature_keys = [key for key in features if key not in ["vorname",
+                                                                   "nachname"]]
+            for f_key in feature_keys:
+                get_feature(f_key, string, feature_set_dict[f_key][0])
+            get_party(redner_dict)
+            check_name(redner_dict)
+            regex_p = r"^\w*(?:P|p)räsident\w*"
+            get_match_in_str("präsident", string, regex_p)
+            get_role(string)
+
+        ###
+        # Checks if script is still running for the same current periode.
+        # If this is not the case the known_redner_dicts will be emptied.
+        ###
+            current_wahlperiode = get_periode(MdB_etree)
+            if(current_wahlperiode != SpeakerNameMarkup.last_wahlperiode):
+                SpeakerNameMarkup.known_redner_dicts = dict()
+            SpeakerNameMarkup.last_wahlperiode = current_wahlperiode
+
+        ###
+        # Creates possible combinations of features which will be used in
+        # add_missing_MdB_feature to identify missing features like vorname or
+        # nachname.
+        ###
+
+            combination_features = [feature for feature in features if feature
+                                    not in ["namenszusatz",
+                                            "feature_complete",
+                                            "id",
+                                            "titel",
+                                            "rolle_kurz",
+                                            "rolle_lang",
+                                            "original_string",
+                                            "identified",
+                                            "damalige_fraktion"]]
+            subsets = []
+            for length in range(0, 5):
+                for subset in combinations(combination_features, length):
+                    subsets.append(list(subset))
+            subsets = subsets[1:]
+            combination_features.remove("wahlperiode")
+            combination_features.remove("nachname")
+
+        ###
+        # First while loop trying to identify every feature for one speaker.
+        # Uses combinations from above. Before calling the function
+        # add_missing_MdB_feature there is a check if the speaker has alreeady
+        # been identified before. If this is the case features will be set to
+        # the already identfied features. This saves a lot of time.
+        ###
+
+            counter_feats = 0
+            while(redner_dict["feature_complete"] is False):
+                redner_dict["damalige_fraktion"] = redner_dict["fraktion"]
+                # print("Doing name markup for:", redner_dict)
+                # Checks if speaker has been already identified before.
+                if(string in SpeakerNameMarkup.known_redner_dicts):
+                    # print("Speaker has already been identified once.")
+                    redner_dict = SpeakerNameMarkup.known_redner_dicts[string].copy()
+                    # print("Speaker features are set to:",
+                    #       SpeakerNameMarkup.known_redner_dicts[string])
+                    redner_dict["identified"] = True
+                    self.logger.info(("Speaker has alreeady been identified "
+                                      + "once."))
+                    self.logger.info(("Speaker features are set to: "
+                                      + str(SpeakerNameMarkup.known_redner_dicts[string])))
+                    if(SpeakerNameMarkup.known_redner_dicts[string]["feature_complete"] is not False):
+                        identified_speakers += 1
+                    break
+                else:
+                    for feature in combination_features:
+                        for subset in subsets:
+                            add_missing_MdB_feature(string,
+                                                    redner_dict,
+                                                    feature_set_dict,
+                                                    MdB_etree,
+                                                    subset,
+                                                    feature_set_dict[feature][1],
+                                                    feature)
+                            check_party_and_fraction()
+                        if(redner_dict["vorname"] is not None
+                           and redner_dict["nachname"] is not None
+                           and redner_dict["fraktion"] is not None
+                           and redner_dict["partei"] is not None):
+                            redner_dict["feature_complete"] = True
+                    counter_feats += 1
+                    if(counter_feats == len(combination_features)):
+                        redner_dict["feature_complete"] = False
+                        break
+
+        ###
+        # Second while loop uses four features to identfie the unique ID for one
+        # speaker with add_missing_MdB_feature. Also tries to identfie speakers
+        # with lesser known features. In this case there can be multiple possile
+        # ids for one speaker these will be saved in a special dictionary entry.
+        # Rare case.
+        ###
+
+            counter_ids = 0
+            while(redner_dict["id"] is None):
+                if(redner_dict["feature_complete"] is True):
+                    add_missing_MdB_feature(string,
+                                            redner_dict,
+                                            feature_set_dict,
+                                            MdB_etree,
+                                            ["vorname", "nachname", "partei",
+                                             "wahlperiode"],
+                                            feature_set_dict["id"][1],
+                                            "id")
+                    key_original_string = redner_dict["original_string"]
+                    SpeakerNameMarkup.known_redner_dicts.update(
+                                      {key_original_string: redner_dict.copy()})
+                    redner_dict["identified"] = True
+                    if(counter_ids == 1):
+                        redner_dict["id"] = None
+                        redner_dict["feature_complete"] = False
+                        redner_dict["identified"] = False
+                        self.logger.warning(("Unique ID could not be assigned. "
+                                             + "Feature complete: True "
+                                             + "Features are: "
+                                             + str(redner_dict)))
+                        SpeakerNameMarkup.known_redner_dicts.update(
+                                          {key_original_string: redner_dict.copy()})
+                        unidentified_speakers += 1
+                        identified_speakers -= 1  # because identified_speakers was set before
+                        break
+                    identified_speakers += 1
+                elif(redner_dict["feature_complete"] is not True):
+                    redner_dict["id"] = None
+                    ids = add_missing_MdB_feature(string,
+                                                  redner_dict,
+                                                  feature_set_dict,
+                                                  MdB_etree,
+                                                  ["nachname", "partei",
+                                                   "wahlperiode"],
+                                                  feature_set_dict["id"][1],
+                                                  "id", False, True)
+                    if(ids is not None and len(ids) > 1):
+                        redner_dict["identified"] = "Multiple"
+                        multiple_identified_speakers += 1
+                        identified_speakers -= 1
+                        break
+                    elif(ids is None):
+                        self.logger.warning(("Unique ID could not be assigned. "
+                                             + "Feature complete: False "
+                                             + "Features are: "
+                                             + str(redner_dict)))
+                        redner_dict["identified"] = False
+                        unidentified_speakers += 1
+                        break
+                counter_ids += 1
+
+            self.logger.info(("Number of identified speakers with valid id and"
+                              + " name markup is: "
+                              + str(identified_speakers)))
+            self.logger.info(("Number of unidentified speakers without valid"
+                              + " id and name markup is: "
+                              + str(unidentified_speakers)))
+            self.logger.info(("Number of speakers with possible multiple ids: "
+                              + str(multiple_identified_speakers)))
+            self.logger.info(("Number of all speaker entitiys in current"
+                              + " protocoll is: "
+                              + str(len(strings))))
+            redner_dict_final = copy.deepcopy(redner_dict)
+            self.redner_dict = redner_dict_final
+            self.all_speakers.append(self.redner_dict)
+            for key in features:
+                redner_dict[key] = None
+
+            # print("Speaker features after whole cross reference markup:",
+            #       redner_dict_final)
+        self.logger.info(("Saved speakers (identfied and not identified): "
+                          + str(len(self.all_speakers))))
+
+    def create_speaker_elements(self):
+        """
+        Creates a valid redner XML element for one redner_dict entry from the
+        list self.all_speakers. Has to be done step by step becuase dictionary
+        is not sorted and name sub elements have to be in specific order.
+        """
+        self.all_speaker_elements = []
+        for redner_entry in tqdm(self.all_speakers, desc="Creating speaker element"):
+            redner_element = etree.Element("redner")
+            redner_element.set("id", str(redner_entry["id"]))
+            name_element = etree.Element("name")
+            titel_element = etree.Element("titel")
+            titel_element.text = redner_entry["titel"]
+            vorname_element = etree.Element("vorname")
+            vorname_element.text = redner_entry["vorname"]
+            namenszusatz_element = etree.Element("namenszusatz")
+            namenszusatz_element.text = redner_entry["namenszusatz"]
+            nachname_element = etree.Element("nachname")
+            nachname_element.text = redner_entry["nachname"]
+            damalige_fraktion_element = etree.Element("damalige_fraktion")
+            damalige_fraktion_element.text = redner_entry["damalige_fraktion"]
+            fraktion_element = etree.Element("fraktion")
+            fraktion_element.text = redner_entry["fraktion"]
+            partei_element = etree.Element("partei")
+            partei_element.text = redner_entry["partei"]
+            ortszusatz_element = etree.Element("ortszusatz")
+            ortszusatz_element.text = redner_entry["ortszusatz"]
+            rolle_lang_element = etree.Element("rolle_lang")
+            rolle_lang_element.text = redner_entry["rolle_lang"]
+            rolle_kurz_element = etree.Element("rolle_kurz")
+            rolle_kurz_element.text = redner_entry["rolle_kurz"]
+            original_string_element = etree.Element("original_string")
+            original_string_element.text = redner_entry["original_string"]
+
+            if(redner_entry["titel"] is not None):
+                name_element.append(titel_element)
+            name_element.append(vorname_element)
+            if(redner_entry["namenszusatz"] is not None):
+                name_element.append(namenszusatz_element)
+            name_element.append(nachname_element)
+            name_element.append(damalige_fraktion_element)
+            name_element.append(fraktion_element)
+            name_element.append(partei_element)
+            if(redner_entry["ortszusatz"] is not None):
+                name_element.append(ortszusatz_element)
+            if(redner_entry["rolle_lang"] is not None):
+                name_element.append(rolle_lang_element)
+                name_element.append(rolle_kurz_element)
+            name_element.append(original_string_element)
+            name_element.tail = original_string_element.text
+            redner_element.append(name_element)
+            self.all_speaker_elements.append(redner_element)
+            self.logger.info(("Speaker element is: "
+                              + ElementTree.tostring(redner_element).decode("utf-8")))
+
+    def set_speech_ids(self):
+        """
+        This functions sets a unique rede id for every rede element in one
+        protocoll. Id is a ten digit integer preceded by the string ID.
+        Example: ID1809900000
+        First two digits are the wahlperiode the followinf three digits are the
+        sitzungsnr (session number). The remaining digits are for counting the
+        speeches. First speech is 00100, second is 00200, eleventh is 01100 and so on.
+        Example: ID1809901100 --> eleventh speech
+        Last tow digits are for corrections.
+        """
+
+        id_counter = 000
+        speeches = self.xml_tree.xpath(".//sitzungsbeginn | .//rede")
+        for speech in tqdm(speeches, desc="Creating speech ids"):
+            id_counter_str = str(id_counter).zfill(5)
+            id = "ID" + self.filename + id_counter_str
+            speech.set("id", id)
+            id_counter += 100
+            self.logger.info(("Speech id is: " + id))
+        self.xml_tree = self.xml_tree
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from utility.FileGetter import FileGetter
+from utility.XMLProtocol import XMLProtocol
+import configparser
+from tqdm import tqdm
+
+
+def beautify_xml(case, alter_lines=False, line_width=0):
+    """
+    Beautifies the xml protocols so that they are easily readable by humans.
+    Uses .beautify_xml_part() and .beautify_xml() to be able to format lines for
+    specific parts of an xml. Alter lines can be set to Flase or True. Line
+    width that will be used if alter_lines is True can be set to any value
+    between 0 and 160.
+    """
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    if(case == "markup"):
+        output_path = config["File paths"]["output_folder"]
+        input_path = config["File paths"]["clear_speech_markup"]
+        key_name = "beautiful_xml"
+    elif(case == "nlp"):
+        output_path = config["File paths"]["nlp_output"]
+        input_path = config["File paths"]["nlp_lemmatized_tokenized"]
+        key_name = "nlp_beuatiful_xml"
+    files = FileGetter(input_path, "*.xml")
+    files = files.get_files()
+    for file_path in tqdm(sorted(files), desc="First beautification steps"):
+        xml = XMLProtocol()
+        xml.read_xml(file_path)
+        xml.beautify_xml_part(file_path, ".//vorspann")
+        xml.replace_elements(".//vorspann", [xml.beautified_part])
+        xml.beautify_xml_part(file_path, ".//sitzungsverlauf", alter_lines,
+                              line_width)
+        xml.replace_elements(".//sitzungsverlauf", [xml.beautified_part])
+        xml.save_to_file(output_path, file_path, key_name,
+                         "File paths", key_name)
+    config.read("config.ini")
+    beautiful_xmls_path = config["File paths"][key_name]
+    files = FileGetter(beautiful_xmls_path, "*.xml")
+    files = files.get_files()
+    for file_path in tqdm(files, desc="Second beautification steps"):
+        xml.beautify_xml(file_path, False)
+
+
+if __name__ == '__main__':
+    # NOTE(review): beautify_xml() has the required positional parameter
+    # `case` ("markup" or "nlp"); this bare call raises a TypeError.
+    beautify_xml()
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from utility.FileGetter import FileGetter
+from markup.MetadataMarkup import MetadataMarkup
+from tqdm import tqdm
+import os
+import configparser
+import logging
+
+
+def get_metadata():
+    """
+    This script creates a valid metadata head and first level xml tag strucutre
+    for all files in one directory with subdirs. It needs all filepaths for all
+    files to consider. File paths will be extracted by using the FileGetter
+    class.
+    After that it extracts the given metadata for one file each and writes it as
+    valid XML according to the new offical schema into a new file at the given
+    output path.
+    """
+    logger = logging.getLogger(__name__)
+    print("Running metadata creation for original XML-protocolls.")
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    input_path = config["File paths"]["input_folder_xmls"]
+    output_path = config["File paths"]["output_folder"]
+    Files = FileGetter(input_path, "*.xml")
+    file_list = Files.get_files()
+    metadata = MetadataMarkup()
+    for file in tqdm(sorted(file_list), desc="Metadata status:"):
+        logger.info("\nCreating metadata for: " + str(os.path.basename(file)))
+        root = metadata.read_protcol(file)
+        metadata.extract_metadata(root)
+        metadata.built_iso_date(metadata.datum_ger_non_iso)
+        metadata.built_date_string(metadata.datum_iso)
+        metadata.delete_old_metadata(root)
+        metadata.split_content(root)
+        metadata.insert_new_metadata(root)
+        metadata.get_session_times()
+        metadata.write_to_attr("dbtplenarprotokoll", "sitzung-datum",
+                               metadata.datum_ger_non_iso)
+        metadata.write_to_attr("dbtplenarprotokoll", "sitzung-start-uhrzeit",
+                               metadata.session_start_time)
+        metadata.write_to_attr("dbtplenarprotokol", "sitzung-ende-uhrzeit",
+                               metadata.session_end_time)
+        metadata.write_to_attr("dbtplenarprotokoll", "sitzungs-nr",
+                               metadata.sitzungsnr)
+        metadata.write_to_attr("dbtplenarprotokol", "wahlperiode",
+                               metadata.wahlperiode)
+        metadata.save_to_file(output_path, file, "new_metadata", "File paths", "new_metadata")
+        logger.info("New metadata created for: " + str(os.path.basename(file)))
+    print("Succesfully extracted and wrote new metadata to XML-protocolls.")
+
+
+# Script entry point: runs the metadata markup when executed directly.
+if __name__ == '__main__':
+    get_metadata()
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from markup.SpeakerNameMarkup import SpeakerNameMarkup
+from markup.MdBData import MdBData
+from utility.FileGetter import FileGetter
+from xml.etree import ElementTree
+from tqdm import tqdm
+import os
+import configparser
+import logging
+
+
+def get_names():
+    """
+    This script gets the identified speaker elements. It will analyse the text
+    of those to determine <vorname>, <nachname>, @id etc. for every speaker.
+    Also creates a speech id for every speech.
+    """
+    ###
+    # Setting paths in config and start logging
+    ###
+    logger = logging.getLogger(__name__)
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    xml_path = config["File paths"]["new_simple_markup"]
+    output_path = config["File paths"]["output_folder"]
+    parent_path = os.path.dirname(os.getcwd())
+    stammdatenbank_full_path = os.path.join(parent_path,
+                                            "data/MdB_data/MdB_Stammdaten.xml")
+    ###
+    # opens and reads Stammdatenbank
+    ###
+    stammdatenbank = MdBData()
+    stammdatenbank.read_xml(stammdatenbank_full_path)
+    ###
+    # Getting sets of different name name/MdB features
+    ###
+    # getting first names
+    first_names = stammdatenbank.get_set(".//VORNAME", stammdatenbank.xml_tree)
+    first_names.discard(None)
+    # getting las names
+    last_names = stammdatenbank.get_set(".//NACHNAME", stammdatenbank.xml_tree)
+    last_names.discard(None)
+    # getting academic titles
+    academic_titles = stammdatenbank.get_set(".//AKAD_TITEL",
+                                             stammdatenbank.xml_tree)
+    academic_titles_short = stammdatenbank.get_set(".//ANREDE_TITEL",
+                                                   stammdatenbank.xml_tree)
+    additional_academic_titles = [title for title in config["Additional name features"]["academic_titles"].split()]
+    for title in additional_academic_titles:
+        academic_titles.add(title)
+    academic_titles = academic_titles.union(academic_titles_short)
+    academic_titles.discard(None)
+    # getting parties
+    parties = stammdatenbank.get_set(".//PARTEI_KURZ", stammdatenbank.xml_tree)
+    additional_parties = [party for party in config["Additional name features"]["parties"].split()]
+    for party in additional_parties:
+        parties.add(party)
+    parties.discard(None)
+    # getting name affixes
+    name_affixes = stammdatenbank.get_set(".//PRAEFIX", stammdatenbank.xml_tree)
+    name_affixes.discard(None)
+    # getting cities
+    cities = stammdatenbank.get_set(".//ORTSZUSATZ", stammdatenbank.xml_tree)
+    cities.discard(None)
+    # setting empty sets to later combine them with XML node names for XPaths
+    party = set()  #
+    periode = set()  #
+    feature_complete = set()  #
+    speaker_id = set()  #
+    role_long = set()
+    role_short = set()
+    ###
+    # creating dict with tuples of sets and corresponding XML node name
+    ###
+    sets = [(first_names, "VORNAME"), (last_names, "NACHNAME"),
+            (academic_titles, "AKAD_TITEL"), (parties, "PARTEI_KURZ"),
+            (name_affixes, "PRAEFIX"), (cities, "ORTSZUSATZ"),
+            (party, "PARTEI_KURZ"), (periode, "WP"), (feature_complete, "None"),
+            (speaker_id, "ID"), (role_long, "None"), (role_short, "None")]
+    features = ["vorname", "nachname", "titel", "fraktion", "namenszusatz",
+                "ortszusatz", "partei", "wahlperiode", "feature_complete",
+                "id", "rolle_lang", "rolle_kurz"]
+    feature_set_dict = dict(zip(features, sets))
+    ###
+    # opening XML protocolls
+    # starting speaker markup for features
+    ###
+    files = FileGetter(xml_path, "*.xml")
+    files = files.get_files()
+    for file_path in tqdm(sorted(files),
+                          desc="File status"):
+        complex_speaker = SpeakerNameMarkup(file_path, ".//redner")
+        complex_speaker.read_xml(file_path)
+        complex_speaker.get_element_text()
+        logger.info(("Doing cross reference markup for names to get redner ids."
+                     + " For file: "
+                     + os.path.basename(file_path)))
+        complex_speaker.cross_reference_markup(complex_speaker.current_strings,
+                                               feature_set_dict,
+                                               stammdatenbank.xml_tree)
+        complex_speaker.create_speaker_elements()
+        complex_speaker.replace_elements(".//redner",
+                                         complex_speaker.all_speaker_elements,
+                                         True)
+        xml_string = ElementTree.tostring(complex_speaker.xml_tree)
+        bool = complex_speaker.simple_check_xml(xml_string, file_path, False,
+                                                False)
+        if(bool is False):
+            logger.error(("This XML file is not well-formed. Program stopped."
+                          " Fix or remove this file an run the program again."
+                          ))
+            print("Program has stopped. See logs for more info.")
+            break
+        complex_speaker.set_speech_ids()
+        complex_speaker.save_to_file(output_path, file_path, "complex_markup",
+                                     "File paths", "complex_markup")
+
+
+# Script entry point: runs the speaker name markup when executed directly.
+if __name__ == '__main__':
+    get_names()
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from utility.FileGetter import FileGetter
+from utility.XMLProtocol import XMLProtocol
+from markup.EntityMarkup import EntityMarkup
+from markup.SpeakerMarkup import SpeakerMarkup
+from tqdm import tqdm
+import configparser
+import logging
+import os
+
+
+def get_speakers():
+    """
+    This script identifies speakers in one xml with the new metadata structure
+    created by metastructure.py and applies well-formed XML markup to them and their
+    speeches. The markup trys to follow the official guideline from the Deutsche
+    Bundesregierung but is more simplistic and deviates from it when it comes down
+    to apply markup to the presiden of a session. This decision was made to
+    guarantee that every speakers speech only contains what he or she is saying.
+    Thus the markup follows the own minimal markup defined in the DTD
+    'minimal_markup.dtd' which trys to mimic the official one as close as
+    possible. The full offical markup cannot be applied to the XML protocolls
+    automatically. Script uses classes and subclasses from EntityMarkup.py.
+    """
+    logger = logging.getLogger(__name__)
+    print("Running simple markup for first speaker identification.")
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    regex_conf_triples = config.items("Regular expressions speakers")
+    regex_conf_triples = [regex[1].split(" ; ") for regex in regex_conf_triples]
+    input_path = config["File paths"]["new_metadata"]
+    output_path = config["File paths"]["output_folder"]
+    files = FileGetter(input_path, "*.xml")
+    file_list = files.get_files()
+    sum_matches = 0
+
+    for file_path in tqdm(sorted(file_list), desc="Speaker markup status"):
+
+        identified = EntityMarkup(file_path)
+        logger.info("Doing simple markup for: " + str(os.path.basename(file_path)))
+        logger.info("\nMarkup status for: " + str(os.path.basename(file_path)))
+        with open(file_path, 'r') as f:
+            xml_as_string = f.read()
+        xml_as_bytes = xml_as_string.encode("utf-8")
+        bool = identified.simple_check_xml(xml_as_bytes, file_path, False,
+                                           False)
+        if(bool is False):
+            logger.error(("This XML file is not well-formed. Program stopped."
+                          " Fix or remove this file an run the program again."
+                          ))
+            print("Program has stopped. See logs for more info.")
+            break
+        identified.read_xml(file_path)
+        identified.get_element_text()
+        string_for_markup = identified.current_string
+        # Start of simple markup
+        for regex_conf_triplet in regex_conf_triples:
+            regex = regex_conf_triplet[0]
+            case = regex_conf_triplet[1]
+            speaker = SpeakerMarkup(string_for_markup, regex)
+            speaker.compile_regex(regex)
+            speaker.identify_speaker()
+            speaker.markup_speaker(case)
+            string_for_markup = speaker.markuped_string
+            sum_matches += speaker.matches_count
+
+        logger.info(str(sum_matches) + " total matches in the protocol.")
+        sum_matches = 0
+        speaker.simple_check_xml(string_for_markup, file_path, False)
+        # Saving simple markuped string to xml
+        speaker.read_xml(file_path)
+        speaker.replace_string(string_for_markup, "sitzungsverlauf")
+        speaker.save_to_file(output_path, file_path, "simple_xml", "File paths",
+                             "new_simple_markup")
+
+    print("Simple markup finished.")
+
+    config.read("config.ini")
+    new_simple_xml_path = config["File paths"]["new_simple_markup"]
+    # Start of president Replacer
+    new_files = FileGetter(new_simple_xml_path, "*.xml")
+    new_file_list = new_files.get_files()
+    print("Replacing some XML-elements in the protocolls.")
+    for file_path in tqdm(sorted(new_file_list), desc="Files replacement status"):
+        logger.info("Replacing some xml elements for: " + str(os.path.basename(file_path)))
+        for regex_conf_triplet in regex_conf_triples:
+            if(regex_conf_triplet[1] != "first"
+               or regex_conf_triplet[1] != "last"):
+                regex = regex_conf_triplet[0]
+                speaker_rolle_value = regex_conf_triplet[2]
+                replacements = XMLProtocol()
+                replacements.read_xml(file_path)
+                replacements.compile_regex(regex)
+                replacements.expand_element(".//rede", "typ",
+                                            speaker_rolle_value)
+                replacements.save_to_file(output_path, file_path, "simple_xml",
+                                          "File paths", "new_simple_markup")
+        start_time_attr_value = replacements.xml_tree.get("sitzung-start-uhrzeit")
+        replacements.replace_tag_attr(".//sitzungsverlauf/rede[1]",
+                                      "sitzungsbeginn",
+                                      "sitzung-start-uhrzeit",
+                                      start_time_attr_value,
+                                      False)
+        end_time_attr_value = replacements.xml_tree.get("sitzung-ende-uhrzeit")
+        replacements.expand_element(".//sitzungsende", "sitzung-ende-uhrzeit",
+                                    end_time_attr_value, False)
+        replacements.save_to_file(output_path, file_path, "simple_xml",
+                                  "File paths", "new_simple_markup")
+
+
+# Script entry point: runs the simple speaker markup when executed directly.
+if __name__ == '__main__':
+    get_speakers()
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from utility.FileGetter import FileGetter
+from markup.EntityMarkup import EntityMarkup
+import configparser
+from tqdm import tqdm
+import logging
+
+def markup_speeches():
+    """
+    Marks up different entitys in the speech strings. For example comments.
+    First it marks speech parts (<p>) line by line.
+    """
+    logger = logging.getLogger(__name__)
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    complex_xmls = config["File paths"]["complex_markup"]
+    output_path = config["File paths"]["output_folder"]
+    regex_conf_pairs = config.items("Regular expressions speeches")
+    regex_conf_pairs = [regex[1].split(" ; ") for regex in regex_conf_pairs]
+    multiline_entities = config.items("Multiline entities")
+    multiline_entities = [regex[1].split(" ; ") for regex in multiline_entities]
+    files = FileGetter(complex_xmls, "*.xml")
+    file_list = files.get_files()
+    for file_path in tqdm(sorted(file_list), desc="File status speech markup"):
+        entity = EntityMarkup(file_path)
+        entity.read_xml(file_path)
+        speeches = entity.xml_tree.xpath(".//rede")
+        session_start = entity.xml_tree.xpath(".//sitzungsbeginn")[0]
+        for speech in speeches:
+            entity.markup_speech_lines(speech)
+        entity.markup_speech_lines(session_start)
+
+        session_lines = entity.xml_tree.xpath(".//p")
+        for line in tqdm(session_lines, desc="Marking single line entities"):
+            for pair in regex_conf_pairs:
+                entity.inject_element(line, pair[0], pair[1])
+
+        session_lines = entity.xml_tree.xpath(".//p") # gets new altered session lines (<p>)
+        for pair in tqdm(multiline_entities, desc="Marking multiline entities:"):
+            entity.get_multiline_entities(session_lines, pair[0], pair[1], pair[2])
+        # For logging
+        all_entities = 0
+        only_single_line_entities = 0
+        for pair in regex_conf_pairs:
+            element_path = ".//" + pair[1]
+            nr_entities = len(entity.xml_tree.xpath(element_path))
+            logger.info(("Number of identified " + pair[1] + " elements is: "
+                         + str(nr_entities)
+                         + " (single line)"))
+            all_entities += nr_entities
+            only_single_line_entities += nr_entities
+
+        for pair in multiline_entities:
+            element_path = ".//" + pair[2]
+            nr_entities = len(entity.xml_tree.xpath(element_path))
+            logger.info(("Number of identified " + pair[2] + " elements is: "
+                         + str(nr_entities)
+                         + " (multi line)"))
+            all_entities += nr_entities
+
+        logger.info(("Number of all identified single line entities: "
+                     + str(only_single_line_entities)))
+
+        logger.info(("Number of all identified entities is: " + str(all_entities)
+                     + " Also includes multiline matches. Number could be higher"
+                     + " than it is if multiline matches are matching the same"
+                     + " like the single line entitie regexes."))
+
+        entity.save_to_file(output_path, file_path, "clear_speech_markup",
+                            "File paths", "clear_speech_markup")
+
+
+# Script entry point: runs the speech markup when executed directly.
+if __name__ == '__main__':
+    markup_speeches()
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import de_core_news_sm
+import configparser
+from utility.XMLProtocol import XMLProtocol
+from lxml import etree
+from tqdm import tqdm
+import re
+
+
+def lemmatization(files, no_stop_words=False):
+    """
+    Lemmatizes the speeches of the input XML protocols with the built in spacy
+    lookup-table function. Can include or exclude stop words.
+    Lemmatized text will be written into an new Element named
+    <rede_lemmatisiert>. Always removes punctuation. Joines hyphenated strings
+    before they will be lemmatised.
+    """
+    nlp = de_core_news_sm.load()
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    output_path = config["File paths"]["nlp_output"]
+    for file_path in tqdm(sorted(files), desc="Lemmatization file status"):
+        xml = XMLProtocol()
+        xml.read_xml(file_path)
+        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
+        for speech in speeches:
+            parts = speech.xpath(".//p")
+            tmp_list = []
+            for part in parts:
+                if(part.text is not None):
+                    tmp_list.append(re.sub(r"_", " ", str(part.text + "\n")))
+                    """
+                    replaces "_" with " ". Is needed because a string like
+                    "Treffsicherheit einer Schrotflinte;_Sie haben nämlich kaum
+                    den Punkt getroffen" will not be lemmatized correctly in spacy.
+                    "Schrotflinte;_Sie" wil be recognized as one token.
+                    Furthermore this meeses up the sorted ngram calculation.
+                    Also adds \n at end of every line to help identifying
+                    hyphenated words.
+                    """
+                part.getparent().remove(part)
+            new_text = "".join(tmp_list)
+            new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[a-ßzäüö])", "\g<wordend>\g<wordstart>", new_text)
+            """
+            joins hyphenated words together:
+            'Länderfinanz- ausgleich' --> Länderfinanzausgleich.
+            Better to do it here because most of the comments and metadata has
+            already been marked.
+            Ignores strings like: 'Finanz-, Handels- und Sicherheitspolitik'.
+            Does not ignore them when they happen at a linebreak. This is a rare
+            occasion though.
+            """
+            new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[A-ZÄÜÖ])", "\g<wordend>-\g<wordstart>", new_text)
+            """
+            Removes all line breaks again. This way compound names with a line
+            break inbetween like "Sütterlin-\nWaack" will be recognized as one
+            string by spacy. --> Sütterlin-Waack
+            """
+            lemmatized_speech = etree.Element("rede_lemmatisiert")
+            doc = nlp(new_text)
+            if(no_stop_words is False):
+                lemmatized = " ".join([token.lemma_ for token in doc
+                                       if token.pos_ != "PUNCT" and token.text != "_"])
+                """
+                Removes "_" from text. Has to be removed
+                because it is some kind of special
+                character in spacy.
+                """
+                filename_sufix = "_lemmatized_with_stopwords.xml"
+            elif(no_stop_words is True):
+                lemmatized = " ".join([token.lemma_ for token in doc
+                                       if token.is_stop is False
+                                       and token.pos_ != "PUNCT" and token.text != "_"])
+                filename_sufix = "_lemmatized_without_stopwords.xml"
+            lemmatized_speech.text = lemmatized
+            speech.append(lemmatized_speech)
+        xml.save_to_file(output_path, file_path, "lemmatized", "File paths",
+                         "nlp_lemmatized_tokenized", filename_sufix)
+
+
+if __name__ == '__main__':
+    # NOTE(review): lemmatization() has the required positional parameter
+    # `files`; this bare call raises a TypeError.
+    lemmatization()
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import configparser
+import csv
+import os
+import gc
+from utility.XMLProtocol import XMLProtocol
+from collections import Counter
+from tqdm import tqdm
+from sklearn.feature_extraction.text import CountVectorizer
+from itertools import groupby, chain
+from operator import itemgetter
+import locale
+locale.setlocale(locale.LC_COLLATE, "C")  # Sets locale to portable "C" locale.
+
+
+def n_grams(files, group_by_feature="year",
+            input_type_name="lemmatized_without_stopwords"):
+    """
+    Clacluates 1 to 5 grams for given input protocols. Can either handel
+    lemmatized or non lemmatized files. Writes the ngrams to a tab separated csv
+    file. One row inclueds the ngram, the match count of it, the year or date,
+    or rede_id or redner_id. One file per unigram, bigram, trigram etc. per
+    group key will be created. (There wil be one file for unigrams starting with
+    the letter 'A' one for unigrams starting with 'B' etc.)
+    Third parameter is a string set by the user which will be added to
+    the file names to help distinguish lemmatized and non lemmatized ngrams etc.
+    The more protocols are used as input the more RAM the script needs.
+    For all 4106 protocols 32GB of RAM with a 32GB swap file was used!
+    """
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    output_path = config["File paths"]["nlp_output"]
+    output_path = os.path.join(output_path, "n-grams")
+    if not os.path.exists(output_path):
+        os.mkdir(output_path)
+    for step in tqdm(range(6)[1:], desc="Current ngram calculating"):
+        N_GRAMS = []
+        file_name_prefix = str(step) + "_grams"
+        counter_vectorizer = CountVectorizer(ngram_range=(step, step),
+                                             lowercase=False)
+        for file_path in tqdm(sorted(files), desc="File status"):
+            xml = XMLProtocol()
+            xml.read_xml(file_path)
+            feature_year = xml.xml_tree.xpath("@sitzung-datum")[0][-4:]
+            feature_mont_year = xml.xml_tree.xpath("@sitzung-datum")[0][-7:]
+            speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
+            for speech in speeches:
+                # gets id of current speech
+                feature_rede_id = speech.xpath("@id")
+                if(len(feature_rede_id) == 0):
+                    feature_rede_id = "sitzungsbeginn"
+                else:
+                    feature_rede_id = feature_rede_id[0]
+                # gets id of current speaker
+                feature_redner_id = speech.xpath(".//redner/@id")[0]
+                # gets speech text from tokenized or lemmatized protocol
+                speech_text = speech.xpath("node()[2]")[0]  # gets second child of speech
+                if(speech_text.text is not None):
+                    tmp_str = speech_text.text
+
+                ngrams = counter_vectorizer.build_analyzer()
+                ngrams_list = ngrams(tmp_str)
+
+                if(group_by_feature == "year"):
+                    pairs = [(pair,) + (feature_year,) for pair
+                             in ngrams_list]
+                elif(group_by_feature == "month_year"):
+                    pairs = [(pair,) + (feature_mont_year,) for pair
+                             in ngrams_list]
+                elif(group_by_feature == "speaker"):
+                    pairs = [(pair,) + (feature_redner_id,) for pair
+                             in ngrams_list]
+                elif(group_by_feature == "speech"):
+                    pairs = [(pair,) + (feature_rede_id,) for pair
+                             in ngrams_list]
+                N_GRAMS.extend(pairs)
+            speeches = None
+        # puts uppercase ngram at first position in line to sort by this
+        # will be delted later on
+        print("Start counting ngrams.")
+        N_GRAMS = Counter(N_GRAMS)
+        print("Finished counting ngrams.")
+        print("Start sorting ngrams")
+        N_GRAMS = [item[0][0][0].upper()
+                   + "||"
+                   + item[0][0]
+                   + "||"
+                   + str(item[0][1])
+                   + "||"
+                   + str(item[1])
+                   for item in N_GRAMS.items()]
+        N_GRAMS = sorted(N_GRAMS, key=locale.strxfrm)
+        print("Finished sorting ngrams")
+        # sorts all ngrams into groups one group for each german uppercasse
+        # letter except ß
+        # Also one group for every decimal from 0 to 10
+        # Other non ascii or non decimal ngrams will be sorted in own groups
+        # These groups will be joined together later on into one non ascii group
+        alphabetically = []
+        tmp_list = []
+        for letter, entries in tqdm(groupby(N_GRAMS, key=itemgetter(0)),
+                                    desc="Grouping ngrams alphabetically"):
+            if(letter):
+                print(letter)
+                for entry in entries:
+                    tmp_list.append(entry)
+            alphabetically.append(tmp_list)
+            tmp_list = []
+            N_GRAMS = None
+            gc.collect() # frees RAM
+        key_list = ([i for i in range(10)]
+                    + "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split()
+                    + ["_Non_ASCII"])
+        # groups all non ascii ngrams into one list to save them into one csv
+        if(len(alphabetically) > 37):
+            joined_tail = alphabetically[36:]
+            joined_tail = chain.from_iterable(list(joined_tail))
+            del alphabetically[36:]
+            alphabetically.append(joined_tail)
+        # save groups to individual files
+        for group, key in tqdm(zip(alphabetically, key_list),
+                               desc="Writing ngrams to files"):
+            group_ngrams = [entry.split("||")[1:] for entry in group]
+            file_name = (str(key)
+                         + "_"
+                         + file_name_prefix
+                         + "_per_"
+                         + group_by_feature
+                         + "_"
+                         + input_type_name
+                         + ".csv")
+            file_output_path = os.path.join(output_path, file_name)
+            with open(file_output_path, "w", newline="", encoding="utf8") as file:
+                writer = csv.writer(file, delimiter="\t")
+                writer.writerows(group_ngrams)
+        alphabetically = None
+
+
+if __name__ == '__main__':
+    # NOTE(review): n_grams() has the required positional parameter `files`;
+    # this bare call raises a TypeError.
+    n_grams()
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import de_core_news_sm
+import configparser
+from utility.XMLProtocol import XMLProtocol
+from lxml import etree
+from tqdm import tqdm
+import re
+
+
+def tokenize(files, no_stop_words=False):
+    """
+    Tokenizes the speeches of the input XML protocols. Can include or exclude
+    stop words. Tokenized speeches will be written into a new element
+    <rede_tokenisiert>. Always removes punctuation. Joines hyphenated strings
+    before they will be tokenized.
+    """
+    nlp = de_core_news_sm.load()
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    output_path = config["File paths"]["nlp_output"]
+    for file_path in tqdm(sorted(files), desc="Tokenization file status"):
+        xml = XMLProtocol()
+        xml.read_xml(file_path)
+        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
+        for speech in speeches:
+            parts = speech.xpath(".//p")
+            tmp_list = []
+            for part in parts:
+                if(part.text is not None):
+                    tmp_list.append(re.sub(r"_", " ", str(part.text + "\n")))
+                    """
+                    replaces "_" with " ". Is needed because a string like
+                    "Treffsicherheit einer Schrotflinte;_Sie haben nämlich kaum
+                    den Punkt getroffen" will not be lemmatized correctly in spacy.
+                    "Schrotflinte;_Sie" wil be recognized as one token.
+                    Furthermore this meeses up the sorted ngram calculation.
+                    Also adds \n at end of every line to help identifying
+                    hyphenated words.
+                    """
+                part.getparent().remove(part)
+            new_text = "".join(tmp_list)
+            new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[a-ßzäüö])", "\g<wordend>\g<wordstart>", new_text)
+            """
+            joins hyphenated words together:
+            'Länderfinanz- ausgleich' --> Länderfinanzausgleich.
+            Better to do it here because most of the comments and metadata has
+            already been marked.
+            Ignores strings like: 'Finanz-, Handels- und Sicherheitspolitik'.
+            Does not ignore them when they happen at a linebreak. This is a rare
+            occasion though.
+            """
+            new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[A-ZÄÜÖ])", "\g<wordend>-\g<wordstart>", new_text)
+            """
+            Removes all line breaks again. This way compound names with a line
+            break inbetween like "Sütterlin-\nWaack" will be recognized as one
+            string by spacy. --> Sütterlin-Waack
+            """
+            tokenized_speech = etree.Element("rede_tokenisiert")
+            doc = nlp(new_text)
+            if(no_stop_words is False):
+                tokenized = " ".join([token.text for token in doc
+                                      if token.pos_ != "PUNCT"])
+                filename_sufix = "_tokenized_with_stopwords.xml"
+            elif(no_stop_words is True):
+                tokenized = " ".join([token.text for token in doc
+                                      if token.is_stop is False
+                                      and token.pos_ != "PUNCT"])
+                filename_sufix = "_tokenized_without_stopwords.xml"
+            tokenized_speech.text = tokenized
+            speech.append(tokenized_speech)
+        xml.save_to_file(output_path, file_path, "tokenized", "File paths",
+                         "nlp_lemmatized_tokenized", filename_sufix)
+
+
+# NOTE(review): tokenize() has the required positional parameter "files", so
+# executing this module directly raises a TypeError as written.
+if __name__ == '__main__':
+    tokenize()
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import fnmatch
+import argparse
+import random
+import shutil
+
+"""
+This is just a quick script to get randomized samples from the protocols.
+"""
+
+
+def parse_arguments():
+    """Argument Parser"""
+    parser = argparse.ArgumentParser(description="Creates samples from given   \
+                                     directory with given size. Creates two    \
+                                     samples with no overlapping.")
+    parser.add_argument("-p",
+                        "--path",
+                        help="Path to data files to create sample from.",
+                        required=True,
+                        type=str,
+                        metavar="")
+    parser.add_argument("-s",
+                        "--size",
+                        help="Size of sample.",
+                        required=True,
+                        type=int,
+                        metavar="")
+    parser.add_argument("-n", "--number_of_samples",
+                        help="How many smaples should be created? should be    \
+                        created?",
+                        required=True,
+                        type=int,
+                        metavar="")
+    parser.add_argument("-t",
+                        "--file_type",
+                        help="What file types should be used as the base for   \
+                        the sample?. Accepts wildcars.",
+                        required=True,
+                        type=str)
+    args = parser.parse_args()
+    return args
+
+
+def get_files(path, file_type):
+    """Creates file list with full paths of all files in the given directory and
+    its sub directories and returns it."""
+    list_of_files = []
+    for path, subdirs, files in os.walk(path):
+        for name in files:
+            if fnmatch.fnmatch(name, file_type):
+                list_of_files.append(os.path.join(path, name))
+    return list_of_files
+
+
+def get_files_to_copy(list_of_files, sample_size):
+    """Gets random filepaths from all filepaths to create a sample out of those.
+    Filepaths that have already been use will be removed from the file list to
+    create independent sampels."""
+    counter = 0
+    sample_list = []
+    while counter < sample_size:
+        counter += 1
+        random_index = random.randint(0, len(list_of_files)-1)
+        sample_list.append(list_of_files[random_index])
+        del list_of_files[random_index]
+        pass
+    return list_of_files, sample_list
+
+
+def copy_files(path, sample_list, step_int):
+    """Copys the given files to new directories."""
+    sample_path = os.path.join(path, str(step_int))
+    print(sample_path)
+    os.mkdir(sample_path)
+    for file in sample_list:
+        shutil.copy2(file, sample_path)
+
+
+def main():
+    args = parse_arguments()
+    path = args.path
+    file_list = get_files(path, args.file_type)
+    for step in range(1, args.number_of_samples + 1):
+        file_list = get_files_to_copy(file_list, args.size)[0]
+        sample_list = get_files_to_copy(file_list, args.size)[1]
+        copy_files(path, sample_list, step)
+        file_list = get_files_to_copy(file_list, args.size)[0]
+
+
+# Call main() only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import fnmatch
+
+"""
+This class is for getting filepaths of all files in a given directory. Also
+gets files in subdirectories.
+"""
+
+
+class FileGetter(object):
+    """
+    Class for getting file paths of given path wich will be opend and/or
+    further processed later on.
+    """
+
+    def __init__(self, path, file_type):
+        super(FileGetter, self).__init__()
+        self.path = path
+        self.file_type = file_type
+
+    def get_files(self):
+        """
+        Creates file list with full paths of all files in the given
+        directory and its sub directories and returns it.
+        """
+        list_of_files = []
+        for path, subdirs, files in os.walk(self.path):
+            for name in files:
+                if fnmatch.fnmatch(name, self.file_type):
+                    list_of_files.append(os.path.join(path, name))
+        self.list_of_files = list_of_files
+        return list_of_files
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from utility import delete_folder
+from utility import update_config
+from xml.etree import ElementTree
+from os import path
+from lxml import etree
+import os
+import logging
+import re
+
+
+class XMLProtocol(object):
+    """Class for standard operations on/with the XML protocols. Has functions
+    for reading, saving and manipulationg an XML protocol. All other classes
+    inherit from this one.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.logger = logging.getLogger(__name__)
+
+    def read_protcol(self, file_path):
+        """
+        Takes a file path and parses the file as an XML returns a root element.
+        """
+        self.file_path = file_path
+        self.filename = os.path.basename(self.file_path)
+        parser = etree.XMLParser(remove_blank_text=True)
+        self.tree = etree.parse(file_path, parser)  # for better xml indentation
+        root = self.tree.getroot()
+        self.logger.info("File successfully parsed as XML.")
+        return root
+
+    def read_xml(self, file_path):
+        """Takes a file path and parses the file as an XML."""
+        parser = etree.XMLParser(encoding='utf-8', remove_blank_text=True)
+        tree = etree.parse(file_path, parser)  # for better xml indentation
+        self.xml_tree = tree.getroot()
+
+    def save_to_file(self, output_path, file_path, subfolder, config_section,
+                     config_key, filename_sufix=""):
+        """
+        Writes the new markup to a new xml file. Takes the output path and
+        creates a new folder there. Also updates the config file with the new
+        path.
+        """
+        if(filename_sufix == ""):
+            self.filename = path.basename(file_path)
+        elif(filename_sufix != ""):
+            self.filename = path.basename(file_path)[:-4] + filename_sufix
+        save_path = os.path.join(output_path, subfolder)
+        if not os.path.exists(save_path):
+            os.mkdir(save_path)
+        tree = etree.ElementTree(self.xml_tree)
+        new_filename = self.filename
+        save_file_path = os.path.join(save_path, new_filename)
+        tree.write(save_file_path,
+                   pretty_print=True,
+                   xml_declaration=True,
+                   encoding="utf8",
+                   doctype="<!DOCTYPE dbtplenarprotokoll SYSTEM 'dbtplenarprotokoll_minimal.dtd\'>")
+        self.logger.info("New XML saved to:" + save_file_path)
+        update_config.update_config("config.ini", config_section, config_key,
+                                    save_path)
+
+    def beautify_xml_part(self, file_path, xpath, alter_lines=False,
+                          line_width=80):
+        """
+        Beautifies part (element node) of an input XML.
+        """
+        tmp_path = os.path.join(os.path.dirname(file_path), "tmp")
+        tree = etree.ElementTree(self.xml_tree)
+        self.beautified_part = tree.find(xpath)
+        self.beautified_part = ElementTree.tostring(self.beautified_part)
+        self.beautified_part = etree.fromstring(self.beautified_part)
+        self.beautified_part = etree.ElementTree(self.beautified_part)
+        if not os.path.exists(tmp_path):
+            os.mkdir(tmp_path)
+        tmp_file_path = os.path.join(tmp_path, "tmp.xml")
+        self.beautified_part.write(tmp_file_path,
+                                   pretty_print=True,
+                                   xml_declaration=True,
+                                   encoding="utf8")
+        if(alter_lines is True):
+            os.system("html-beautify -r -q -w {} --no-preserve-newlines {}".format(line_width, tmp_file_path))
+            self.beautified_part = etree.parse(tmp_file_path).getroot()
+        elif(alter_lines is False):
+            os.system("html-beautify -r -q {}".format(tmp_file_path))
+            self.beautified_part = etree.parse(tmp_file_path).getroot()
+        update_config.update_config("config.ini", "File paths", "tmp_path",
+                                    tmp_path)
+        delete_folder.delete_folder(tmp_path)
+
+    def beautify_xml(self, file_path, alter_lines=False, line_width=80):
+        if(alter_lines is True):
+            os.system("html-beautify -r -q -w {} --no-preserve-newlines {}".format(line_width, file_path))
+        elif(alter_lines is False):
+            os.system("html-beautify -r -q {}".format(file_path))
+
+    def expand_element(self, element_to_expand, expand_attr_key,
+                       expand_attr_value, check_child=True):
+        """
+        This function takes an XPath expression for an xml element.
+        The tag of this element will be expanded with the given
+        expand_attrkey and expand_attr_value. Also needs a regex to determine if
+        the current selected element is an element which should be replaced.
+        For this the text of the first child of the current element is checked
+        against the given regex. Per default the child element text of the
+        current element is checked wether the regex matches the string or not.
+        Set check_child to False to avoid this and just expand the current
+        element.
+        """
+        elements = self.xml_tree.findall(element_to_expand)
+        for element in elements:
+            if(check_child is True):
+                first_child = element.getchildren()[0]
+                match = self.regex_compiled.search(first_child.text)
+                if(match):
+                    element.set(expand_attr_key, expand_attr_value)
+                self.xml_tree = self.xml_tree
+            else:
+                element.set(expand_attr_key, expand_attr_value)
+                self.xml_tree = self.xml_tree
+
+    def replace_tag_name(self, element_to_replace, tag_name, check_child=True):
+        """
+        Replaces a given element tag(as XPath) name with a new tag name.
+        """
+        elements = self.xml_tree.findall(element_to_replace)
+        for element in elements:
+            if(check_child is True):
+                first_child = element.getchildren()[0]
+                match = self.regex_compiled.search(first_child.text)
+                if(match):
+                    element.tag = tag_name
+            else:
+                element.tag = tag_name
+        self.xml_tree = self.xml_tree
+
+    def replace_tag_attr(self, element_to_replace, tag_name, attr_key,
+                         attr_value, check_child=True):
+        """
+        Replaces tag name of given element(as XPath) with new name and adds an
+        attribute Can also check if the child of the current element contains
+        some specific text like in the expand_element function.
+        """
+        elements = self.xml_tree.findall(element_to_replace)
+        for element in elements:
+            if(check_child is True):
+                first_child = element.getchildren()[0]
+                match = self.regex_compiled.search(first_child.text)
+                if(match):
+                    element.tag = tag_name
+                    element.set(attr_key, attr_value)
+            else:
+                element.tag = tag_name
+                element.set(attr_key, attr_value)
+        self.xml_tree = self.xml_tree
+
+    def replace_elements(self, elements_to_replace, replacment_elements,
+                         keep_parent_text=False):
+        """
+        Replaces elements identifeid by XPath with new elements. Can either keep
+        the text of the parent element or not.
+        """
+        elements = self.xml_tree.findall(elements_to_replace)
+        parents_text_xpath = elements_to_replace + "/" + "parent::node()" + "/" + "text()"
+        elements_text = self.xml_tree.xpath(parents_text_xpath)
+        if(len(elements) == len(replacment_elements)):
+            if(keep_parent_text is False):
+                for element, replacement_element in zip(elements, replacment_elements):
+                    element.getparent().replace(element, replacement_element)
+            else:
+                for element, replacement_element in zip(elements, replacment_elements):
+                    element.getparent().replace(element, replacement_element)
+                self.xml_tree = self.xml_tree
+                elements = self.xml_tree.findall(elements_to_replace)
+                for element, text in zip(elements, elements_text):
+                    element.tail = text
+            self.xml_tree = self.xml_tree
+        else:
+            self.logger.warning(("Elements missmatch. There are "
+                                 + str(len(elements))
+                                 + " that should be repalced."
+                                 + " There are " + str(len(replacment_elements))
+                                 + " present."
+                                 + " No elements have been replaced."))
+
+    def compile_regex(self, regex):
+        self.regex_string = regex
+        """
+        Takes the input regex string and compiles it for better performance
+        and redability.
+        """
+        self.regex_compiled = re.compile(self.regex_string, re.MULTILINE)
+
+    def clean_text(self, regex, xpath, replacement_string="",):
+        """
+        Replaces regex matches with nothing by default or replacement string
+        for an element matched by the xpath in the xml_tree. Works with
+        matchgroups.
+        """
+        elements = self.xml_tree.xpath(xpath)
+        for element in elements:
+            replaced = re.sub(regex, replacement_string, element.text)
+            element.text = replaced
+        self.xml_tree = self.xml_tree
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import shutil
+
+
+def delete_folder(folder_path):
+    """
+    Deletes folder idetified by input folder path string.
+    """
+    shutil.rmtree(folder_path)
+
+
+# NOTE(review): delete_folder() has the required positional parameter
+# "folder_path", so executing this module directly raises a TypeError as
+# written.
+if __name__ == '__main__':
+    delete_folder()
@@ -0,0 +1,22 @@
+import os
+
+"""
+Helper script to move n_gram csvs to seperate folders. Just copy this into the
+folder containing the n-grams and execute it. Change n to number of N in N-grams.
+"""
+current_path = os.getcwd()
+files = []
+n = 5
+for file in os.listdir(current_path):
+    if file.endswith(".csv"):
+        files.append(file)
+files = sorted(files)
+
+dir_list = ["1_grams", "2_grams", "3_grams", "4_grams", "5_grams"][:n]
+for dir in dir_list:
+    os.system("mkdir {}".format(dir))
+
+for step, dir in zip(range(0, n), dir_list):
+    for file in files[step::n]:
+        print(file)
+        os.system("mv {} {}".format(file, dir))
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import configparser
+
+
+def update_config(file_name, section, key, value):
+    """
+    This script updates a config file identified by file_name. Updates the data
+    of one key value pair in a specific section.
+    """
+    config = configparser.ConfigParser()
+    config.read(file_name)
+    file = open(file_name, "w")
+    config.set(section, key, value)
+    config.write(file)
+    file.close()
+
+
+# NOTE(review): update_config() has four required positional parameters, so
+# executing this module directly raises a TypeError as written.
+if __name__ == '__main__':
+    update_config()
@@ -0,0 +1,5 @@
+# Metadaten
+
+Quelle der Strukturdefinition: https://www.bundestag.de/blob/577234/f9159cee3e045cbc37dcd6de6322fcdd/dbtplenarprotokoll_kommentiert-data.pdf
+Heruntergeladen am: 06.11.2018
+
@@ -0,0 +1,7 @@
+# Bundesdata
+lxml==4.2.5
+Babel==2.6.0
+tqdm==4.28.1
+spacy==2.0.18
+https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.0.0/de_core_news_sm-2.0.0.tar.gz
+scikit-learn[alldeps]==0.20.2