commit 4263e5f41ec1dac8c3f04d1a09b7d31f9fe20e03
Author: Stephan Porada
Date: Thu Feb 21 19:29:44 2019 +0100

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..a0c0ae1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+data/*
+.idea/*
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..954e2f4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,72 @@
+# Master_thesis
+Master thesis repository.
+
+## Required packages and languages
+
+- Python 3.7+
+- Python packages are installed via requirements.txt. See installation step 7.
+
+## Installation
+1. Make sure the package `python3.7-dev` is installed. If not: `sudo apt-get install python3.7-dev`
+2. Install _virtualenv_ via `pip install virtualenv`, or use your distribution's package manager.
+3. Install JS Beautifier system-wide: `sudo npm -g install js-beautify` (Optional! This step, and the markup step that needs this package, can be skipped; without it there will be no pretty-printed XML files, though.)
+4. Create a virtual environment for the project: `virtualenv --python=python3.7 path/to/folder`
+5. Activate the virtual environment: `source path/to/folder/bin/activate`
+6. `cd path/to/repository`
+7. Install the dependencies: `pip install -r requirements.txt`
+
+## Example script invocations:
+
+### @Home
+- `source ~/VirtualEnvs/bundesdata/bin/activate`
+- `cd ~/Documents/Eigene\ geschriebene\ Programme/master_thesis/bundesdata/`
+
+#### Development data
+
+**Metadata**
+- `python markup/metastructure.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data/development_data_xml -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data`
+
+**Speakers**
+- `python markup/speakers.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data/xml_new_metadata_structure -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data`
+
+#### Full data
+
+**Metadata**
+- `python markup/metastructure.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/protocols_raw_xml -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data`
+
+**Speakers**
+- `python markup/speakers.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/xml_new_metadata_structure -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data`
+
+### @Uni
+
+#### Development data
+- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
+- `cd /home/stephan/Repos/master_thesis/bundesdata`
+
+**Speakers**
+- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/working_data/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data`
+
+**Metadata**
+- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/working_data/development_data_xml -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data`
+
+#### Test data
+- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
+- `cd /home/stephan/Repos/master_thesis/bundesdata`
+
+**Speakers**
+- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/working_data/test/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data/test`
+
+**Metadata**
+- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/working_data/test_data_xml -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data/test`
+
+#### Full data
+- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
+- `cd /home/stephan/Repos/master_thesis/bundesdata`
+
+**Speakers**
+- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data`
+
+**Metadata**
+- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/protocols_raw_xml -f *.xml -o /home/stephan/Repos/master_thesis/data`
diff --git a/bundesdata_markup_nlp/__init__.py b/bundesdata_markup_nlp/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/bundesdata_markup_nlp/bundesdata_markup.py b/bundesdata_markup_nlp/bundesdata_markup.py
new file mode 100755
index 0000000..ce4dd31
--- /dev/null
+++ b/bundesdata_markup_nlp/bundesdata_markup.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from markup import metadata, speakers, speaker_names, speeches
+from utility import update_config
+from markup import beautify_markup
+from utility import delete_folder
+import argparse
+import time
+import configparser
+from datetime import datetime
+import logging
+import os
+
+"""
+This is the main script handling the automatic markup of the protocols. Needs
+some user input specified in parse_arguments().
+"""
+
+
+def parse_arguments():
+    """
+    Argument parser.
+    """
+    parser = argparse.ArgumentParser(description="Starts the markup process of \
+                                     the XML protocols. Uses either the input \
+                                     and output paths currently specified in \
+                                     the config file or the paths set when \
+                                     calling the script from the terminal with \
+                                     the flag argument '-sp' or '--set_paths'. \
+                                     Using this parameter writes the given \
+                                     paths into the config file. \
+                                     Some steps of the markup process can be \
+                                     skipped with the corresponding parameters \
+                                     if they have already been executed \
+                                     once while using the -kt option. \
+                                     ")
+    parser.add_argument("-sp",
+                        "--set_paths",
+                        nargs=2,
+                        help="User can set the input and output paths for the \
+                        files created during the markup. The paths will be \
+                        written to the config file.",
+                        required=False,
+                        type=str,
+                        metavar=("input_path", "output_path"))
+    parser.add_argument("-sm",
+                        "--skip_metadata",
+                        help="Skips the script creating metadata and the first \
+                        XML structure.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-ss",
+                        "--skip_simple_speakers",
+                        help="Skips the script creating the first simple \
+                        speaker markup.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-sn",
+                        "--skip_name_markup",
+                        help="Skips the script creating the name markup.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-ssp",
+                        "--skip_speeches",
+                        help="Skips the script creating markup inside of \
+                        speeches.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-sb",
+                        "--skip_beautify_xml",
+                        help="Skips the script creating beautiful XML files.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-kt",
+                        "--keep_tmp_files",
+                        help="Keeps all temporary XML files being created \
+                        during the entire markup process. Using this flag is \
+                        needed when skipping steps of the entire markup during \
+                        a rerun of the script. 
\
+                        If this is not set, temporary files will always be \
+                        deleted.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-fr",
+                        "--fresh_run",
+                        help="Deletes all temporary folders in the output folder \
+                        and deletes all paths saved in the config file \
+                        before starting the markup process. The user has to set\
+                        the paths again with -sp.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-la",
+                        "--log_all",
+                        help="If set, the program will log all information \
+                        about the markup process (statistics etc.). Otherwise \
+                        it only logs errors and warnings.",
+                        action="store_true",
+                        required=False)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    """
+    Main function calling all other scripts for the automatic markup of the
+    protocols.
+    """
+    args = parse_arguments()
+    if(args.log_all is True):
+        level = logging.INFO
+    elif(args.log_all is False):
+        level = logging.WARNING
+    logging.basicConfig(filename="logs/bundesdata.log", level=level,
+                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
+                        datefmt='%Y/%m/%d %H:%M:%S',
+                        filemode="w")
+    logger = logging.getLogger(__name__)
+    start_time = datetime.now()
+    print("Start time of script is:", start_time)
+    print("Info and status about the markup process can be found in:",
+          "logs/bundesdata.log")
+    logger.info("Start time of script is: " + str(start_time))
+
+    # Deletes the output folder and all folders inside it.
+    # Also removes all path options from the section "File paths".
+    if(args.fresh_run is True):
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+        options = config.items("File paths")
+        for option in options:
+            if(option[0] == "output_folder"):
+                try:
+                    delete_folder.delete_folder(option[1])
+                except FileNotFoundError:
+                    pass
+            else:
+                config.remove_option("File paths", option[0])
+        with open("config.ini", 'w') as out:
+            config.write(out)
+
+    # Sets the paths and creates the output folder.
+    if(args.set_paths):
+        input_path = args.set_paths[0]
+        output_path = os.path.join(args.set_paths[1], "output")
+        if not os.path.exists(output_path):
+            os.mkdir(output_path)
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+        update_config.update_config("config.ini", "File paths",
+                                    "input_folder_xmls", input_path)
+        update_config.update_config("config.ini", "File paths",
+                                    "output_folder", output_path)
+
+    if(args.skip_metadata is not True):
+        print("Starting metadata extraction and markup.")
+        metadata.get_metadata()
+        print("Metadata creation and content splits finished.")
+    elif(args.skip_metadata is True):
+        print("Skipping script metadata.py.")
+
+    time.sleep(1)
+    if(args.skip_simple_speakers is not True):
+        print("Starting first simple speeches and speaker markup.")
+        speakers.get_speakers()
+        print("Finished simple markup.")
+    elif(args.skip_simple_speakers is True):
+        print("Skipping script speakers.py.")
+
+    time.sleep(1)
+    if(args.skip_name_markup is not True):
+        print("Starting complex markup of speaker names.")
+        speaker_names.get_names()
+        print("Finished complex name markup. (names etc.)")
+    elif(args.skip_name_markup is True):
+        print("Skipping script speaker_names.py.")
+
+    time.sleep(1)
+    if(args.skip_speeches is not True):
+        print("Starting markup of comments etc. in speeches.")
+        speeches.markup_speeches()
+        print("Finished markup of comments etc. 
in speeches.")
+    elif(args.skip_speeches is True):
+        print("Skipping script speeches.py.")
+
+    time.sleep(1)
+    if(args.skip_beautify_xml is not True):
+        print("Starting to prettify the XMLs.")
+        beautify_markup.beautify_xml("markup")
+        print("Prettified the XMLs.")
+    elif(args.skip_beautify_xml is True):
+        print("Skipping script beautify_markup.py.")
+
+    if(args.keep_tmp_files is not True):
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+        folder_paths = []
+        folder_paths.append(config["File paths"]["new_metadata"])
+        folder_paths.append(config["File paths"]["new_simple_markup"])
+        folder_paths.append(config["File paths"]["complex_markup"])
+        folder_paths.append(config["File paths"]["clear_speech_markup"])
+        for folder_path in folder_paths:
+            delete_folder.delete_folder(folder_path)
+
+    end_time = datetime.now()
+    print("End time of script is:", str(end_time))
+    logger.info("End time of script is: " + str(end_time))
+    duration = end_time - start_time
+    print("Duration of script is:", duration)
+    logger.info("Script duration is: " + str(duration))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/bundesdata_markup_nlp/bundesdata_nlp.py b/bundesdata_markup_nlp/bundesdata_nlp.py
new file mode 100755
index 0000000..5a5661a
--- /dev/null
+++ b/bundesdata_markup_nlp/bundesdata_nlp.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import configparser
+import os
+import logging
+from utility.FileGetter import FileGetter
+from utility import update_config
+from utility import delete_folder
+from markup import beautify_markup
+from nlp import tokenize, lemmatization, n_grams
+from datetime import datetime
+
+"""
+This script handles the tokenization, lemmatization and n-gram calculation of
+the input protocols. Needs some user input specified in parse_arguments().
+"""
+
+
+def parse_arguments():
+    """
+    Argument parser.
+    """
+    parser = argparse.ArgumentParser(description="Starts the nlp analysis of \
+                                     the newly created XML protocols")
+    parser.add_argument("-sp",
+                        "--set_paths",
+                        nargs=2,
+                        help="User can set the input and output paths for the \
+                        files created during the nlp process. The paths will be\
+                        written to the config file.",
+                        required=False,
+                        type=str,
+                        metavar=("input_path", "output_path"))
+    parser.add_argument("-fr",
+                        "--fresh_run",
+                        help="Deletes all temporary folders and output folders \
+                        created during a previous nlp run before this one \
+                        starts.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-sb",
+                        "--skip_beautify_xml",
+                        help="Skips the script creating beautiful XML files.",
+                        action="store_true",
+                        required=False)
+    parser.add_argument("-ns",
+                        "--no_stop_words",
+                        help="If this is used, the lemmatization or tokenization\
+                        of the input protocols will exclude stop words.",
+                        required=False,
+                        action="store_true")
+    group = parser.add_mutually_exclusive_group(required=False)
+    group.add_argument("-lm",
+                       "--lemmatize",
+                       help="Lemmatizes the XML protocols in the input directory\
+                       and saves them into the output directory.",
+                       action="store_true",
+                       required=False)
+    group.add_argument("-tn",
+                       "--tokenize",
+                       help="Tokenizes the XML protocols in the input directory\
+                       and saves them into the output directory.",
+                       action="store_true",
+                       required=False)
+    group.add_argument("-cn",
+                       "--calculate_n_grams",
+                       nargs=2,
+                       help="Calculates n-grams for any tokenized or lemmatized\
+                       XML protocol created by this script. 
\
+                       feature_to_group_n_grams_by can be set to the following:\
+                       'year','month_year', 'speaker' or 'speech'.",
+                       required=False,
+                       type=str,
+                       metavar=("feature_to_group_n_grams_by", "input_type_name"))
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    # logging and start time
+    logging.basicConfig(filename="logs/bundesdata_nlp.log", level=logging.INFO,
+                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
+                        datefmt='%Y/%m/%d %H:%M:%S',
+                        filemode="w")
+    logger = logging.getLogger(__name__)
+    start_time = datetime.now()
+    print("Start time of script is:", start_time)
+    print("Info and status about the nlp process can be found in:",
+          "logs/bundesdata_nlp.log")
+    logger.info("Start time of script is: " + str(start_time))
+    # get arguments
+    args = parse_arguments()
+    # reads the config
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    # if fresh_run is true, the directory nlp_output will be deleted
+    if(args.fresh_run is True):
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+        options = config.items("File paths")
+        for option in options:
+            if(option[0] == "nlp_output"):
+                try:
+                    delete_folder.delete_folder(option[1])
+                except FileNotFoundError:
+                    pass
+            else:
+                config.remove_option("File paths", option[0])
+        with open("config.ini", 'w') as out:
+            config.write(out)
+
+    # creates the output folder if it does not exist and writes its path to the config
+    if(args.set_paths):
+        output_path = os.path.join(args.set_paths[1], "nlp_output")
+        if not os.path.exists(output_path):
+            os.mkdir(output_path)
+        update_config.update_config("config.ini", "File paths",
+                                    "nlp_output", output_path)
+    else:
+        output_path = config["File paths"]["nlp_output"]
+        if not os.path.exists(output_path):
+            os.mkdir(output_path)
+        update_config.update_config("config.ini", "File paths",
+                                    "nlp_output", output_path)
+    # gets the file path list of input files and writes the input folder path to the config
+    if(args.set_paths):
+        input_path = args.set_paths[0]
+        update_config.update_config("config.ini", "File paths",
+                                    "nlp_input", input_path)
+    elif(args.calculate_n_grams):
+        input_path = config["File paths"]["nlp_beuatiful_xml"]
+    else:
+        input_path = config["File paths"]["nlp_input"]
+    files = FileGetter(input_path, "*.xml")
+    files = files.get_files()
+    # if statements deciding which script will be executed
+    if(args.lemmatize is True and args.no_stop_words is True):
+        print("Starting lemmatization excluding stop words.")
+        lemmatization.lemmatization(files, True)
+        print("Finished lemmatization excluding stop words.")
+    elif(args.lemmatize is True and args.no_stop_words is False):
+        print("Starting lemmatization including stop words.")
+        lemmatization.lemmatization(files)
+        print("Finished lemmatization including stop words.")
+
+    if(args.tokenize is True and args.no_stop_words is True):
+        print("Starting tokenization excluding stop words.")
+        tokenize.tokenize(files, True)
+        print("Finished tokenization excluding stop words.")
+    elif(args.tokenize is True and args.no_stop_words is False):
+        print("Starting tokenization including stop words.")
+        tokenize.tokenize(files)
+        print("Finished tokenization including stop words.")
+
+    if(args.calculate_n_grams):
+        print("Starting calculation of n-grams for input files.")
+        n_grams.n_grams(files, args.calculate_n_grams[0], args.calculate_n_grams[1])
+        print("Finished calculation of n-grams for input files.")
+
+    # beautify only if something was lemmatized or tokenized and -sb is not set
+    if(args.skip_beautify_xml is not True and (args.lemmatize is True
+       or args.tokenize is True)):
+        print("Starting to prettify the XMLs.")
+        
beautify_markup.beautify_xml("nlp", True, 80) + print("Prettyfied the xmls.") + elif(args.skip_beautify_xml is True): + print("Skipping script beautify_markup.py.") + + end_time = datetime.now() + print("End time of script is:", str(end_time)) + logger.info("End time of script is: " + str(end_time)) + duration = end_time - start_time + print("Duration of script is:", duration) + logger.info("Script duration is: " + str(duration)) + + +if __name__ == '__main__': + main() diff --git a/bundesdata_markup_nlp/config.ini b/bundesdata_markup_nlp/config.ini new file mode 100755 index 0000000..893918d --- /dev/null +++ b/bundesdata_markup_nlp/config.ini @@ -0,0 +1,47 @@ +[Regular expressions time extraction] +session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.) +session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\)) + +[Regular expressions splits] +session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:) +attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\))) + +[Regular expressions speakers] +speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident +speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär +speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister +speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller +speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident +speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB +speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister +speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler +speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer +speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter +end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. 
]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt + +[Additional name features] +academic_titles = Dr. Dr. h. c. ; Dr. h. c. +parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P. + +[Regular expressions speeches] +comments = \B\([^\(\)]*\)\B ; kommentar +date_string = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]*|[\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]* ; metadata + +[Multiline entities] +multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar + +[File paths] +nlp_output = /home/stephan/Desktop/tmp_test/nlp_output +nlp_input = /home/stephan/Desktop/tmp_test/nlp_output/nlp_beuatiful_xml/ +nlp_lemmatized_tokenized = /home/stephan/Desktop/tmp_test/nlp_output/lemmatized +tmp_path = /home/stephan/Desktop/tmp_test/nlp_output/lemmatized/tmp +nlp_beuatiful_xml = /home/stephan/Desktop/tmp_test/nlp_output/nlp_beuatiful_xml +input_folder_xmls = /home/stephan/Desktop/tmp_test/protocols/ +output_folder = /home/stephan/Desktop/tmp_test/output +new_metadata = /home/stephan/Desktop/tmp_test/output/new_metadata +new_simple_markup = /home/stephan/Desktop/tmp_test/output/simple_xml +complex_markup = /home/stephan/Desktop/tmp_test/output/complex_markup +clear_speech_markup = /home/stephan/Desktop/tmp_test/output/clear_speech_markup +beautiful_xml = /home/stephan/Desktop/tmp_test/output/beautiful_xml +fixed_markup = /home/stephan/Repos/master_thesis/data/working_data/id_fixed/fixed_markup + diff --git a/bundesdata_markup_nlp/config_(backup).ini b/bundesdata_markup_nlp/config_(backup).ini new file mode 100755 index 0000000..0fcdcfb --- /dev/null +++ b/bundesdata_markup_nlp/config_(backup).ini @@ -0,0 +1,46 @@ +[Regular expressions time extraction] +session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.) +session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. 
]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\)) + +[Regular expressions splits] +session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:) +attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\))) + +[Regular expressions speakers] +speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident +speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär +speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister +speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller +speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident +speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB +speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister +speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler +speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer +speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter +end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt + +[Additional name features] +academic_titles = Dr. Dr. h. c. ; Dr. h. c. +parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P. + +[Regular expressions speeches] +comments = \B\([^\(\)]*\)\B ; kommentar +date_string = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]*|[\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. 
(?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]* ; metadata + +[Multiline entities] +multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar + +[File paths] +nlp_output = /home/stephan/Desktop/nlp_output +nlp_input = /home/stephan/Repos/master_thesis_data/data/outputs/outputs_markup/development_data/beautiful_xml/ +nlp_lemmatized_tokenized = /home/stephan/Desktop/nlp_output/lemmatized +tmp_path = /home/stephan/Repos/master_thesis/data/working_data/output/clear_speech_markup/tmp +nlp_beuatiful_xml = /home/stephan/Desktop/nlp_output/nlp_beuatiful_xml +input_folder_xmls = /home/stephan/Repos/master_thesis/data/working_data/sub_set/ +output_folder = /home/stephan/Repos/master_thesis/data/working_data/output +new_metadata = /home/stephan/Repos/master_thesis/data/working_data/output/new_metadata +new_simple_markup = /home/stephan/Repos/master_thesis/data/working_data/output/simple_xml +complex_markup = /home/stephan/Repos/master_thesis/data/working_data/output/complex_markup +clear_speech_markup = /home/stephan/Repos/master_thesis/data/working_data/output/clear_speech_markup +beautiful_xml = /home/stephan/Repos/master_thesis/data/working_data/output/beautiful_xml + diff --git a/bundesdata_markup_nlp/config_readme.md b/bundesdata_markup_nlp/config_readme.md new file mode 100755 index 0000000..691415d --- /dev/null +++ b/bundesdata_markup_nlp/config_readme.md @@ -0,0 +1,105 @@ +[Regular expressions time extraction] +# These regular expressions are used to extract the start and ending time of one +# session. The regular expressions are kind of complex because they have to catch +# a lot of human errors. To catch those errors the expression is repeatedly +# "chained" by using the or statement with only minor differences between each +# expression. This is the easiest way though to catch as many times as possible. +# The expressions match the partial strings where the start or end time is mentioned. +# With different match groups the hours and minutes will then be extracted. + +# START TIME: Matches the start time. +session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.) + +# END TIME: Matches the end time. +session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\)) + + +[Regular expressions splits] +# These expressions are used for splitting the protocols at the location if +# matched. +# All match groups are non catching except the group catching the entire regex +# to insert it later on again. This is the main difference to the time extractions. 
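+# (Editor's sketch of the mechanics; the header text below is invented.)
+# Because the whole pattern is one capturing group, re.split() keeps the
+# matched text in its result:
+#   regex_start.split(full_content, maxsplit=1)
+#   -> [toc, '\nPräsident Dr. Mustermann:', session_content]
+# split_content() in MetadataMarkup.py relies on this to prepend the matched
+# president header back onto the session content.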
+# These splits are needed to automatically separate the actual session content +# from the table of contents and the attachments. + +# Split at first president occurrence. +session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:) + +# Split at the end time of session. +attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\))) + + +[Regular expressions speakers] +# These are the regular expressions for matching the speakers in the protocols. +# They consist of tuples with three values. +# First element of the tuple is the regex. +# Second element is a case that tells if this regex should be used as a +# First, middle, or last element/match during the markup process. +# Third element describes the type of speech the speaker is holding in German, to use it as an attribute later on. +# The value tuple is divided with " ; " to convert it into a list later on. +# It is similar to csv syntax. If needed the user can add more key, value pairs following the same +# pattern to automatically identify even more speaker roles. + +speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident +speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär +speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister +speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller +speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident +speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB +speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister +speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler +speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer +speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter +end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt + +[Additional name features] +# In this section the user can add additional strings which are not part of the +# Stammdatenbank but are used inside the protocolls. +academic_titles = Dr. Dr. h. c. ; Dr. h. c. +parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P. 
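+# (Editor's sketch.) The " ; " convention here and in the sections above is
+# meant to be unpacked along these lines (variable names are illustrative):
+#   titles = config["Additional name features"]["academic_titles"].split(" ; ")
+#   regex, position, role = config["Regular expressions speakers"]["speaker_president"].split(" ; ")
+#   # -> position == "middle", role == "Präsident"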
+
+[Regular expressions speeches]
+# These regular expressions are used to mark up some entities inside of the actual speeches.
+# The value of any given key is a tuple with two values split by " ; " like in the section
+# \[Regular expressions speakers\]. First value is the regex and the second value is the tagname
+# written as a string. This list of key, value pairs can also be extended by the user to identify
+# even more entities inside of the speeches. Just add key, value pairs following the same pattern.
+# These expressions are only used to identify entities which are present in one
<p>
without
+# linebreaks.
+
+comments = \B\([^\(\)]*\)\B ; kommentar
+date_string_with_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezember) \d{4}[\d\t ]* ; metadata
+date_string_without_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezember) \d{4}[\d\t ]* ; metadata
+
+[Multiline entities]
+# These regular expressions are used to identify entities in speeches which span over multiple
<p>
+# elements. The value of any given key is a tuple with three values split by " ; " like in the
+# section [Regular expressions speakers]. First value is a regex describing what the start of the
+# entity string looks like. The second value is a regex describing what the end of the entity string
+# looks like. Third value is the tag name written as a normal string.
+multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar
+
+[File paths]
+# This is where the paths for the input and output folders are set. The input folder
+# path should contain the XML protocols that will be processed.
+# The output folder path specifies the place where all the intermediate files
+# and the final new XML protocols with the new automatically created markup will be
+# saved.
+
+input_folder_xmls = /home/stephan/Repos/master_thesis/data/working_data/development_data_xml
+output_folder = /home/stephan/Repos/master_thesis/data/working_data/
+
+# These paths will be set while running the program.
+nlp_output = /home/stephan/Desktop/nlp_output
+nlp_input = /home/stephan/Desktop/protocols/
+nlp_lemmatized_tokenized = /home/stephan/Desktop/nlp_output/lemmatized
+tmp_path = /home/stephan/Desktop/nlp_output/lemmatized/tmp
+nlp_beuatiful_xml = /home/stephan/Desktop/nlp_output/nlp_beuatiful_xml
+input_folder_xmls = /home/stephan/Repos/master_thesis_data/inputs/excluded_periods/
+output_folder = /home/stephan/Desktop/output
+new_metadata = /home/stephan/Desktop/output/new_metadata
+new_simple_markup = /home/stephan/Desktop/output/simple_xml
+complex_markup = /home/stephan/Desktop/output/complex_markup
+clear_speech_markup = /home/stephan/Desktop/output/clear_speech_markup
+beautiful_xml = /home/stephan/Desktop/output/beautiful_xml
+fixed_markup = /home/stephan/Repos/master_thesis/data/working_data/id_fixed/fixed_markup
diff --git a/bundesdata_markup_nlp/markup/EntityMarkup.py b/bundesdata_markup_nlp/markup/EntityMarkup.py
new file mode 100755
index 0000000..efa8276
--- /dev/null
+++ b/bundesdata_markup_nlp/markup/EntityMarkup.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from markup.MetadataMarkup import MetadataMarkup
+from lxml import etree
+from xml.etree import ElementTree
+from xml.sax.saxutils import escape
+import logging
+import os
+import re
+
+
+class EntityMarkup(MetadataMarkup):
+    """Class for getting an XML node in which entities will be marked.
+    In practice this class and its methods can be used to get the text of a
+    given node and mark every speaker in this text string.
+    Also passes methods and fields to the more specific
+    SimpleSpeakersMarkup."""
+
+    def __init__(self, file_path, element_name=".//sitzungsverlauf"):
+        super().__init__()
+        self.file_path = file_path
+        self.element_name = element_name
+        self.xml_tree = None
+        self.current_string = str()
+        self.filename = os.path.basename(file_path)
+        self.logger = logging.getLogger(__name__)
+
+    def get_element_text(self):
+        """
+        Gets the strings of all elements matched by an element XPath. The
+        element name is passed when the class is instantiated. Distinguishes
+        between one string or several strings. 
+        """
+        self.all_elements = self.xml_tree.iterfind(self.element_name)
+        len_all_elements = len(list(self.all_elements))
+        self.current_strings = []
+        if(len_all_elements == 1):
+            self.all_elements = self.xml_tree.iterfind(self.element_name)
+            self.current_string = escape(list(self.all_elements)[0].text)
+            self.current_strings.append(self.current_string)
+        elif(len_all_elements > 1):
+            self.current_strings = []
+            self.all_elements = self.xml_tree.iterfind(self.element_name)
+            for element in self.all_elements:
+                string = escape(element.text)
+                self.current_strings.append(string)
+        self.all_elements = self.xml_tree.iterfind(self.element_name)
+
+    def replace_string(self, replacement_string, element_name):
+        """
+        This function takes the newly manipulated XML string and overwrites
+        the old string with it.
+        """
+        replacement_string = (
+            "<" + element_name + ">"
+            + replacement_string
+            + "</" + element_name + ">"
+        )
+        for element in self.xml_tree.xpath("//%s" % element_name):
+            element.getparent().remove(element)
+        replacement_element = etree.fromstring(replacement_string)
+        self.xml_tree.insert(1, replacement_element)
+
+    def simple_check_xml(self, xml_string, file_name, save_valid, node=True):
+        """
+        Checks if a given XML element is well-formed XML. If it is checking a
+        partial string, it wraps it in a root element first. If node is False,
+        it is checking a whole document passed as a string.
+        """
+        try:
+            if(node is True):
+                folder_path = "logs/well-formed_strings/"
+                file_path = os.path.join(folder_path, os.path.basename(file_name))
+                # wraps the partial string in a temporary root element so it parses
+                xml_string = "<root>" + xml_string + "</root>"
+                tree = etree.fromstring(xml_string)
+                self.logger.info(("The node string is well-formed. Simple markup is"
+                                  " correct. Node string can be found in "
+                                  + folder_path))
+                self.logger.info(tree)
+                if(save_valid is True):
+                    self.logger.info("Node string can be found in " + folder_path)
+                    if not os.path.exists(folder_path):
+                        os.mkdir(folder_path)
+                    with open(file_path, "w") as text_file:
+                        text_file.write(xml_string)
+            else:
+                folder_path = "logs/well-formed_files/"
+                file_path = os.path.join(folder_path, os.path.basename(file_name))
+                tree = etree.fromstring(xml_string)
+                self.logger.info("The XML file is well-formed.")
+                self.logger.info(tree)
+                if(save_valid is True):
+                    self.logger.info("File can be found in " + folder_path)
+                    if not os.path.exists(folder_path):
+                        os.mkdir(folder_path)
+                    with open(file_path, "w") as text_file:
+                        text_file.write(xml_string.decode("utf-8"))
+        except Exception as e:
+            if(node is True):
+                folder_path = "logs/not_well-formed_strings/"
+                file_path = os.path.join(folder_path, os.path.basename(file_name))
+                if not os.path.exists(folder_path):
+                    os.mkdir(folder_path)
+                with open(file_path, "w") as text_file:
+                    text_file.write(xml_string)
+                self.logger.error(("XML node string is not well-formed. XML can be"
+                                   " found in " + folder_path))
+                self.logger.error(e)
+            else:
+                folder_path = "logs/not_well-formed_files/"
+                file_path = os.path.join(folder_path, os.path.basename(file_name))
+                if not os.path.exists(folder_path):
+                    os.mkdir(folder_path)
+                with open(file_path, "w") as text_file:
+                    text_file.write(xml_string.decode("utf-8"))
+                self.logger.error(("XML file is not well-formed. XML can be"
+                                   " found in " + folder_path))
+                self.logger.error(e)
+            return False
+
+    def inject_element(self, current_element, regex, tagname,
+                       strip_newlines=False):
+        """
+        Injects new XML elements into the selected element text. 
The new element
+        will be created by using a regular expression which matches a partial
+        string in the current_element text string. The match will be the
+        new_element text string. The tagname sets the tag name of the
+        new_element. Optionally, attributes can be set as well.
+        """
+        element_string = ElementTree.tostring(current_element, encoding="unicode", method="xml")
+        match = re.search(regex, element_string)
+        if(match):
+            index_shift = 0
+            if(strip_newlines is True):
+                counter = match.group().count("\n")
+                match_str = re.sub(r"\n", "", match.group())
+            else:
+                counter = 0
+                match_str = match.group()
+            index_start = match.start() + index_shift - counter
+            index_end = match.end() + index_shift - counter
+            new_element = etree.Element(tagname)
+            new_element.text = match_str
+            new_element_str = ElementTree.tostring(new_element, encoding="unicode", method="xml")
+            element_string = (element_string[:index_start]
+                              + new_element_str
+                              + element_string[index_end:])
+            index_shift += len(new_element_str) - len(match_str)
+        replacement_element = etree.fromstring(element_string.encode("utf8"))
+        current_element.getparent().replace(current_element, replacement_element)
+
+    def markup_speech_lines(self, current_element):
+        """
+        Inserts markup in every speech that marks every line
with <p>
with
+        attribute klasse="J". J is set for every line, even if it is O. In the
+        early protocols (periods 1 to 10) one line is most of the time a
+        sentence. In the later periods one line is capped at around 80
+        characters.
+        """
+        lines = current_element.xpath("text()")
+        if(len(lines) > 0):
+            lines = lines[0].splitlines()
+            current_element.xpath(".//redner")[0].tail = ""
+            for line in lines:
+                part_element = etree.Element("p")
+                part_element.set("klasse", "J")
+                part_element.text = line
+                current_element.append(part_element)
+
+    def get_multiline_entities(self, elements, start_of_str, end_of_str,
+                               tagname):
+        """
+        This function identifies multiline entities (e.g. Kommentare/comments)
+        which are split over multiple elements that have been marked with the
+        markup_speech_lines() function.
+        Gets the text of those elements and joins it together into one
+        string. The first element's text will be set to the newly created string,
+        surrounded by new XML tags whose tag name is set to the input tagname.
+        All other elements with the rest of the string will be deleted.
+        start_of_str should be a regex that describes what the start
+        of the supposed multiline entity looks like. end_of_str describes what
+        the end of the supposed multiline entity looks like.
+        """
+        self.multiline_text = []
+        self.multiline_elements = []
+        start_found = False
+        end_found = False
+        for element in elements:
+            if(start_found is False and end_found is False
+               and element.text is not None):
+                start_match = re.search(start_of_str, element.text)
+                if(start_match is not None):
+                    self.multiline_text.append(start_match.group())
+                    self.multiline_elements.append(element)
+                    start_found = True
+                    continue
+            elif(start_found is True and end_found is False
+                 and element.text is not None):
+                end_match = re.search(end_of_str, element.text)
+                if(end_match):
+                    self.multiline_text.append(end_match.group())
+                    self.multiline_elements.append(element)
+                    end_found = True
+                    continue
+                else:
+                    self.multiline_text.append(element.text)
+                    self.multiline_elements.append(element)
+                    continue
+            elif(start_found is True and end_found is True):
+                new_element_text = re.sub(r"- ", "", " ".join(self.multiline_text))  # joins the string parts and also removes hyphenation
+                part_element = etree.Element("p")
+                part_element.set("klasse", "J")
+                comment_element = etree.Element(tagname)
+                comment_element.text = new_element_text
+                part_element.append(comment_element)
+                self.multiline_elements[0].getparent().replace(self.multiline_elements[0], part_element)
+                for element in self.multiline_elements[1:]:
+                    element.getparent().remove(element)
+                start_found = False
+                end_found = False
+                self.multiline_text = []
+                self.multiline_elements = []
+                continue
diff --git a/bundesdata_markup_nlp/markup/MdBData.py b/bundesdata_markup_nlp/markup/MdBData.py
new file mode 100755
index 0000000..0544722
--- /dev/null
+++ b/bundesdata_markup_nlp/markup/MdBData.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from utility.XMLProtocol import XMLProtocol
+import logging
+
+
+class MdBData(XMLProtocol):
+    """Class to handle operations on the Stammdatenbank."""
+
+    def __init__(self):
+        super(XMLProtocol, self).__init__()
+        self.logger = logging.getLogger(__name__)
+
+    def get_set(self, element_path, element_tree):
+        """
+        Creates sets from the input path on element_tree. 
+        """
+        tmp_list = [element.text for element in
+                    element_tree.iterfind(element_path) if element is not None]
+        set_of_elements = set(tmp_list)
+        return set_of_elements
diff --git a/bundesdata_markup_nlp/markup/MetadataMarkup.py b/bundesdata_markup_nlp/markup/MetadataMarkup.py
new file mode 100755
index 0000000..9907884
--- /dev/null
+++ b/bundesdata_markup_nlp/markup/MetadataMarkup.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from utility.XMLProtocol import XMLProtocol
+from utility import update_config
+from lxml import etree
+from datetime import datetime
+from babel.dates import format_date
+import os
+import re
+import logging
+import configparser
+
+
+class MetadataMarkup(XMLProtocol):
+    """
+    This class is for opening one XML protocol, extracting the included
+    metadata and creating a new valid metadata head.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.plenarprotokoll_string = str()  # will be extracted with extract_metadata()
+        self.wahlperiode = int()  # will be extracted with extract_metadata()
+        self.sitzungsnr = int()  # will be extracted with extract_metadata()
+        self.herausgeber = "Deutscher Bundestag"  # always the same in every protocol
+        self.berichtart = "Stenografischer Bericht"  # always the same in every protocol
+        self.sitzungstitel_string = ". Sitzung"  # always the same in every protocol
+        self.ort = "Berlin"  # always the same in every protocol
+        self.datum_ger_non_iso = str()  # will be extracted with extract_metadata()
+        self.datum_iso = str()  # ISO date will be built from self.datum_ger_non_iso
+        self.datum_string = str()  # will be built from self.datum_iso
+        self.attachment = str()  # will be extracted from a split; will not always
+        # work, but will not break the XML.
+        self.logger = logging.getLogger(__name__)
+
+    def extract_metadata(self, etree_element_object):
+        """
+        Extracts metadata from the given XML tags and writes them into the
+        instance variables.
+        """
+        root = etree_element_object
+        metadata_list = []
+        for element in root.iter():
+            if(element.tag != "TEXT"):
+                metadata_list.append(element.text)
+        metadata_list = metadata_list[1:]
+        self.wahlperiode = metadata_list[0]
+        self.plenarprotokoll_string = metadata_list[1].lower().title()
+        self.sitzungsnr = metadata_list[2].split("/")[1]
+        self.datum_ger_non_iso = metadata_list[3]
+        self.logger.info("Metadata successfully extracted.")
+        self.logger.info("Wahlperiode is:" + self.wahlperiode)
+        self.logger.info("Plenarprotokoll is:" + self.plenarprotokoll_string)
+        self.logger.info("Sitzungsnummer is:" + self.sitzungsnr)
+        self.logger.info("German non-ISO date is:" + self.datum_ger_non_iso)
+
+    def built_iso_date(self, ger_date):
+        """
+        Gets the German date and converts it to an ISO standard date.
+        """
+        self.datum_iso = datetime.strptime(ger_date, "%d.%m.%Y").date()
+        self.logger.info("ISO date created:" + str(self.datum_iso))
+
+    def built_date_string(self, iso_date):
+        """
+        Gets the ISO date and creates a full German date string from it.
+        """
+        date_string = format_date(iso_date, format="full", locale="de_DE")
+        date_string = re.sub(r",", ", den", date_string)
+        self.datum_string = date_string
+        self.logger.info("Date string created:" + self.datum_string)
+
+    def delete_old_metadata(self, etree_element_object):
+        """
+        Deletes old metadata tags and text. Renames the root tag. 
+        """
+        for element in etree_element_object.iter():
+            if(element.tag != "TEXT" and element.tag != "DOKUMENT"):
+                element.getparent().remove(element)
+            elif(element.tag == "DOKUMENT"):
+                element.tag = "dbtplenarprotokoll"
+            elif(element.tag == "TEXT"):
+                self.full_content = element.text
+                element.getparent().remove(element)
+        self.logger.info("Old metadata deleted.")
+
+    def insert_new_metadata(self, etree_element_object):
+        """
+        Inserts the extracted metadata and the split content into newly created
+        and valid XML tags according to the official schema.
+        """
+        vorspann_element = etree.Element("vorspann")
+        xml_string = """
+
+        {} {}/{}
+        (neu)
+        {}
+        {}
+        {}. Sitzung
+        {}, {}
+        """\
+            .format(self.plenarprotokoll_string, self.wahlperiode,
+                    self.sitzungsnr, self.herausgeber, self.berichtart,
+                    self.sitzungsnr, self.ort, self.datum_ger_non_iso,
+                    self.datum_string)
+        etree_from_str = etree.fromstring(xml_string)
+        etree_element_object.insert(0, vorspann_element)
+        vorspann_element.append(etree_from_str)
+        toc_element = etree.Element("inhaltsverzeichnis")
+        toc_element.text = self.toc
+        vorspann_element.append(toc_element)
+        content_element = etree.Element("sitzungsverlauf")
+        content_element.text = self.president + self.content
+        etree_element_object.insert(2, content_element)
+        anlagen_element = etree.Element("anlagen")
+        anlagen_element.text = self.attachment
+        etree_element_object.insert(3, anlagen_element)
+        rednerliste_element = etree.Element("rednerliste",
+                                            sitzungsdatum=self.datum_ger_non_iso)
+        etree_element_object.insert(4, rednerliste_element)
+        self.xml_tree = etree_element_object
+        self.logger.info("New metadata XML head inserted." + xml_string)
+
+    def split_content(self, etree_element_object):
+        """Splits the full content into: table of contents, speeches and, in
+        some cases, attachments."""
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+
+        session_start_split = config["Regular expressions splits"]["session_start_president_split"]
+        regex_start = re.compile(session_start_split)
+        tmp_list = regex_start.split(self.full_content, maxsplit=1)
+        self.toc = tmp_list[0]
+        self.president = tmp_list[1]
+        self.content = tmp_list[2]
+
+        attachment_split = config["Regular expressions splits"]["attachment_split"]
+        regex_att = re.compile(attachment_split)
+        tmp_list = regex_att.split(self.content)
+        tmp_list = [element for element in tmp_list if element is not None]
+        if(tmp_list[-1] == ""):  # if the split does not match anything, the last item is an empty string
+            self.content = "".join(tmp_list[0:-1])
+            self.attachment = "Keine Anlage extrahiert."
+            self.logger.warning(("There is no attachment."))
+        else:
+            self.content = "".join(tmp_list[0:-1])
+            self.attachment = tmp_list[-1]
+            self.logger.info("Attachment found.")
+        self.logger.info("Content split at:" + str(regex_start))
+        self.logger.info("Content split at:" + str(regex_att))
+
+    def get_session_times(self):
+        """This function looks into the entire protocol content to extract the
+        last closing time and the starting time. 
If only one of both or none are + found, the missing time will be set to xx:xx.""" + config = configparser.ConfigParser() + config.read("config.ini") + regex_conf_values = config.items("Regular expressions time extraction") + regex_conf_values = [regex[1] for regex in regex_conf_values] + tmp_list = [] + identifier = 0 + start_time_found = True + end_time_found = True + + for regex in (regex_conf_values): + identifier += 1 + regex = re.compile(regex) + if(identifier == 1): + # Always gets first start time. + matches = list(regex.finditer(self.full_content)) + if(len(matches) > 1): + match = matches[-1] + elif(len(matches) == 0): + match = None + else: + match = matches[0] + elif(identifier == 2): + # Always gets last closing time + matches = list(regex.finditer(self.full_content)) + if(len(matches) > 1): + match = matches[-1] + elif(len(matches) == 0): + match = None + else: + match = matches[0] + + if(match is None and identifier == 1): + self.logger.warning("No start time found for " + str(regex)) + start_time_found = False + elif(match is None and identifier == 2): + self.logger.warning("No end time found for " + str(regex)) + end_time_found = False + elif(match): + session_time = [group for group in match.groups() + if group is not None] + session_time = ["0" + group if len(group) == 1 else group for + group in session_time] # Adds a 0 in front if digit len is 1 + if(len(session_time) == 2): + tmp_list.append(":".join(session_time)) + elif(len(session_time) == 1): + tmp_list.append(session_time[0] + ":00") + + if(len(tmp_list) == 2): + self.session_start_time = tmp_list[0] + self.session_end_time = tmp_list[1] + self.logger.info("Start time found: " + self.session_start_time) + self.logger.info("End time found: " + self.session_end_time) + self.logger.info("Successfully matched start and end times.") + elif(len(tmp_list) == 1 and start_time_found is True and end_time_found + is False): + self.session_start_time = tmp_list[0] + self.session_end_time = "xx:xx" + self.logger.warning("Only start time found: " + + self.session_start_time) + self.logger.warning("End time set to: " + + self.session_end_time) + elif(len(tmp_list) == 1 and start_time_found is False and end_time_found + is True): + self.session_end_time = tmp_list[0] + self.session_start_time = "xx:xx" + self.logger.warning("Only end time found: " + + self.session_end_time) + self.logger.warning("Start time set to: " + + self.session_start_time) + + def write_to_attr(self, element, attr_key, attr_value): + """ + Writes two strings as a an attribute key value pair to a given + element. + """ + elements = self.xml_tree.findall(element) + if(elements == []): + element = self.tree.getroot() + elements.append(element) + for element in elements: + element.set(attr_key, attr_value) + self.xml_tree = self.xml_tree + self.logger.info("Wrote attribute " + + attr_key + + "=" + + "\"" + + attr_value + + "\"") + + def save_to_file(self, output_path, file_path, subfolder, config_section, + config_key): + """ + Writes the new markup to a new xml file. Takes the output path and + creates a new folder there. Also updates the config file with the new + path. 
+ """ + self.filename = os.path.basename(file_path) + save_path = os.path.join(output_path, subfolder) + if not os.path.exists(save_path): + os.mkdir(save_path) + tree = etree.ElementTree(self.xml_tree) + new_filename = self.filename + save_file_path = os.path.join(save_path, new_filename) + tree.write(save_file_path, + pretty_print=True, + xml_declaration=True, + encoding="utf8", + doctype="") + self.logger.info("New XML saved to:" + save_file_path) + update_config.update_config("config.ini", config_section, config_key, + save_path) diff --git a/bundesdata_markup_nlp/markup/SpeakerMarkup.py b/bundesdata_markup_nlp/markup/SpeakerMarkup.py new file mode 100755 index 0000000..ef10613 --- /dev/null +++ b/bundesdata_markup_nlp/markup/SpeakerMarkup.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from markup.EntityMarkup import EntityMarkup +import re +import logging + + +class SpeakerMarkup(EntityMarkup): + """ + Class for specific markup of different speakers identified by different + regular expressions included in the config file. + """ + + def __init__(self, string, regex): + super(SpeakerMarkup).__init__() + self.string_to_search = string + self.regex_string = regex + self.logger = logging.getLogger(__name__) + + def identify_speaker(self): + """ + Gets match objects from the speakers in the given text node. Also + calculates length of it and puts the matches in a list. + """ + self.matches = re.finditer(self.regex_compiled, self.string_to_search) + tmp_list = [] + for match in self.matches: + tmp_list.append(match) + self.matches_count = len(tmp_list) + self.matches = tmp_list + + def markup_speaker(self, case="middle"): + """ + This is where the first simple markup happens. It uses the matches + and replaces them with simple markup for further processing. The + 'first' markup uses re.sub. The second and third one work on string + basis. + """ + + def markup_logging(): + """Helper function for creating log file output.""" + if(self.matches_count == 0): + self.logger.warning("0 matches for given expression:" + + self.regex_string) + elif(self.matches_count > 0): + self.logger.info(str(self.matches_count) + + " matches for given expression:" + + self.regex_string) + elif(self.matches_count == 1): + self.logger.info(str(self.matches_count) + + " match for given expression:" + + self.regex_string) + + if(case == "first"): + # Uses re.sub because it is only for one match. + start_tags = "" + end_tags = "" + self.matches_count = 1 # sets count to 1 because it only marks the first match + markup_logging() + first_match = self.matches[0] + start_xml = start_tags + first_match.group() + end_tags + if(len(first_match.group().split()) <= 10): + self.string_to_search = self.regex_compiled.sub(start_xml, + self.string_to_search, + count=1) + self.markuped_string = self.string_to_search + + elif(case == "middle"): + """ + Does not use re.sub because it is faster to work on the string. + Also it avoids looping two times to get the specific match.group() + which caused some errors. 
+ """ + index_shift = 0 + start_tags = "\n" + end_tags = "" + markup_logging() + for match in self.matches: + index_start = match.start() + index_shift + index_end = match.end() + index_shift + whole_match_len = len(match.group()) + # Handels cases where lots of text before the actual speaker is # matched + linebrks_in_match = len(match.group().split("\n")) + if(linebrks_in_match >= 2): + last_part_match = "".join(match.group().split("\n")[1:]) + first_line_of_match = match.group().split("\n")[0] + if(len(first_line_of_match.split()) <= 10): + match = first_line_of_match + last_part_match + else: + match = last_part_match + + delta_start_index = whole_match_len - len(match) + index_start = index_start + delta_start_index + + self.string_to_search = (self.string_to_search[:index_start] + + start_tags + + match + + end_tags + + self.string_to_search[index_end:] + ) + index_shift += len(start_tags) + len(end_tags) + + else: + self.string_to_search = (self.string_to_search[:index_start] + + start_tags + + match.group() + + end_tags + + self.string_to_search[index_end:] + ) + index_shift += len(start_tags) + len(end_tags) + + self.markuped_string = self.string_to_search + + elif(case == "last"): + index_shift = 0 + """ + Matches the end of the session to add the last closing tag + to the last speech for well-formed xml. Uses re.sub because it is + only one operation. + """ + end_tag = "" + session_close_time_tag = ('') + # Created end tags will be inserted into the protocol + if(len(self.matches) == 1): + self.logger.info("Last speech successfully tagged.") + markup_logging() + for match in self.matches: + end_xml = end_tag + match.group() + session_close_time_tag + if(len(match.group().split()) <= 15): + self.string_to_search = self.regex_compiled.sub(end_xml, + self.string_to_search, + count=1) + self.markuped_string = self.string_to_search + + elif(len(self.matches) == 0): + self.logger.warning(("No end of session found! Last tag " + end_tag + + " will be added to the end of the protocol." + " This might add some unrelated text to the " + "last speech.")) + markup_logging() + self.markuped_string = self.string_to_search + end_tag + + else: + markup_logging() + self.logger.warning(("There are " + str(len(self.matches)) + + " session endings. Ignoring the endings" + + " before the last final ending of the " + + " session.")) + match = self.matches[-1] + end_xml = end_tag + match.group() + session_close_time_tag + whole_match_len = len(match.group()) + index_start = match.start() + index_shift + index_end = match.end() + index_shift + last_line = match.group().split("\n")[-1] # Always takes the last line of a match avoiding lots of text before the actual speaker. 
+ delta_start_index = whole_match_len - len(last_line) + index_start = index_start + delta_start_index + self.string_to_search = (self.string_to_search[:index_start] + + end_xml + + self.string_to_search[index_end:]) + index_shift += len(end_tag) + self.markuped_string = self.string_to_search diff --git a/bundesdata_markup_nlp/markup/SpeakerNameMarkup.py b/bundesdata_markup_nlp/markup/SpeakerNameMarkup.py new file mode 100755 index 0000000..6fa282c --- /dev/null +++ b/bundesdata_markup_nlp/markup/SpeakerNameMarkup.py @@ -0,0 +1,554 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from markup.SpeakerMarkup import SpeakerMarkup +from xml.etree import ElementTree +from lxml import etree +from tqdm import tqdm +from itertools import combinations +import copy +import logging +import re +import os + + +class SpeakerNameMarkup(SpeakerMarkup): + """ + This class is for the complex markup of the speakers in one given protocol. + Creates the name tag with all needed information from the Stammdatenbank. + Has to cross reference the speaker with said Stammdatenbank. + """ + known_redner_dicts = dict() + last_wahlperiode = int() + + def __init__(self, file_path, element_name=".//redner"): + super(SpeakerNameMarkup).__init__() + self.file_path = file_path + self.filename = os.path.basename(self.file_path)[:-4] + self.element_name = element_name + self.redner_dict = dict() + self.all_speakers = [] + self.logger = logging.getLogger(__name__) + + def cross_reference_markup(self, strings, feature_set_dict, + MdB_etree): + """ + Checks if features like name, surname, academic title and city are + present in the input string. Consists of a main function and helper + functions. First the string will be split into tokens. Every token will + be checked against sets of valid names, surnames, academic titles and + fractions. If there is a match a dictionary entry will be set + accordingly. + Also uses the add_missing_MdB_feature helper function in a second step + to add features which are not present in the string or have been + identified wrongly. + The function creates a dictionary containing all features of one speaker + to create a valid XML element from it later on. + """ + + def initiate_dict(keys, extra_keys): + """ + Creates a dictionary with a set of keys and sets them to None. + Some specific key values will be set to specific values. + """ + for key in keys: + redner_dict[key] = None + for key in extra_keys: + redner_dict[key] = None + redner_dict["feature_complete"] = False + redner_dict["original_string"] = string + redner_dict["identified"] = False + redner_dict["damalige_fraktion"] = None + + def get_names(keys, dict, token): + """ + Checks if token is in set vorname or nachname. If it is, dictionary + values will be set accordingly. Avoids that the surname will be + overwritten by a name which is also a valid surname. + """ + for key in keys[0:2]: # Only for vorname, nachname in written order + if(token in feature_set_dict[key][0] and redner_dict[key] + is None): + redner_dict[key] = token + elif(token in feature_set_dict["nachname"][0] + and redner_dict["nachname"] is not None): + redner_dict["nachname"] = token + else: + continue + + def get_feature(key, string, set): + """ + Checks if a token is a valid feature (like name affix or academic + title, ortszusatz or namenszusatz) and adds it to the dictionary. + Does not check for names. + """ + for feature in set: + if(key == "titel"): + regex = r"(\b{}\B)".format(re.escape(feature)) # could be Dr. and . is not a word boundary.
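+ # Hypothetical example: for the title "Dr." this builds + # r"(\bDr\.\B)", so the trailing period is still matched even + # though "." itself is not a word boundary character.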
+ elif(key == "namenszusatz"): + regex = r"\b({})\b".format(re.escape(feature)) # No . in word so word boundary at start and end of regex. + elif(key == "fraktion"): + regex = r"\B(\({}\))\B".format(re.escape(feature)) # always surrounded by parentheses, but also has to match them to avoid matching e.g. "CDU" in "CDU/CSU" + elif(key == "ortszusatz"): + regex = r"\B{}\B".format(re.escape(feature)) # always surrounded by parentheses + else: + regex = r"(\b{}\b)".format(re.escape(feature)) + match = re.search(regex, string) + if(match): + if(key == "fraktion"): + redner_dict[key] = match.group()[1:-1] # removes () + break + else: + redner_dict[key] = match.group() + break + else: + redner_dict[key] = None + + def get_role(string): + """Checks the redner string for a role. Identifies 'Bundesministerin für + Familie, Senioren, Frauen und Jugend' etc.""" + if("Staatssekretär" in string or "Staatssekretärin" in string): + regex = r"(Staatssekretär(in)?)" + splits = re.split(regex, string, maxsplit=1) + role_long = splits[1] + splits[-1] + redner_dict["rolle_lang"] = role_long + role_short = [word[0] for word in role_long.split() + if word[0].isupper()] + role_short = splits[1] + " " + "".join(role_short) + redner_dict["rolle_kurz"] = role_short + elif("Bundesminister" in string or "Bundesministerin" in string): + regex = r"(Bundesminister(in)?)" + splits = re.split(regex, string, maxsplit=1) + role_long = splits[1] + splits[-1] + redner_dict["rolle_lang"] = role_long + role_short = [word[0] for word in role_long.split() + if word[0].isupper()] + role_short = splits[1] + " " + "".join(role_short) + redner_dict["rolle_kurz"] = role_short + + def check_name(redner_dict): + """ + Checks if vorname and nachname are the same. Sets vorname to None if + True. Vorname will be set later on with add_missing_MdB_feature. + """ + if(redner_dict["nachname"] == redner_dict["vorname"]): + redner_dict["vorname"] = None + + def get_party(redner_dict): + """ + Creates a party key in the dictionary containing the party of the + speaker. Party is not the same as fraction. This is mainly done + because CDU/CSU is the fraction in the Bundestag but speakers can + belong to either the CDU or CSU. If the fraction is not CDU/CSU + party will be set to fraction. Also handles problems with GRÜNE. + """ + if(redner_dict["fraktion"] != "CDU/CSU" + and redner_dict["fraktion"] != "CDU" + and redner_dict["fraktion"] != "CSU"): + redner_dict["partei"] = redner_dict["fraktion"] + elif(redner_dict["fraktion"] == "CDU" + or redner_dict["fraktion"] == "CSU"): + redner_dict["partei"] = redner_dict["fraktion"] + redner_dict["fraktion"] = "CDU/CSU" + if(redner_dict["fraktion"] == "GRÜNE"): + redner_dict["fraktion"] = "BÜNDNIS 90/DIE GRÜNEN" + + def check_party_and_fraction(): + """ + Checks if party and fraction have been set correctly. Will be used + after add_missing_MdB_feature to correct some errors with CDU/CSU. + """ + if(redner_dict["fraktion"] is not None + and (redner_dict["partei"] == "CDU" + or redner_dict["partei"] == "CSU")): + redner_dict["fraktion"] = "CDU/CSU" + + if(redner_dict["partei"] is None + and redner_dict["fraktion"] is not None + and redner_dict["fraktion"] != "CDU" + and redner_dict["fraktion"] != "CSU"): + redner_dict["partei"] = redner_dict["fraktion"] + + def get_match_in_str(key, string, regex): + """ + Matches a regex in the current string and adds it as a value to the + given key into the dictionary.
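+ Hypothetical example: with the regex r"^\w*(?:P|p)räsident\w*" + and a string starting with "Präsidentin Dr. Rita Süssmuth:" the + key "präsident" would be set to "Präsidentin".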
+ """ + match = re.search(regex, string) + if(match): + redner_dict[key] = match.group() + else: + redner_dict[key] = None + + def add_missing_MdB_feature(string, redner_dict, feature_set_dict, + MdB_etree, conditions_key_list, + feature_lookup, feature_to_add, + logging_state=False, multi_ids=False): + """ + This function trys to get missing features for on speaker. Input is + a list of features(conditions_key_list) which are used as parameters + in an xpath expression. The Xpath is built dynamically from the + list. + If the Xpath matches one unique entry the feature(feature_to_add) + will be set to the match of feature_lookup in the matched element. + """ + ### + # Xpath creation from conditions_key_list + ### + xpath_parts = [] + conds = conditions_key_list + len_conds = len(conds) + if(len_conds == 1): + for condition in conds: + xpath_part = ".//MDB[.//{}/text()='{}']" \ + .format(feature_set_dict[condition][1], + redner_dict[condition]) + xpath_parts.append(xpath_part) + xpath = "".join(xpath_parts) + if("None" in xpath): + xpath = None + elif(len_conds == 2): + xpath_first_part = ".//MDB[.//{}/text()='{}'" \ + .format(feature_set_dict[conds[0]][1], + redner_dict[conds[0]]) + xpath_parts.insert(0, xpath_first_part) + xpath_last_part = ".//{}/text()='{}']" \ + .format(feature_set_dict[conds[-1]][1], + redner_dict[conds[-1]]) + xpath_parts.append(xpath_last_part) + xpath = " and ".join(xpath_parts) + if("None" in xpath): + xpath = None + elif(len_conds > 2): + xpath_first_part = ".//MDB[.//{}/text()='{}'" \ + .format(feature_set_dict[conds[0]][1], + redner_dict[conds[0]]) + xpath_parts.insert(0, xpath_first_part) + for condition in conds[1:-1]: + xpath_inner_part = ".//{}/text()='{}'" \ + .format(feature_set_dict[condition][1], + redner_dict[condition]) + xpath_parts.append(xpath_inner_part) + xpath_last_part = ".//{}/text()='{}']" \ + .format(feature_set_dict[conds[-1]][1], + redner_dict[conds[-1]]) + xpath_parts.append(xpath_last_part) + xpath = " and ".join(xpath_parts) + if("None" in xpath): # sets xpaths to None if it uses a feature which is None + xpath = None + xpath_parts = [] # empties xpath_parts list + try: # tries every xpath + matches = MdB_etree.xpath(xpath) + except TypeError: # handles xpaths that are None + matches = [] + # If xpath has unique match new feature value will be set to given feature + if(len(matches) == 1): + matches = matches[0] + feature_lookup = ".//" + feature_lookup + new_feature = matches.xpath(feature_lookup)[0].text + self.logger.info((" There is one unique match " + + " for this speaker: " + + str(redner_dict) + + " Extracted feature " + + feature_lookup + ": " + + str(new_feature) + + " with: " + + str(conds))) + redner_dict[feature_to_add] = new_feature + self.logger.info(("New speaker features are: " + + str(redner_dict))) + # Handels mathches tha are not unique for logging and mutli id + elif(len(matches) > 1): + self.logger.warning((" There are " + + str(len(matches)) + + " matches for this speaker: " + + str(redner_dict) + + " .Could not extract: " + + feature_lookup + + " Features used are: " + + str(conds))) + elif(len(matches) > 1 and multi_ids is True): + ids = matches + for id, i in ids, enumerate(ids): + key = "id" + i + redner_dict[key] = id + return matches + + def get_periode(MdB_etree): + periode = self.xml_tree.xpath(".//wahlperiode") + if(periode): + redner_dict["wahlperiode"] = periode[0].text + return periode[0].text + + ### + # Start of main function cross_reference_markup + ### + + # Initiates empty dict and gets keys for it 
+ redner_dict = dict() + features = list(feature_set_dict.keys()) + + # Counters to calculate how successful the identification of speakers is + identified_speakers = 0 + unidentified_speakers = 0 + multiple_identified_speakers = 0 + + # Cross references every string + for string in tqdm(strings, desc="Cross reference name markup for speakers in strings"): + self.logger.info("\nStarting name markup process for new speaker:") + # Sets values in redner_dict to None or a specific value + initiate_dict(features, [feature for feature in features]) + tokens = string.replace(":", "").replace(",", "").split() # replaces ":" and "," with nothing because some names would be "name:" and some names would contain a "," + for token in tokens: + get_names(features, feature_set_dict, token) + self.logger.info("nachname is: " + str(redner_dict["nachname"])) + feature_keys = [key for key in features if key not in ["vorname", + "nachname"]] + for f_key in feature_keys: + get_feature(f_key, string, feature_set_dict[f_key][0]) + get_party(redner_dict) + check_name(redner_dict) + regex_p = r"^\w*(?:P|p)räsident\w*" + get_match_in_str("präsident", string, regex_p) + get_role(string) + + ### + # Checks if the script is still running for the same Wahlperiode. + # If this is not the case known_redner_dicts will be emptied. + ### + current_wahlperiode = get_periode(MdB_etree) + if(current_wahlperiode != SpeakerNameMarkup.last_wahlperiode): + SpeakerNameMarkup.known_redner_dicts = dict() + SpeakerNameMarkup.last_wahlperiode = current_wahlperiode + + ### + # Creates possible combinations of features which will be used in + # add_missing_MdB_feature to identify missing features like vorname or + # nachname. + ### + + combination_features = [feature for feature in features if feature + not in ["namenszusatz", + "feature_complete", + "id", + "titel", + "rolle_kurz", + "rolle_lang", + "original_string", + "identified", + "damalige_fraktion"]] + subsets = [] + for length in range(0, 5): + for subset in combinations(combination_features, length): + subsets.append(list(subset)) + subsets = subsets[1:] + combination_features.remove("wahlperiode") + combination_features.remove("nachname") + + ### + # First while loop trying to identify every feature for one speaker. + # Uses the combinations from above. Before calling the function + # add_missing_MdB_feature there is a check if the speaker has already + # been identified before. If this is the case the features will be set + # to the already identified features. This saves a lot of time. + ### + + counter_feats = 0 + while(redner_dict["feature_complete"] is False): + redner_dict["damalige_fraktion"] = redner_dict["fraktion"] + # print("Doing name markup for:", redner_dict) + # Checks if the speaker has already been identified before.
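+ # known_redner_dicts is a class attribute keyed by the original + # speaker string, so a speaker identified earlier in the same + # Wahlperiode is looked up instead of being recomputed.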
+ if(string in SpeakerNameMarkup.known_redner_dicts): + # print("Speaker has already been identified once.") + redner_dict = SpeakerNameMarkup.known_redner_dicts[string].copy() + # print("Speaker features are set to:", + # SpeakerNameMarkup.known_redner_dicts[string]) + redner_dict["identified"] = True + self.logger.info(("Speaker has already been identified " + + "once.")) + self.logger.info(("Speaker features are set to: " + + str(SpeakerNameMarkup.known_redner_dicts[string]))) + if(SpeakerNameMarkup.known_redner_dicts[string]["feature_complete"] is not False): + identified_speakers += 1 + break + else: + for feature in combination_features: + for subset in subsets: + add_missing_MdB_feature(string, + redner_dict, + feature_set_dict, + MdB_etree, + subset, + feature_set_dict[feature][1], + feature) + check_party_and_fraction() + if(redner_dict["vorname"] is not None + and redner_dict["nachname"] is not None + and redner_dict["fraktion"] is not None + and redner_dict["partei"] is not None): + redner_dict["feature_complete"] = True + counter_feats += 1 + if(counter_feats == len(combination_features)): + redner_dict["feature_complete"] = False + break + + ### + # Second while loop uses four features to identify the unique ID for one + # speaker with add_missing_MdB_feature. Also tries to identify speakers + # with fewer known features. In this case there can be multiple possible + # ids for one speaker; these will be saved in a special dictionary entry. + # Rare case. + ### + + counter_ids = 0 + while(redner_dict["id"] is None): + if(redner_dict["feature_complete"] is True): + add_missing_MdB_feature(string, + redner_dict, + feature_set_dict, + MdB_etree, + ["vorname", "nachname", "partei", + "wahlperiode"], + feature_set_dict["id"][1], + "id") + key_original_string = redner_dict["original_string"] + SpeakerNameMarkup.known_redner_dicts.update( + {key_original_string: redner_dict.copy()}) + redner_dict["identified"] = True + if(counter_ids == 1): + redner_dict["id"] = None + redner_dict["feature_complete"] = False + redner_dict["identified"] = False + self.logger.warning(("Unique ID could not be assigned. " + + "Feature complete: True " + + "Features are: " + + str(redner_dict))) + SpeakerNameMarkup.known_redner_dicts.update( + {key_original_string: redner_dict.copy()}) + unidentified_speakers += 1 + identified_speakers -= 1 # because identified_speakers was set before + break + identified_speakers += 1 + elif(redner_dict["feature_complete"] is not True): + redner_dict["id"] = None + ids = add_missing_MdB_feature(string, + redner_dict, + feature_set_dict, + MdB_etree, + ["nachname", "partei", + "wahlperiode"], + feature_set_dict["id"][1], + "id", False, True) + if(ids is not None and len(ids) > 1): + redner_dict["identified"] = "Multiple" + multiple_identified_speakers += 1 + identified_speakers -= 1 + break + elif(ids is None): + self.logger.warning(("Unique ID could not be assigned. " +
" + + "Feature complete: False " + + "Features are: " + + str(redner_dict))) + redner_dict["identified"] = False + unidentified_speakers += 1 + break + counter_ids += 1 + + self.logger.info(("Number of identified speakers with valid id and" + + " name markup is: " + + str(identified_speakers))) + self.logger.info(("Number of unidentified speakers without valid" + + " id and name markup is: " + + str(unidentified_speakers))) + self.logger.info(("Number of speakers with possible multiple ids: " + + str(multiple_identified_speakers))) + self.logger.info(("Number of all speaker entitiys in current" + + " protocoll is: " + + str(len(strings)))) + redner_dict_final = copy.deepcopy(redner_dict) + self.redner_dict = redner_dict_final + self.all_speakers.append(self.redner_dict) + for key in features: + redner_dict[key] = None + + # print("Speaker features after whole cross reference markup:", + # redner_dict_final) + self.logger.info(("Saved speakers (identfied and not identified): " + + str(len(self.all_speakers)))) + + def create_speaker_elements(self): + """ + Creates a valid redner XML element for one redner_dict entry from the + list self.all_speakers. Has to be done step by step becuase dictionary + is not sorted and name sub elements have to be in specific order. + """ + self.all_speaker_elements = [] + for redner_entry in tqdm(self.all_speakers, desc="Creating speaker element"): + redner_element = etree.Element("redner") + redner_element.set("id", str(redner_entry["id"])) + name_element = etree.Element("name") + titel_element = etree.Element("titel") + titel_element.text = redner_entry["titel"] + vorname_element = etree.Element("vorname") + vorname_element.text = redner_entry["vorname"] + namenszusatz_element = etree.Element("namenszusatz") + namenszusatz_element.text = redner_entry["namenszusatz"] + nachname_element = etree.Element("nachname") + nachname_element.text = redner_entry["nachname"] + damalige_fraktion_element = etree.Element("damalige_fraktion") + damalige_fraktion_element.text = redner_entry["damalige_fraktion"] + fraktion_element = etree.Element("fraktion") + fraktion_element.text = redner_entry["fraktion"] + partei_element = etree.Element("partei") + partei_element.text = redner_entry["partei"] + ortszusatz_element = etree.Element("ortszusatz") + ortszusatz_element.text = redner_entry["ortszusatz"] + rolle_lang_element = etree.Element("rolle_lang") + rolle_lang_element.text = redner_entry["rolle_lang"] + rolle_kurz_element = etree.Element("rolle_kurz") + rolle_kurz_element.text = redner_entry["rolle_kurz"] + original_string_element = etree.Element("original_string") + original_string_element.text = redner_entry["original_string"] + + if(redner_entry["titel"] is not None): + name_element.append(titel_element) + name_element.append(vorname_element) + if(redner_entry["namenszusatz"] is not None): + name_element.append(namenszusatz_element) + name_element.append(nachname_element) + name_element.append(damalige_fraktion_element) + name_element.append(fraktion_element) + name_element.append(partei_element) + if(redner_entry["ortszusatz"] is not None): + name_element.append(ortszusatz_element) + if(redner_entry["rolle_lang"] is not None): + name_element.append(rolle_lang_element) + name_element.append(rolle_kurz_element) + name_element.append(original_string_element) + name_element.tail = original_string_element.text + redner_element.append(name_element) + self.all_speaker_elements.append(redner_element) + self.logger.info(("Speaker element is: " + + 
ElementTree.tostring(redner_element).decode("utf-8"))) + + def set_speech_ids(self): + """ + This function sets a unique rede id for every rede element in one + protocol. The id is a ten digit integer preceded by the string ID. + Example: ID1809900000 + The first two digits are the wahlperiode, the following three digits are the + sitzungsnr (session number). The remaining digits are for counting the + speeches. First speech is 00100, second is 00200, eleventh is 01100 and so on. + Example: ID1809901100 --> eleventh speech + The last two digits are for corrections. + """ + + id_counter = 0 + speeches = self.xml_tree.xpath(".//sitzungsbeginn | .//rede") + for speech in tqdm(speeches, desc="Creating speech ids"): + id_counter_str = str(id_counter).zfill(5) + id = "ID" + self.filename + id_counter_str + speech.set("id", id) + id_counter += 100 + self.logger.info(("Speech id is: " + id)) + self.xml_tree = self.xml_tree diff --git a/bundesdata_markup_nlp/markup/__init__.py b/bundesdata_markup_nlp/markup/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/bundesdata_markup_nlp/markup/__pycache__/EntityMarkup.cpython-37.pyc b/bundesdata_markup_nlp/markup/__pycache__/EntityMarkup.cpython-37.pyc new file mode 100644 index 0000000..cb5f72a Binary files /dev/null and b/bundesdata_markup_nlp/markup/__pycache__/EntityMarkup.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/markup/__pycache__/MdBData.cpython-37.pyc b/bundesdata_markup_nlp/markup/__pycache__/MdBData.cpython-37.pyc new file mode 100644 index 0000000..96ae2c8 Binary files /dev/null and b/bundesdata_markup_nlp/markup/__pycache__/MdBData.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/markup/__pycache__/MetadataMarkup.cpython-37.pyc b/bundesdata_markup_nlp/markup/__pycache__/MetadataMarkup.cpython-37.pyc new file mode 100644 index 0000000..154ab8d Binary files /dev/null and b/bundesdata_markup_nlp/markup/__pycache__/MetadataMarkup.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/markup/__pycache__/SpeakerMarkup.cpython-37.pyc b/bundesdata_markup_nlp/markup/__pycache__/SpeakerMarkup.cpython-37.pyc new file mode 100644 index 0000000..54095c4 Binary files /dev/null and b/bundesdata_markup_nlp/markup/__pycache__/SpeakerMarkup.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/markup/__pycache__/SpeakerNameMarkup.cpython-37.pyc b/bundesdata_markup_nlp/markup/__pycache__/SpeakerNameMarkup.cpython-37.pyc new file mode 100644 index 0000000..1d1f9eb Binary files /dev/null and b/bundesdata_markup_nlp/markup/__pycache__/SpeakerNameMarkup.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/markup/__pycache__/__init__.cpython-37.pyc b/bundesdata_markup_nlp/markup/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..b578973 Binary files /dev/null and b/bundesdata_markup_nlp/markup/__pycache__/__init__.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/markup/__pycache__/beautify_markup.cpython-37.pyc b/bundesdata_markup_nlp/markup/__pycache__/beautify_markup.cpython-37.pyc new file mode 100644 index 0000000..b782155 Binary files /dev/null and b/bundesdata_markup_nlp/markup/__pycache__/beautify_markup.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/markup/__pycache__/metadata.cpython-37.pyc b/bundesdata_markup_nlp/markup/__pycache__/metadata.cpython-37.pyc new file mode 100644 index 0000000..c603f9e Binary files /dev/null and b/bundesdata_markup_nlp/markup/__pycache__/metadata.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/markup/__pycache__/speaker_names.cpython-37.pyc
b/bundesdata_markup_nlp/markup/__pycache__/speaker_names.cpython-37.pyc new file mode 100644 index 0000000..f196a4f Binary files /dev/null and b/bundesdata_markup_nlp/markup/__pycache__/speaker_names.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/markup/__pycache__/speakers.cpython-37.pyc b/bundesdata_markup_nlp/markup/__pycache__/speakers.cpython-37.pyc new file mode 100644 index 0000000..445735b Binary files /dev/null and b/bundesdata_markup_nlp/markup/__pycache__/speakers.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/markup/__pycache__/speeches.cpython-37.pyc b/bundesdata_markup_nlp/markup/__pycache__/speeches.cpython-37.pyc new file mode 100644 index 0000000..521ee5f Binary files /dev/null and b/bundesdata_markup_nlp/markup/__pycache__/speeches.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/markup/beautify_markup.py b/bundesdata_markup_nlp/markup/beautify_markup.py new file mode 100755 index 0000000..39f1b80 --- /dev/null +++ b/bundesdata_markup_nlp/markup/beautify_markup.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from utility.FileGetter import FileGetter +from utility.XMLProtocol import XMLProtocol +import configparser +from tqdm import tqdm + + +def beautify_xml(case, alter_lines=False, line_width=0): + """ + Beautifies the xml protocols so that they are easily readable by humans. + Uses .beautify_xml_part() and .beautify_xml() to be able to format lines for + specific parts of an xml. alter_lines can be set to False or True. The line + width that will be used if alter_lines is True can be set to any value + between 0 and 160. + """ + config = configparser.ConfigParser() + config.read("config.ini") + if(case == "markup"): + output_path = config["File paths"]["output_folder"] + input_path = config["File paths"]["clear_speech_markup"] + key_name = "beautiful_xml" + elif(case == "nlp"): + output_path = config["File paths"]["nlp_output"] + input_path = config["File paths"]["nlp_lemmatized_tokenized"] + key_name = "nlp_beuatiful_xml" + files = FileGetter(input_path, "*.xml") + files = files.get_files() + for file_path in tqdm(sorted(files), desc="First beautification steps"): + xml = XMLProtocol() + xml.read_xml(file_path) + xml.beautify_xml_part(file_path, ".//vorspann") + xml.replace_elements(".//vorspann", [xml.beautified_part]) + xml.beautify_xml_part(file_path, ".//sitzungsverlauf", alter_lines, + line_width) + xml.replace_elements(".//sitzungsverlauf", [xml.beautified_part]) + xml.save_to_file(output_path, file_path, key_name, + "File paths", key_name) + config.read("config.ini") + beautiful_xmls_path = config["File paths"][key_name] + files = FileGetter(beautiful_xmls_path, "*.xml") + files = files.get_files() + for file_path in tqdm(files, desc="Second beautification steps"): + xml.beautify_xml(file_path, False) + + +if __name__ == '__main__': + beautify_xml() diff --git a/bundesdata_markup_nlp/markup/metadata.py b/bundesdata_markup_nlp/markup/metadata.py new file mode 100755 index 0000000..28a6ae2 --- /dev/null +++ b/bundesdata_markup_nlp/markup/metadata.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from utility.FileGetter import FileGetter +from markup.MetadataMarkup import MetadataMarkup +from tqdm import tqdm +import os +import configparser +import logging + + +def get_metadata(): + """ + This script creates a valid metadata head and first level xml tag structure + for all files in one directory with subdirs. It needs all filepaths for all + files to consider.
File paths will be extracted by using the FileGetter + class. + After that it extracts the given metadata for each file and writes it as + valid XML according to the new official schema into a new file at the given + output path. + """ + logger = logging.getLogger(__name__) + print("Running metadata creation for original XML protocols.") + config = configparser.ConfigParser() + config.read("config.ini") + input_path = config["File paths"]["input_folder_xmls"] + output_path = config["File paths"]["output_folder"] + Files = FileGetter(input_path, "*.xml") + file_list = Files.get_files() + metadata = MetadataMarkup() + for file in tqdm(sorted(file_list), desc="Metadata status:"): + logger.info("\nCreating metadata for: " + str(os.path.basename(file))) + root = metadata.read_protcol(file) + metadata.extract_metadata(root) + metadata.built_iso_date(metadata.datum_ger_non_iso) + metadata.built_date_string(metadata.datum_iso) + metadata.delete_old_metadata(root) + metadata.split_content(root) + metadata.insert_new_metadata(root) + metadata.get_session_times() + metadata.write_to_attr("dbtplenarprotokoll", "sitzung-datum", + metadata.datum_ger_non_iso) + metadata.write_to_attr("dbtplenarprotokoll", "sitzung-start-uhrzeit", + metadata.session_start_time) + metadata.write_to_attr("dbtplenarprotokoll", "sitzung-ende-uhrzeit", + metadata.session_end_time) + metadata.write_to_attr("dbtplenarprotokoll", "sitzungs-nr", + metadata.sitzungsnr) + metadata.write_to_attr("dbtplenarprotokoll", "wahlperiode", + metadata.wahlperiode) + metadata.save_to_file(output_path, file, "new_metadata", "File paths", "new_metadata") + logger.info("New metadata created for: " + str(os.path.basename(file))) + print("Successfully extracted and wrote new metadata to the XML protocols.") + + +if __name__ == '__main__': + get_metadata() diff --git a/bundesdata_markup_nlp/markup/speaker_names.py b/bundesdata_markup_nlp/markup/speaker_names.py new file mode 100755 index 0000000..cd5c8ed --- /dev/null +++ b/bundesdata_markup_nlp/markup/speaker_names.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from markup.SpeakerNameMarkup import SpeakerNameMarkup +from markup.MdBData import MdBData +from utility.FileGetter import FileGetter +from xml.etree import ElementTree +from tqdm import tqdm +import os +import configparser +import logging + + +def get_names(): + """ + This script gets the identified speaker elements. It will analyse the text + of those to determine <vorname>, <nachname>, @id etc. for every speaker. + Also creates a speech id for every speech.
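+ Hypothetical example: a speaker string like "Dr. Gregor Gysi + (DIE LINKE):" should end up with <vorname>Gregor</vorname>, + <nachname>Gysi</nachname> and, after cross referencing the + Stammdatenbank, the matching @id.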
+ """ + ### + # Setting paths in config and start logging + ### + logger = logging.getLogger(__name__) + config = configparser.ConfigParser() + config.read("config.ini") + xml_path = config["File paths"]["new_simple_markup"] + output_path = config["File paths"]["output_folder"] + parent_path = os.path.dirname(os.getcwd()) + stammdatenbank_full_path = os.path.join(parent_path, + "data/MdB_data/MdB_Stammdaten.xml") + ### + # opens and reads Stammdatenbank + ### + stammdatenbank = MdBData() + stammdatenbank.read_xml(stammdatenbank_full_path) + ### + # Getting sets of different name name/MdB features + ### + # getting first names + first_names = stammdatenbank.get_set(".//VORNAME", stammdatenbank.xml_tree) + first_names.discard(None) + # getting las names + last_names = stammdatenbank.get_set(".//NACHNAME", stammdatenbank.xml_tree) + last_names.discard(None) + # getting academic titles + academic_titles = stammdatenbank.get_set(".//AKAD_TITEL", + stammdatenbank.xml_tree) + academic_titles_short = stammdatenbank.get_set(".//ANREDE_TITEL", + stammdatenbank.xml_tree) + additional_academic_titles = [title for title in config["Additional name features"]["academic_titles"].split()] + for title in additional_academic_titles: + academic_titles.add(title) + academic_titles = academic_titles.union(academic_titles_short) + academic_titles.discard(None) + # getting parties + parties = stammdatenbank.get_set(".//PARTEI_KURZ", stammdatenbank.xml_tree) + additional_parties = [party for party in config["Additional name features"]["parties"].split()] + for party in additional_parties: + parties.add(party) + parties.discard(None) + # getting name affixes + name_affixes = stammdatenbank.get_set(".//PRAEFIX", stammdatenbank.xml_tree) + name_affixes.discard(None) + # getting cities + cities = stammdatenbank.get_set(".//ORTSZUSATZ", stammdatenbank.xml_tree) + cities.discard(None) + # setting empty sets to later combine them with XML node names for XPaths + party = set() # + periode = set() # + feature_complete = set() # + speaker_id = set() # + role_long = set() + role_short = set() + ### + # creating dict with tuples of sets and corresponding XML node name + ### + sets = [(first_names, "VORNAME"), (last_names, "NACHNAME"), + (academic_titles, "AKAD_TITEL"), (parties, "PARTEI_KURZ"), + (name_affixes, "PRAEFIX"), (cities, "ORTSZUSATZ"), + (party, "PARTEI_KURZ"), (periode, "WP"), (feature_complete, "None"), + (speaker_id, "ID"), (role_long, "None"), (role_short, "None")] + features = ["vorname", "nachname", "titel", "fraktion", "namenszusatz", + "ortszusatz", "partei", "wahlperiode", "feature_complete", + "id", "rolle_lang", "rolle_kurz"] + feature_set_dict = dict(zip(features, sets)) + ### + # opening XML protocolls + # starting speaker markup for features + ### + files = FileGetter(xml_path, "*.xml") + files = files.get_files() + for file_path in tqdm(sorted(files), + desc="File status"): + complex_speaker = SpeakerNameMarkup(file_path, ".//redner") + complex_speaker.read_xml(file_path) + complex_speaker.get_element_text() + logger.info(("Doing cross reference markup for names to get redner ids." 
+ + " For file: " + + os.path.basename(file_path))) + complex_speaker.cross_reference_markup(complex_speaker.current_strings, + feature_set_dict, + stammdatenbank.xml_tree) + complex_speaker.create_speaker_elements() + complex_speaker.replace_elements(".//redner", + complex_speaker.all_speaker_elements, + True) + xml_string = ElementTree.tostring(complex_speaker.xml_tree) + bool = complex_speaker.simple_check_xml(xml_string, file_path, False, + False) + if(bool is False): + logger.error(("This XML file is not well-formed. Program stopped." + " Fix or remove this file an run the program again." + )) + print("Program has stopped. See logs for more info.") + break + complex_speaker.set_speech_ids() + complex_speaker.save_to_file(output_path, file_path, "complex_markup", + "File paths", "complex_markup") + + +if __name__ == '__main__': + get_names() diff --git a/bundesdata_markup_nlp/markup/speakers.py b/bundesdata_markup_nlp/markup/speakers.py new file mode 100755 index 0000000..33b260a --- /dev/null +++ b/bundesdata_markup_nlp/markup/speakers.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from utility.FileGetter import FileGetter +from utility.XMLProtocol import XMLProtocol +from markup.EntityMarkup import EntityMarkup +from markup.SpeakerMarkup import SpeakerMarkup +from tqdm import tqdm +import configparser +import logging +import os + + +def get_speakers(): + """ + This script identifies speakers in one xml with the new metadata structure + created by metastructure.py and applies well-formed XML markup to them and their + speeches. The markup trys to follow the official guideline from the Deutsche + Bundesregierung but is more simplistic and deviates from it when it comes down + to apply markup to the presiden of a session. This decision was made to + guarantee that every speakers speech only contains what he or she is saying. + Thus the markup follows the own minimal markup defined in the DTD + 'minimal_markup.dtd' which trys to mimic the official one as close as + possible. The full offical markup cannot be applied to the XML protocolls + automatically. Script uses classes and subclasses from EntityMarkup.py. + """ + logger = logging.getLogger(__name__) + print("Running simple markup for first speaker identification.") + config = configparser.ConfigParser() + config.read("config.ini") + regex_conf_triples = config.items("Regular expressions speakers") + regex_conf_triples = [regex[1].split(" ; ") for regex in regex_conf_triples] + input_path = config["File paths"]["new_metadata"] + output_path = config["File paths"]["output_folder"] + files = FileGetter(input_path, "*.xml") + file_list = files.get_files() + sum_matches = 0 + + for file_path in tqdm(sorted(file_list), desc="Speaker markup status"): + + identified = EntityMarkup(file_path) + logger.info("Doing simple markup for: " + str(os.path.basename(file_path))) + logger.info("\nMarkup status for: " + str(os.path.basename(file_path))) + with open(file_path, 'r') as f: + xml_as_string = f.read() + xml_as_bytes = xml_as_string.encode("utf-8") + bool = identified.simple_check_xml(xml_as_bytes, file_path, False, + False) + if(bool is False): + logger.error(("This XML file is not well-formed. Program stopped." + " Fix or remove this file an run the program again." + )) + print("Program has stopped. 
See logs for more info.") + break + identified.read_xml(file_path) + identified.get_element_text() + string_for_markup = identified.current_string + # Start of simple markup + for regex_conf_triplet in regex_conf_triples: + regex = regex_conf_triplet[0] + case = regex_conf_triplet[1] + speaker = SpeakerMarkup(string_for_markup, regex) + speaker.compile_regex(regex) + speaker.identify_speaker() + speaker.markup_speaker(case) + string_for_markup = speaker.markuped_string + sum_matches += speaker.matches_count + + logger.info(str(sum_matches) + " total matches in the protocol.") + sum_matches = 0 + speaker.simple_check_xml(string_for_markup, file_path, False) + # Saving simple markuped string to xml + speaker.read_xml(file_path) + speaker.replace_string(string_for_markup, "sitzungsverlauf") + speaker.save_to_file(output_path, file_path, "simple_xml", "File paths", + "new_simple_markup") + + print("Simple markup finished.") + + config.read("config.ini") + new_simple_xml_path = config["File paths"]["new_simple_markup"] + # Start of president Replacer + new_files = FileGetter(new_simple_xml_path, "*.xml") + new_file_list = new_files.get_files() + print("Replacing some XML-elements in the protocolls.") + for file_path in tqdm(sorted(new_file_list), desc="Files replacement status"): + logger.info("Replacing some xml elements for: " + str(os.path.basename(file_path))) + for regex_conf_triplet in regex_conf_triples: + if(regex_conf_triplet[1] != "first" + or regex_conf_triplet[1] != "last"): + regex = regex_conf_triplet[0] + speaker_rolle_value = regex_conf_triplet[2] + replacements = XMLProtocol() + replacements.read_xml(file_path) + replacements.compile_regex(regex) + replacements.expand_element(".//rede", "typ", + speaker_rolle_value) + replacements.save_to_file(output_path, file_path, "simple_xml", + "File paths", "new_simple_markup") + start_time_attr_value = replacements.xml_tree.get("sitzung-start-uhrzeit") + replacements.replace_tag_attr(".//sitzungsverlauf/rede[1]", + "sitzungsbeginn", + "sitzung-start-uhrzeit", + start_time_attr_value, + False) + end_time_attr_value = replacements.xml_tree.get("sitzung-ende-uhrzeit") + replacements.expand_element(".//sitzungsende", "sitzung-ende-uhrzeit", + end_time_attr_value, False) + replacements.save_to_file(output_path, file_path, "simple_xml", + "File paths", "new_simple_markup") + + +if __name__ == '__main__': + get_speakers() diff --git a/bundesdata_markup_nlp/markup/speeches.py b/bundesdata_markup_nlp/markup/speeches.py new file mode 100755 index 0000000..4faa930 --- /dev/null +++ b/bundesdata_markup_nlp/markup/speeches.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from utility.FileGetter import FileGetter +from markup.EntityMarkup import EntityMarkup +import configparser +from tqdm import tqdm +import logging + +def markup_speeches(): + """ + Marks up different entitys in the speech strings. For example comments. + First it marks speech parts (
<p> </p>
) line by line. + """ + logger = logging.getLogger(__name__) + config = configparser.ConfigParser() + config.read("config.ini") + complex_xmls = config["File paths"]["complex_markup"] + output_path = config["File paths"]["output_folder"] + regex_conf_pairs = config.items("Regular expressions speeches") + regex_conf_pairs = [regex[1].split(" ; ") for regex in regex_conf_pairs] + multiline_entities = config.items("Multiline entities") + multiline_entities = [regex[1].split(" ; ") for regex in multiline_entities] + files = FileGetter(complex_xmls, "*.xml") + file_list = files.get_files() + for file_path in tqdm(sorted(file_list), desc="File status speech markup"): + entity = EntityMarkup(file_path) + entity.read_xml(file_path) + speeches = entity.xml_tree.xpath(".//rede") + session_start = entity.xml_tree.xpath(".//sitzungsbeginn")[0] + for speech in speeches: + entity.markup_speech_lines(speech) + entity.markup_speech_lines(session_start) + + session_lines = entity.xml_tree.xpath(".//p") + for line in tqdm(session_lines, desc="Marking single line entities"): + for pair in regex_conf_pairs: + entity.inject_element(line, pair[0], pair[1]) + + session_lines = entity.xml_tree.xpath(".//p") # gets new altered session lines (
<p> </p>
) for pair in tqdm(multiline_entities, desc="Marking multiline entities:"): + entity.get_multiline_entities(session_lines, pair[0], pair[1], pair[2]) + # For logging + all_entities = 0 + only_single_line_entities = 0 + for pair in regex_conf_pairs: + element_path = ".//" + pair[1] + nr_entities = len(entity.xml_tree.xpath(element_path)) + logger.info(("Number of identified " + pair[1] + " elements is: " + + str(nr_entities) + + " (single line)")) + all_entities += nr_entities + only_single_line_entities += nr_entities + + for pair in multiline_entities: + element_path = ".//" + pair[2] + nr_entities = len(entity.xml_tree.xpath(element_path)) + logger.info(("Number of identified " + pair[2] + " elements is: " + + str(nr_entities) + + " (multi line)")) + all_entities += nr_entities + + logger.info(("Number of all identified single line entities: " + + str(only_single_line_entities))) + + logger.info(("Number of all identified entities is: " + str(all_entities) + + " Also includes multiline matches. The number could be higher" + + " than it is if multiline matches match the same text" + + " as the single line entity regexes.")) + + entity.save_to_file(output_path, file_path, "clear_speech_markup", + "File paths", "clear_speech_markup") + + +if __name__ == '__main__': + markup_speeches() diff --git a/bundesdata_markup_nlp/nlp/__pycache__/lemmatization.cpython-37.pyc b/bundesdata_markup_nlp/nlp/__pycache__/lemmatization.cpython-37.pyc new file mode 100644 index 0000000..86cc008 Binary files /dev/null and b/bundesdata_markup_nlp/nlp/__pycache__/lemmatization.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/nlp/__pycache__/n_grams.cpython-37.pyc b/bundesdata_markup_nlp/nlp/__pycache__/n_grams.cpython-37.pyc new file mode 100644 index 0000000..0112037 Binary files /dev/null and b/bundesdata_markup_nlp/nlp/__pycache__/n_grams.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/nlp/__pycache__/tokenize.cpython-37.pyc b/bundesdata_markup_nlp/nlp/__pycache__/tokenize.cpython-37.pyc new file mode 100644 index 0000000..64d778a Binary files /dev/null and b/bundesdata_markup_nlp/nlp/__pycache__/tokenize.cpython-37.pyc differ diff --git a/bundesdata_markup_nlp/nlp/lemmatization.py b/bundesdata_markup_nlp/nlp/lemmatization.py new file mode 100755 index 0000000..9085855 --- /dev/null +++ b/bundesdata_markup_nlp/nlp/lemmatization.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import de_core_news_sm +import configparser +from utility.XMLProtocol import XMLProtocol +from lxml import etree +from tqdm import tqdm +import re + + +def lemmatization(files, no_stop_words=False): + """ + Lemmatizes the speeches of the input XML protocols with the built-in spacy + lookup-table function. Can include or exclude stop words. + The lemmatized text will be written into a new element named + <rede_lemmatisiert>. Always removes punctuation. Joins hyphenated strings + before they will be lemmatized. + """ + nlp = de_core_news_sm.load() + config = configparser.ConfigParser() + config.read("config.ini") + output_path = config["File paths"]["nlp_output"] + for file_path in tqdm(sorted(files), desc="Lemmatization file status"): + xml = XMLProtocol() + xml.read_xml(file_path) + speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn") + for speech in speeches: + parts = speech.xpath(".//p") + tmp_list = [] + for part in parts: + if(part.text is not None): + tmp_list.append(re.sub(r"_", " ", str(part.text + "\n"))) + """ + replaces "_" with " ".
Is needed because a string like + "Treffsicherheit einer Schrotflinte;_Sie haben nämlich kaum + den Punkt getroffen" will not be lemmatized correctly in spacy. + "Schrotflinte;_Sie" will be recognized as one token. + Furthermore this messes up the sorted ngram calculation. + Also adds \n at the end of every line to help identifying + hyphenated words. + """ + part.getparent().remove(part) + new_text = "".join(tmp_list) + new_text = re.sub(r"(?P<first>[a-zßüöä])(?P<hyphen>\-\n)(?P<last>[a-zßäüö])", "\g<first>\g<last>", new_text) + """ + joins hyphenated words together: + 'Länderfinanz- ausgleich' --> Länderfinanzausgleich. + Better to do it here because most of the comments and metadata have + already been marked. + Ignores strings like: 'Finanz-, Handels- und Sicherheitspolitik'. + Does not ignore them when they happen at a linebreak. This is a rare + occasion though. + """ + new_text = re.sub(r"(?P<first>[a-zßüöä])(?P<hyphen>\-\n)(?P<last>[A-ZÄÜÖ])", "\g<first>-\g<last>", new_text) + """ + Removes all line breaks again. This way compound names with a line + break in between like "Sütterlin-\nWaack" will be recognized as one + string by spacy. --> Sütterlin-Waack + """ + lemmatized_speech = etree.Element("rede_lemmatisiert") + doc = nlp(new_text) + if(no_stop_words is False): + lemmatized = " ".join([token.lemma_ for token in doc + if token.pos_ != "PUNCT" and token.text != "_"]) + """ + Removes "_" from the text. Has to be removed + because it is some kind of special + character in spacy. + """ + filename_sufix = "_lemmatized_with_stopwords.xml" + elif(no_stop_words is True): + lemmatized = " ".join([token.lemma_ for token in doc + if token.is_stop is False + and token.pos_ != "PUNCT" and token.text != "_"]) + filename_sufix = "_lemmatized_without_stopwords.xml" + lemmatized_speech.text = lemmatized + speech.append(lemmatized_speech) + xml.save_to_file(output_path, file_path, "lemmatized", "File paths", + "nlp_lemmatized_tokenized", filename_sufix) + + +if __name__ == '__main__': + lemmatization() diff --git a/bundesdata_markup_nlp/nlp/n_grams.py b/bundesdata_markup_nlp/nlp/n_grams.py new file mode 100755 index 0000000..9410200 --- /dev/null +++ b/bundesdata_markup_nlp/nlp/n_grams.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import configparser +import csv +import os +import gc +from utility.XMLProtocol import XMLProtocol +from collections import Counter +from tqdm import tqdm +from sklearn.feature_extraction.text import CountVectorizer +from itertools import groupby, chain +from operator import itemgetter +import locale +locale.setlocale(locale.LC_COLLATE, "C") # Sets locale to portable "C" locale. + + +def n_grams(files, group_by_feature="year", + input_type_name="lemmatized_without_stopwords"): + """ + Calculates 1 to 5 grams for the given input protocols. Can handle either + lemmatized or non lemmatized files. Writes the ngrams to a tab separated csv + file. One row includes the ngram, the match count of it, the year or date, + or rede_id or redner_id. One file per unigram, bigram, trigram etc. per + group key will be created. (There will be one file for unigrams starting with + the letter 'A', one for unigrams starting with 'B' etc.) + The third parameter is a string set by the user which will be added to + the file names to help distinguish lemmatized and non lemmatized ngrams etc. + The more protocols are used as input the more RAM the script needs. + For all 4106 protocols 32GB of RAM with a 32GB swap file was used!
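+ A hypothetical output row for a unigram grouped by year (columns + are ngram, group key and match count, tab separated): + Deutschland 1998 42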
+ """ + config = configparser.ConfigParser() + config.read("config.ini") + output_path = config["File paths"]["nlp_output"] + output_path = os.path.join(output_path, "n-grams") + if not os.path.exists(output_path): + os.mkdir(output_path) + for step in tqdm(range(6)[1:], desc="Current ngram calculating"): + N_GRAMS = [] + file_name_prefix = str(step) + "_grams" + counter_vectorizer = CountVectorizer(ngram_range=(step, step), + lowercase=False) + for file_path in tqdm(sorted(files), desc="File status"): + xml = XMLProtocol() + xml.read_xml(file_path) + feature_year = xml.xml_tree.xpath("@sitzung-datum")[0][-4:] + feature_mont_year = xml.xml_tree.xpath("@sitzung-datum")[0][-7:] + speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn") + for speech in speeches: + # gets id of current speech + feature_rede_id = speech.xpath("@id") + if(len(feature_rede_id) == 0): + feature_rede_id = "sitzungsbeginn" + else: + feature_rede_id = feature_rede_id[0] + # gets id of current speaker + feature_redner_id = speech.xpath(".//redner/@id")[0] + # gets speech text from tokenized or lemmatized protocol + speech_text = speech.xpath("node()[2]")[0] # gets second child of speech + if(speech_text.text is not None): + tmp_str = speech_text.text + + ngrams = counter_vectorizer.build_analyzer() + ngrams_list = ngrams(tmp_str) + + if(group_by_feature == "year"): + pairs = [(pair,) + (feature_year,) for pair + in ngrams_list] + elif(group_by_feature == "month_year"): + pairs = [(pair,) + (feature_mont_year,) for pair + in ngrams_list] + elif(group_by_feature == "speaker"): + pairs = [(pair,) + (feature_redner_id,) for pair + in ngrams_list] + elif(group_by_feature == "speech"): + pairs = [(pair,) + (feature_rede_id,) for pair + in ngrams_list] + N_GRAMS.extend(pairs) + speeches = None + # puts uppercase ngram at first position in line to sort by this + # will be delted later on + print("Start counting ngrams.") + N_GRAMS = Counter(N_GRAMS) + print("Finished counting ngrams.") + print("Start sorting ngrams") + N_GRAMS = [item[0][0][0].upper() + + "||" + + item[0][0] + + "||" + + str(item[0][1]) + + "||" + + str(item[1]) + for item in N_GRAMS.items()] + N_GRAMS = sorted(N_GRAMS, key=locale.strxfrm) + print("Finished sorting ngrams") + # sorts all ngrams into groups one group for each german uppercasse + # letter except ß + # Also one group for every decimal from 0 to 10 + # Other non ascii or non decimal ngrams will be sorted in own groups + # These groups will be joined together later on into one non ascii group + alphabetically = [] + tmp_list = [] + for letter, entries in tqdm(groupby(N_GRAMS, key=itemgetter(0)), + desc="Grouping ngrams alphabetically"): + if(letter): + print(letter) + for entry in entries: + tmp_list.append(entry) + alphabetically.append(tmp_list) + tmp_list = [] + N_GRAMS = None + gc.collect() # frees RAM + key_list = ([i for i in range(10)] + + "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split() + + ["_Non_ASCII"]) + # groups all non ascii ngrams into one list to save them into one csv + if(len(alphabetically) > 37): + joined_tail = alphabetically[36:] + joined_tail = chain.from_iterable(list(joined_tail)) + del alphabetically[36:] + alphabetically.append(joined_tail) + # save groups to individual files + for group, key in tqdm(zip(alphabetically, key_list), + desc="Writing ngrams to files"): + group_ngrams = [entry.split("||")[1:] for entry in group] + file_name = (str(key) + + "_" + + file_name_prefix + + "_per_" + + group_by_feature + + "_" + + input_type_name + + ".csv") + 
file_output_path = os.path.join(output_path, file_name) + with open(file_output_path, "w", newline="", encoding="utf8") as file: + writer = csv.writer(file, delimiter="\t") + writer.writerows(group_ngrams) + alphabetically = None + + +if __name__ == '__main__': + n_grams() diff --git a/bundesdata_markup_nlp/nlp/tokenize.py b/bundesdata_markup_nlp/nlp/tokenize.py new file mode 100755 index 0000000..7fdbe14 --- /dev/null +++ b/bundesdata_markup_nlp/nlp/tokenize.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import de_core_news_sm +import configparser +from utility.XMLProtocol import XMLProtocol +from lxml import etree +from tqdm import tqdm +import re + + +def tokenize(files, no_stop_words=False): + """ + Tokenizes the speeches of the input XML protocols. Can include or exclude + stop words. Tokenized speeches will be written into a new element named + <rede_tokenisiert>. Always removes punctuation. Joins hyphenated strings + before they will be tokenized. + """ + nlp = de_core_news_sm.load() + config = configparser.ConfigParser() + config.read("config.ini") + output_path = config["File paths"]["nlp_output"] + for file_path in tqdm(sorted(files), desc="Tokenization file status"): + xml = XMLProtocol() + xml.read_xml(file_path) + speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn") + for speech in speeches: + parts = speech.xpath(".//p") + tmp_list = [] + for part in parts: + if(part.text is not None): + tmp_list.append(re.sub(r"_", " ", str(part.text + "\n"))) + """ + replaces "_" with " ". Is needed because a string like + "Treffsicherheit einer Schrotflinte;_Sie haben nämlich kaum + den Punkt getroffen" will not be lemmatized correctly in spacy. + "Schrotflinte;_Sie" will be recognized as one token. + Furthermore this messes up the sorted ngram calculation. + Also adds \n at the end of every line to help identifying + hyphenated words. + """ + part.getparent().remove(part) + new_text = "".join(tmp_list) + new_text = re.sub(r"(?P<first>[a-zßüöä])(?P<hyphen>\-\n)(?P<last>[a-zßäüö])", "\g<first>\g<last>", new_text) + """ + joins hyphenated words together: + 'Länderfinanz- ausgleich' --> Länderfinanzausgleich. + Better to do it here because most of the comments and metadata have + already been marked. + Ignores strings like: 'Finanz-, Handels- und Sicherheitspolitik'. + Does not ignore them when they happen at a linebreak. This is a rare + occasion though. + """ + new_text = re.sub(r"(?P<first>[a-zßüöä])(?P<hyphen>\-\n)(?P<last>[A-ZÄÜÖ])", "\g<first>-\g<last>", new_text) + """ + Removes all line breaks again. This way compound names with a line + break in between like "Sütterlin-\nWaack" will be recognized as one + string by spacy.
--> Sütterlin-Waack + """ + tokenized_speech = etree.Element("rede_tokenisiert") + doc = nlp(new_text) + if(no_stop_words is False): + tokenized = " ".join([token.text for token in doc + if token.pos_ != "PUNCT"]) + filename_sufix = "_tokenized_with_stopwords.xml" + elif(no_stop_words is True): + tokenized = " ".join([token.text for token in doc + if token.is_stop is False + and token.pos_ != "PUNCT"]) + filename_sufix = "_tokenized_without_stopwords.xml" + tokenized_speech.text = tokenized + speech.append(tokenized_speech) + xml.save_to_file(output_path, file_path, "tokenized", "File paths", + "nlp_lemmatized_tokenized", filename_sufix) + + +if __name__ == '__main__': + tokenize() diff --git a/bundesdata_markup_nlp/samples/__init__.py b/bundesdata_markup_nlp/samples/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/bundesdata_markup_nlp/samples/create_samples.py b/bundesdata_markup_nlp/samples/create_samples.py new file mode 100755 index 0000000..0fe5928 --- /dev/null +++ b/bundesdata_markup_nlp/samples/create_samples.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import fnmatch +import argparse +import random +import shutil + +""" +This is just a quick script to get randomized samples from the protocols. +""" + + +def parse_arguments(): + """Argument Parser""" + parser = argparse.ArgumentParser(description="Creates samples from given \ + directory with given size. Creates two \ + samples with no overlap.") + parser.add_argument("-p", + "--path", + help="Path to data files to create sample from.", + required=True, + type=str, + metavar="") + parser.add_argument("-s", + "--size", + help="Size of sample.", + required=True, + type=int, + metavar="") + parser.add_argument("-n", "--number_of_samples", + help="How many samples should be \ + created?", + required=True, + type=int, + metavar="") + parser.add_argument("-t", + "--file_type", + help="What file types should be used as the base for \ + the sample? Accepts wildcards.", + required=True, + type=str) + args = parser.parse_args() + return args + + +def get_files(path, file_type): + """Creates a file list with the full paths of all files in the given directory and + its sub directories and returns it.""" + list_of_files = [] + for path, subdirs, files in os.walk(path): + for name in files: + if fnmatch.fnmatch(name, file_type): + list_of_files.append(os.path.join(path, name)) + return list_of_files + + +def get_files_to_copy(list_of_files, sample_size): + """Gets random filepaths from all filepaths to create a sample out of those.
diff --git a/bundesdata_markup_nlp/samples/__init__.py b/bundesdata_markup_nlp/samples/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/bundesdata_markup_nlp/samples/create_samples.py b/bundesdata_markup_nlp/samples/create_samples.py
new file mode 100755
index 0000000..0fe5928
--- /dev/null
+++ b/bundesdata_markup_nlp/samples/create_samples.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import fnmatch
+import argparse
+import random
+import shutil
+
+"""
+This is just a quick script to get randomized samples from the protocols.
+"""
+
+
+def parse_arguments():
+    """Argument Parser"""
+    parser = argparse.ArgumentParser(description="Creates samples of the \
+                                     given size from the given directory. \
+                                     The samples do not overlap.")
+    parser.add_argument("-p",
+                        "--path",
+                        help="Path to data files to create sample from.",
+                        required=True,
+                        type=str,
+                        metavar="")
+    parser.add_argument("-s",
+                        "--size",
+                        help="Size of each sample.",
+                        required=True,
+                        type=int,
+                        metavar="")
+    parser.add_argument("-n", "--number_of_samples",
+                        help="How many samples should be created?",
+                        required=True,
+                        type=int,
+                        metavar="")
+    parser.add_argument("-t",
+                        "--file_type",
+                        help="What file types should be used as the base for \
+                             the sample? Accepts wildcards.",
+                        required=True,
+                        type=str)
+    args = parser.parse_args()
+    return args
+
+
+def get_files(path, file_type):
+    """Creates a file list with the full paths of all files in the given
+    directory and its subdirectories and returns it."""
+    list_of_files = []
+    for path, subdirs, files in os.walk(path):
+        for name in files:
+            if fnmatch.fnmatch(name, file_type):
+                list_of_files.append(os.path.join(path, name))
+    return list_of_files
+
+
+def get_files_to_copy(list_of_files, sample_size):
+    """Draws random file paths from all file paths to create a sample.
+    File paths that have already been used are removed from the file list to
+    create independent samples."""
+    counter = 0
+    sample_list = []
+    while counter < sample_size:
+        counter += 1
+        random_index = random.randint(0, len(list_of_files)-1)
+        sample_list.append(list_of_files[random_index])
+        del list_of_files[random_index]
+    return list_of_files, sample_list
+
+
+def copy_files(path, sample_list, step_int):
+    """Copies the given files to new directories."""
+    sample_path = os.path.join(path, str(step_int))
+    print(sample_path)
+    os.mkdir(sample_path)
+    for file in sample_list:
+        shutil.copy2(file, sample_path)
+
+
+def main():
+    args = parse_arguments()
+    path = args.path
+    file_list = get_files(path, args.file_type)
+    for step in range(1, args.number_of_samples + 1):
+        # draw one sample per step; the drawn paths are removed from
+        # file_list, so samples cannot overlap
+        file_list, sample_list = get_files_to_copy(file_list, args.size)
+        copy_files(path, sample_list, step)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/bundesdata_markup_nlp/utility/FileGetter.py b/bundesdata_markup_nlp/utility/FileGetter.py
new file mode 100755
index 0000000..2d12c96
--- /dev/null
+++ b/bundesdata_markup_nlp/utility/FileGetter.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import fnmatch
+
+"""
+This class is for getting the file paths of all files in a given directory.
+Also gets files in subdirectories.
+"""
+
+
+class FileGetter(object):
+    """
+    Class for getting the file paths under a given path, which will be opened
+    and/or further processed later on.
+    """
+
+    def __init__(self, path, file_type):
+        super(FileGetter, self).__init__()
+        self.path = path
+        self.file_type = file_type
+
+    def get_files(self):
+        """
+        Creates a file list with the full paths of all files in the given
+        directory and its subdirectories and returns it.
+        """
+        list_of_files = []
+        for path, subdirs, files in os.walk(self.path):
+            for name in files:
+                if fnmatch.fnmatch(name, self.file_type):
+                    list_of_files.append(os.path.join(path, name))
+        self.list_of_files = list_of_files
+        return list_of_files
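A minimal usage sketch for FileGetter (the path and pattern are hypothetical; the class itself only wraps os.walk and fnmatch):

    from utility.FileGetter import FileGetter

    files = FileGetter("data/protocols_raw_xml", "*.xml").get_files()
    for file_path in sorted(files):
        print(file_path)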
diff --git a/bundesdata_markup_nlp/utility/XMLProtocol.py b/bundesdata_markup_nlp/utility/XMLProtocol.py
new file mode 100755
index 0000000..28f6b44
--- /dev/null
+++ b/bundesdata_markup_nlp/utility/XMLProtocol.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from utility import delete_folder
+from utility import update_config
+from xml.etree import ElementTree
+from os import path
+from lxml import etree
+import os
+import logging
+import re
+
+
+class XMLProtocol(object):
+    """Class for standard operations on/with the XML protocols. Has functions
+    for reading, saving and manipulating an XML protocol. All other classes
+    inherit from this one.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.logger = logging.getLogger(__name__)
+
+    def read_protcol(self, file_path):
+        """
+        Takes a file path, parses the file as XML and returns the root
+        element.
+        """
+        self.file_path = file_path
+        self.filename = os.path.basename(self.file_path)
+        parser = etree.XMLParser(remove_blank_text=True)
+        self.tree = etree.parse(file_path, parser)  # for better xml indentation
+        root = self.tree.getroot()
+        self.logger.info("File successfully parsed as XML.")
+        return root
+
+    def read_xml(self, file_path):
+        """Takes a file path and parses the file as XML."""
+        parser = etree.XMLParser(encoding='utf-8', remove_blank_text=True)
+        tree = etree.parse(file_path, parser)  # for better xml indentation
+        self.xml_tree = tree.getroot()
+
+    def save_to_file(self, output_path, file_path, subfolder, config_section,
+                     config_key, filename_sufix=""):
+        """
+        Writes the new markup to a new XML file. Takes the output path and
+        creates a new folder there. Also updates the config file with the new
+        path.
+        """
+        if(filename_sufix == ""):
+            self.filename = path.basename(file_path)
+        elif(filename_sufix != ""):
+            self.filename = path.basename(file_path)[:-4] + filename_sufix
+        save_path = os.path.join(output_path, subfolder)
+        if not os.path.exists(save_path):
+            os.mkdir(save_path)
+        tree = etree.ElementTree(self.xml_tree)
+        new_filename = self.filename
+        save_file_path = os.path.join(save_path, new_filename)
+        tree.write(save_file_path,
+                   pretty_print=True,
+                   xml_declaration=True,
+                   encoding="utf8",
+                   doctype="")
+        self.logger.info("New XML saved to: " + save_file_path)
+        update_config.update_config("config.ini", config_section, config_key,
+                                    save_path)
+
+    def beautify_xml_part(self, file_path, xpath, alter_lines=False,
+                          line_width=80):
+        """
+        Beautifies a part (element node) of an input XML.
+        """
+        tmp_path = os.path.join(os.path.dirname(file_path), "tmp")
+        tree = etree.ElementTree(self.xml_tree)
+        self.beautified_part = tree.find(xpath)
+        self.beautified_part = ElementTree.tostring(self.beautified_part)
+        self.beautified_part = etree.fromstring(self.beautified_part)
+        self.beautified_part = etree.ElementTree(self.beautified_part)
+        if not os.path.exists(tmp_path):
+            os.mkdir(tmp_path)
+        tmp_file_path = os.path.join(tmp_path, "tmp.xml")
+        self.beautified_part.write(tmp_file_path,
+                                   pretty_print=True,
+                                   xml_declaration=True,
+                                   encoding="utf8")
+        if(alter_lines is True):
+            os.system("html-beautify -r -q -w {} --no-preserve-newlines {}".format(line_width, tmp_file_path))
+            self.beautified_part = etree.parse(tmp_file_path).getroot()
+        elif(alter_lines is False):
+            os.system("html-beautify -r -q {}".format(tmp_file_path))
+            self.beautified_part = etree.parse(tmp_file_path).getroot()
+        update_config.update_config("config.ini", "File paths", "tmp_path",
+                                    tmp_path)
+        delete_folder.delete_folder(tmp_path)
+
+    def beautify_xml(self, file_path, alter_lines=False, line_width=80):
+        if(alter_lines is True):
+            os.system("html-beautify -r -q -w {} --no-preserve-newlines {}".format(line_width, file_path))
+        elif(alter_lines is False):
+            os.system("html-beautify -r -q {}".format(file_path))
+
+    def expand_element(self, element_to_expand, expand_attr_key,
+                       expand_attr_value, check_child=True):
+        """
+        Takes an XPath expression for an XML element. The tag of this element
+        will be expanded with the given expand_attr_key and expand_attr_value.
+        Also needs a compiled regex (see compile_regex) to determine whether
+        the currently selected element is one that should be expanded: by
+        default, the text of the first child of the current element is
+        checked against the regex. Set check_child to False to skip this
+        check and just expand the current element.
+        """
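+        # Illustrative call (hypothetical element and attribute names): after
+        #     xml.compile_regex(r"Präsident")
+        # a call like
+        #     xml.expand_element(".//sprecher", "rolle", "praesident")
+        # adds rolle="praesident" to every <sprecher> element whose first
+        # child's text matches the compiled regex.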
+        elements = self.xml_tree.findall(element_to_expand)
+        for element in elements:
+            if(check_child is True):
+                first_child = element.getchildren()[0]
+                match = self.regex_compiled.search(first_child.text)
+                if(match):
+                    element.set(expand_attr_key, expand_attr_value)
+            else:
+                element.set(expand_attr_key, expand_attr_value)
+
+    def replace_tag_name(self, element_to_replace, tag_name, check_child=True):
+        """
+        Replaces the tag name of a given element (as XPath) with a new tag
+        name.
+        """
+        elements = self.xml_tree.findall(element_to_replace)
+        for element in elements:
+            if(check_child is True):
+                first_child = element.getchildren()[0]
+                match = self.regex_compiled.search(first_child.text)
+                if(match):
+                    element.tag = tag_name
+            else:
+                element.tag = tag_name
+
+    def replace_tag_attr(self, element_to_replace, tag_name, attr_key,
+                         attr_value, check_child=True):
+        """
+        Replaces the tag name of a given element (as XPath) with a new name
+        and adds an attribute. Can also check whether the child of the
+        current element contains some specific text, as in expand_element.
+        """
+        elements = self.xml_tree.findall(element_to_replace)
+        for element in elements:
+            if(check_child is True):
+                first_child = element.getchildren()[0]
+                match = self.regex_compiled.search(first_child.text)
+                if(match):
+                    element.tag = tag_name
+                    element.set(attr_key, attr_value)
+            else:
+                element.tag = tag_name
+                element.set(attr_key, attr_value)
+
+    def replace_elements(self, elements_to_replace, replacment_elements,
+                         keep_parent_text=False):
+        """
+        Replaces elements identified by XPath with new elements. Can either
+        keep the text of the parent element or not.
+        """
+        elements = self.xml_tree.findall(elements_to_replace)
+        parents_text_xpath = elements_to_replace + "/" + "parent::node()" + "/" + "text()"
+        elements_text = self.xml_tree.xpath(parents_text_xpath)
+        if(len(elements) == len(replacment_elements)):
+            if(keep_parent_text is False):
+                for element, replacement_element in zip(elements, replacment_elements):
+                    element.getparent().replace(element, replacement_element)
+            else:
+                for element, replacement_element in zip(elements, replacment_elements):
+                    element.getparent().replace(element, replacement_element)
+                elements = self.xml_tree.findall(elements_to_replace)
+                for element, text in zip(elements, elements_text):
+                    element.tail = text
+        else:
+            self.logger.warning("Element mismatch. There are "
+                                + str(len(elements))
+                                + " elements that should be replaced, but "
+                                + str(len(replacment_elements))
+                                + " replacement elements are present."
+                                + " No elements have been replaced.")
+
+    def compile_regex(self, regex):
+        """
+        Takes the input regex string and compiles it for better performance
+        and readability.
+        """
+        self.regex_string = regex
+        self.regex_compiled = re.compile(self.regex_string, re.MULTILINE)
+
+    def clean_text(self, regex, xpath, replacement_string=""):
+        """
+        Replaces regex matches with the replacement string (empty by default)
+        in the text of every element matched by the XPath in the xml_tree.
+        Works with match groups.
+        """
+        elements = self.xml_tree.xpath(xpath)
+        for element in elements:
+            replaced = re.sub(regex, replacement_string, element.text)
+            element.text = replaced
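A minimal sketch of how these helpers chain together. File names, the regex, and the config key are hypothetical, and a config.ini with the referenced section must exist because save_to_file updates it:

    from utility.XMLProtocol import XMLProtocol

    xml = XMLProtocol()
    xml.read_xml("data/01001.xml")             # hypothetical input file
    xml.compile_regex(r"Präsident")            # hypothetical pattern
    xml.expand_element(".//sprecher", "rolle", "praesident")
    xml.clean_text(r"\s{2,}", ".//p", " ")     # collapse runs of whitespace
    xml.save_to_file("data/out", "data/01001.xml", "cleaned",
                     "File paths", "clean_xml", "_cleaned.xml")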
diff --git a/bundesdata_markup_nlp/utility/__init__.py b/bundesdata_markup_nlp/utility/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/bundesdata_markup_nlp/utility/__pycache__/FileGetter.cpython-37.pyc b/bundesdata_markup_nlp/utility/__pycache__/FileGetter.cpython-37.pyc
new file mode 100644
index 0000000..496bb85
Binary files /dev/null and b/bundesdata_markup_nlp/utility/__pycache__/FileGetter.cpython-37.pyc differ
diff --git a/bundesdata_markup_nlp/utility/__pycache__/XMLProtocol.cpython-37.pyc b/bundesdata_markup_nlp/utility/__pycache__/XMLProtocol.cpython-37.pyc
new file mode 100644
index 0000000..eeba106
Binary files /dev/null and b/bundesdata_markup_nlp/utility/__pycache__/XMLProtocol.cpython-37.pyc differ
diff --git a/bundesdata_markup_nlp/utility/__pycache__/__init__.cpython-37.pyc b/bundesdata_markup_nlp/utility/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..4ad1076
Binary files /dev/null and b/bundesdata_markup_nlp/utility/__pycache__/__init__.cpython-37.pyc differ
diff --git a/bundesdata_markup_nlp/utility/__pycache__/delete_folder.cpython-37.pyc b/bundesdata_markup_nlp/utility/__pycache__/delete_folder.cpython-37.pyc
new file mode 100644
index 0000000..250b9ae
Binary files /dev/null and b/bundesdata_markup_nlp/utility/__pycache__/delete_folder.cpython-37.pyc differ
diff --git a/bundesdata_markup_nlp/utility/__pycache__/update_config.cpython-37.pyc b/bundesdata_markup_nlp/utility/__pycache__/update_config.cpython-37.pyc
new file mode 100644
index 0000000..574153f
Binary files /dev/null and b/bundesdata_markup_nlp/utility/__pycache__/update_config.cpython-37.pyc differ
diff --git a/bundesdata_markup_nlp/utility/delete_folder.py b/bundesdata_markup_nlp/utility/delete_folder.py
new file mode 100755
index 0000000..3ddbffa
--- /dev/null
+++ b/bundesdata_markup_nlp/utility/delete_folder.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import shutil
+
+
+def delete_folder(folder_path):
+    """
+    Deletes the folder identified by the input folder path string.
+    """
+    shutil.rmtree(folder_path)
+
+
+if __name__ == '__main__':
+    delete_folder()
diff --git a/bundesdata_markup_nlp/utility/move_ngrams.py b/bundesdata_markup_nlp/utility/move_ngrams.py
new file mode 100755
index 0000000..e9c528f
--- /dev/null
+++ b/bundesdata_markup_nlp/utility/move_ngrams.py
@@ -0,0 +1,22 @@
+import os
+
+"""
+Helper script to move n-gram CSVs into separate folders. Just copy this into
+the folder containing the n-grams and execute it. Change n to the number of
+N in the N-grams.
+"""
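+# Assumed (not verified) file layout: the sorted file list interleaves the
+# n-gram orders, e.g. 1_grams_A.csv, 2_grams_A.csv, ..., 5_grams_A.csv,
+# 1_grams_B.csv, ..., so the stride files[step::n] below collects all CSVs
+# of one n-gram order.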
+current_path = os.getcwd()
+files = []
+n = 5
+for file in os.listdir(current_path):
+    if file.endswith(".csv"):
+        files.append(file)
+files = sorted(files)
+
+dir_list = ["1_grams", "2_grams", "3_grams", "4_grams", "5_grams"][:n]
+for dir in dir_list:
+    os.system("mkdir {}".format(dir))
+
+for step, dir in zip(range(0, n), dir_list):
+    for file in files[step::n]:
+        print(file)
+        os.system("mv {} {}".format(file, dir))
diff --git a/bundesdata_markup_nlp/utility/update_config.py b/bundesdata_markup_nlp/utility/update_config.py
new file mode 100755
index 0000000..2b23cbf
--- /dev/null
+++ b/bundesdata_markup_nlp/utility/update_config.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import configparser
+
+
+def update_config(file_name, section, key, value):
+    """
+    Updates the config file identified by file_name. Sets the value of one
+    key/value pair in a specific section.
+    """
+    config = configparser.ConfigParser()
+    config.read(file_name)
+    config.set(section, key, value)
+    with open(file_name, "w") as file:
+        config.write(file)
+
+
+if __name__ == '__main__':
+    update_config()
diff --git a/docs/BT-PP_DTD_kommentiert_20150519.pdf b/docs/BT-PP_DTD_kommentiert_20150519.pdf
new file mode 100755
index 0000000..0d3794e
Binary files /dev/null and b/docs/BT-PP_DTD_kommentiert_20150519.pdf differ
diff --git a/docs/metadaten.md b/docs/metadaten.md
new file mode 100755
index 0000000..c55d0fd
--- /dev/null
+++ b/docs/metadaten.md
@@ -0,0 +1,5 @@
+# Metadata
+
+Source of the structure definition: https://www.bundestag.de/blob/577234/f9159cee3e045cbc37dcd6de6322fcdd/dbtplenarprotokoll_kommentiert-data.pdf
+Downloaded on: 2018-11-06
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100755
index 0000000..7e98162
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+# Bundesdata
+lxml==4.2.5
+Babel==2.6.0
+tqdm==4.28.1
+spacy==2.0.18
+https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.0.0/de_core_news_sm-2.0.0.tar.gz
+scikit-learn[alldeps]==0.20.2