Initial commit

This commit is contained in: commit 4263e5f41e

2 .gitignore vendored Executable file
@@ -0,0 +1,2 @@
data/*
.idea/*

72 README.md Executable file
@@ -0,0 +1,72 @@
# Master_thesis

Master Thesis Repository.

## Required packages and languages

- Python 3.7+
- Python packages are installed via requirements.txt. See installation step 7.

## Installation

1. Make sure the package `python3.7-dev` is installed. If not: `sudo apt-get install python3.7-dev`
2. Install _virtualenv_ with `pip install virtualenv`, or with the package manager of your distribution.
3. Install JS Beautifier system-wide with `sudo npm -g install js-beautify`. (Optional: the markup step that uses this package can be skipped, but without it there are no nicely formatted XML files.)
4. Create a virtual environment for the project with `virtualenv --python=python3.7 path/to/folder`.
5. Activate the virtual environment with `source path/to/folder/bin/activate`.
6. `cd path/to/repository`
7. Install the dependencies with `pip install -r requirements.txt`.
## Example script invocations

### @Home

- `source ~/VirtualEnvs/bundesdata/bin/activate`
- `cd ~/Documents/Eigene\ geschriebene\ Programme/master_thesis/bundesdata/`

#### Development data

**Metadata**

- `python markup/metastructure.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data/development_data_xml -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data`

**Speakers**

- `python markup/speakers.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data/xml_new_metadata_structure -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/working_data`

#### Full data

**Metadata**

- `python markup/metastructure.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/protocols_raw_xml -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data`

**Speakers**

- `python markup/speakers.py -p /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data/xml_new_metadata_structure -f *.xml -o /home/stephan/Documents/Eigene\ geschriebene\ Programme/master_thesis/data`

### @Uni

#### Development data

- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
- `cd /home/stephan/Repos/master_thesis/bundesdata`

**Speakers**

- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/working_data/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data`

**Metadata**

- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/working_data/development_data_xml -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data`

#### Test data

- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
- `cd /home/stephan/Repos/master_thesis/bundesdata`

**Speakers**

- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/working_data/test/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data/test`

**Metadata**

- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/working_data/test_data_xml -f *.xml -o /home/stephan/Repos/master_thesis/data/working_data/test`

#### Full data

- `source /home/stephan/VirtualEnvs/bundesdata/bin/activate`
- `cd /home/stephan/Repos/master_thesis/bundesdata`

**Speakers**

- `python markup/speakers.py -p /home/stephan/Repos/master_thesis/data/xml_new_metadata_structure -f *.xml -o /home/stephan/Repos/master_thesis/data`

**Metadata**

- `python markup/metastructure.py -p /home/stephan/Repos/master_thesis/data/protocols_raw_xml -f *.xml -o /home/stephan/Repos/master_thesis/data`

0 bundesdata_markup_nlp/__init__.py Executable file

214 bundesdata_markup_nlp/bundesdata_markup.py Executable file
@@ -0,0 +1,214 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from markup import metadata, speakers, speaker_names, speeches
from utility import update_config
from markup import beautify_markup
from utility import delete_folder
import argparse
import time
import configparser
from datetime import datetime
import logging
import os

"""
This is the main script handling the automatic markup of the protocols. Needs
some user input, specified in parse_arguments().
"""

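# Illustrative invocations (a sketch; the paths are placeholders, not part of
# this repository):
#
#   python bundesdata_markup.py -sp /path/to/input_xmls /path/to/output
#   python bundesdata_markup.py -kt -la         # keep tmp files, log everything
#   python bundesdata_markup.py -kt -sm -ss     # rerun, skipping finished steps
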
def parse_arguments():
    """
    Argument parser.
    """
    parser = argparse.ArgumentParser(description="Starts the markup process of \
                                     the XML protocols. Uses either the input \
                                     and output paths currently specified in \
                                     the config file or the paths set when \
                                     calling the script from the terminal with \
                                     the flag argument '-sp' or '--set_paths'. \
                                     Using this parameter writes the given \
                                     paths into the config file. \
                                     Some steps of the markup process can be \
                                     skipped with the corresponding parameters \
                                     if they have already been executed once \
                                     while using the -kt option.")
    parser.add_argument("-sp",
                        "--set_paths",
                        nargs=2,
                        help="User can set the input and output paths for the \
                        files created during the markup. The paths will be \
                        written to the config file.",
                        required=False,
                        type=str,
                        metavar=("input_path", "output_path"))
    parser.add_argument("-sm",
                        "--skip_metadata",
                        help="Skips the script creating metadata and the \
                        first XML structure.",
                        action="store_true",
                        required=False)
    parser.add_argument("-ss",
                        "--skip_simple_speakers",
                        help="Skips the script creating the first simple \
                        speaker markup.",
                        action="store_true",
                        required=False)
    parser.add_argument("-sn",
                        "--skip_name_markup",
                        help="Skips the script creating the name markup.",
                        action="store_true",
                        required=False)
    parser.add_argument("-ssp",
                        "--skip_speeches",
                        help="Skips the script creating markup inside of \
                        speeches.",
                        action="store_true",
                        required=False)
    parser.add_argument("-sb",
                        "--skip_beautify_xml",
                        help="Skips the script creating beautiful XML files.",
                        action="store_true",
                        required=False)
    parser.add_argument("-kt",
                        "--keep_tmp_files",
                        help="Keeps all temporary XML files created during \
                        the entire markup process. Using this flag is needed \
                        when skipping steps of the markup during a rerun of \
                        the script. If this is not set, temporary files will \
                        always be deleted.",
                        action="store_true",
                        required=False)
    parser.add_argument("-fr",
                        "--fresh_run",
                        help="Deletes all temporary folders in the output \
                        folder and also deletes all paths saved in the config \
                        file before starting the markup process. The user has \
                        to set the paths again with -sp.",
                        action="store_true",
                        required=False)
    parser.add_argument("-la",
                        "--log_all",
                        help="If set, the program will log all information \
                        about the markup process (statistics etc.). Otherwise \
                        it only logs errors and warnings.",
                        action="store_true",
                        required=False)
    args = parser.parse_args()
    return args


def main():
    """
    Main function calling all other scripts for the automatic markup of the
    protocols.
    """
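    # The stages below run in order: metadata head, simple speaker markup,
    # complex name markup, markup inside speeches, XML beautification. Each
    # stage can be skipped with its corresponding flag.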
    args = parse_arguments()
    if(args.log_all is True):
        level = logging.INFO
    elif(args.log_all is False):
        level = logging.WARNING
    logging.basicConfig(filename="logs/bundesdata.log", level=level,
                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
                        datefmt='%Y/%m/%d %H:%M:%S',
                        filemode="w")
    logger = logging.getLogger(__name__)
    start_time = datetime.now()
    print("Start time of script is:", start_time)
    print("Info and status about the markup process can be found in:",
          "logs/bundesdata.log")
    logger.info("Start time of script is: " + str(start_time))

    # Deletes the output folder and all folders inside it.
    # Also removes all path options from the section "File paths".
    if(args.fresh_run is True):
        config = configparser.ConfigParser()
        config.read("config.ini")
        options = config.items("File paths")
        for option in options:
            if(option[0] == "output_folder"):
                try:
                    delete_folder.delete_folder(option[1])
                except FileNotFoundError:
                    pass
            else:
                config.remove_option("File paths", option[0])
        with open("config.ini", 'w') as out:
            config.write(out)

    # Sets paths and creates the output folder.
    if(args.set_paths):
        input_path = args.set_paths[0]
        output_path = os.path.join(args.set_paths[1], "output")
        if not os.path.exists(output_path):
            os.mkdir(output_path)
        config = configparser.ConfigParser()
        config.read("config.ini")
        update_config.update_config("config.ini", "File paths",
                                    "input_folder_xmls", input_path)
        update_config.update_config("config.ini", "File paths",
                                    "output_folder", output_path)

    if(args.skip_metadata is not True):
        print("Starting metadata extraction and markup.")
        metadata.get_metadata()
        print("Metadata creation and content splits finished.")
    elif(args.skip_metadata is True):
        print("Skipping script metadata.py.")

    time.sleep(1)
    if(args.skip_simple_speakers is not True):
        print("Starting first simple speeches and speaker markup.")
        speakers.get_speakers()
        print("Finished simple markup.")
    elif(args.skip_simple_speakers is True):
        print("Skipping script speakers.py.")

    time.sleep(1)
    if(args.skip_name_markup is not True):
        print("Starting complex markup of speaker names.")
        speaker_names.get_names()
        print("Finished complex name markup. (names etc.)")
    elif(args.skip_name_markup is True):
        print("Skipping script speaker_names.py.")

    time.sleep(1)
    if(args.skip_speeches is not True):
        print("Starting markup of comments etc. in speeches.")
        speeches.markup_speeches()
        print("Finished markup of comments etc. in speeches.")
    elif(args.skip_speeches is True):
        print("Skipping script speeches.py.")

    time.sleep(1)
    if(args.skip_beautify_xml is not True):
        print("Starting to prettify the XMLs.")
        beautify_markup.beautify_xml("markup")
        print("Prettified the XMLs.")
    elif(args.skip_beautify_xml is True):
        print("Skipping script beautify_markup.py.")

    if(args.keep_tmp_files is not True):
        config = configparser.ConfigParser()
        config.read("config.ini")
        folder_paths = []
        folder_paths.append(config["File paths"]["new_metadata"])
        folder_paths.append(config["File paths"]["new_simple_markup"])
        folder_paths.append(config["File paths"]["complex_markup"])
        folder_paths.append(config["File paths"]["clear_speech_markup"])
        for folder_path in folder_paths:
            delete_folder.delete_folder(folder_path)

    end_time = datetime.now()
    print("End time of script is:", str(end_time))
    logger.info("End time of script is: " + str(end_time))
    duration = end_time - start_time
    print("Duration of script is:", duration)
    logger.info("Script duration is: " + str(duration))


if __name__ == '__main__':
    main()

178 bundesdata_markup_nlp/bundesdata_nlp.py Executable file
@@ -0,0 +1,178 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import configparser
import os
import logging
from utility.FileGetter import FileGetter
from utility import update_config
from utility import delete_folder
from markup import beautify_markup
from nlp import tokenize, lemmatization, n_grams
from datetime import datetime

"""
This script handles the tokenization, lemmatization and n-gram calculation of
the input protocols. Needs some user input, specified in parse_arguments().
"""

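# Illustrative invocations (a sketch; paths and the input type name are
# placeholders, not part of this repository):
#
#   python bundesdata_nlp.py -sp /path/to/input_xmls /path/to/output -tn
#   python bundesdata_nlp.py -lm -ns                # lemmatize without stop words
#   python bundesdata_nlp.py -cn year input_type    # n-grams grouped by year
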
def parse_arguments():
    """
    Argument parser.
    """
    parser = argparse.ArgumentParser(description="Starts the nlp analysis of \
                                     the newly created XML protocols.")
    parser.add_argument("-sp",
                        "--set_paths",
                        nargs=2,
                        help="User can set the input and output paths for the \
                        files created during the nlp process. The paths will \
                        be written to the config file.",
                        required=False,
                        type=str,
                        metavar=("input_path", "output_path"))
    parser.add_argument("-fr",
                        "--fresh_run",
                        help="Deletes all temporary folders and output \
                        folders created during a previous nlp run before \
                        this one starts.",
                        action="store_true",
                        required=False)
    parser.add_argument("-sb",
                        "--skip_beautify_xml",
                        help="Skips the script creating beautiful XML files.",
                        action="store_true",
                        required=False)
    parser.add_argument("-ns",
                        "--no_stop_words",
                        help="If this is used, the lemmatization or \
                        tokenization of the input protocols will exclude \
                        stop words.",
                        required=False,
                        action="store_true")
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument("-lm",
                       "--lemmatize",
                       help="Lemmatizes the XML protocols in the input \
                       directory and saves them into the output directory.",
                       action="store_true",
                       required=False)
    group.add_argument("-tn",
                       "--tokenize",
                       help="Tokenizes the XML protocols in the input \
                       directory and saves them into the output directory.",
                       action="store_true",
                       required=False)
    group.add_argument("-cn",
                       "--calculate_n_grams",
                       nargs=2,
                       help="Calculates n-grams for any tokenized or \
                       lemmatized XML protocol created by this script. \
                       feature_to_group_n_grams_by can be set to one of: \
                       'year', 'month_year', 'speaker' or 'speech'.",
                       required=False,
                       type=str,
                       metavar=("feature_to_group_n_grams_by", "input_type_name"))
    args = parser.parse_args()
    return args


def main():
    # logging and start time
    logging.basicConfig(filename="logs/bundesdata_nlp.log", level=logging.INFO,
                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
                        datefmt='%Y/%m/%d %H:%M:%S',
                        filemode="w")
    logger = logging.getLogger(__name__)
    start_time = datetime.now()
    print("Start time of script is:", start_time)
    print("Info and status about the nlp process can be found in:",
          "logs/bundesdata_nlp.log")
    logger.info("Start time of script is: " + str(start_time))
    # get arguments
    args = parse_arguments()
    # reads config
    config = configparser.ConfigParser()
    config.read("config.ini")
    # if fresh_run is true the directory nlp_output will be deleted
    if(args.fresh_run is True):
        config = configparser.ConfigParser()
        config.read("config.ini")
        options = config.items("File paths")
        for option in options:
            if(option[0] == "nlp_output"):
                try:
                    delete_folder.delete_folder(option[1])
                except FileNotFoundError:
                    pass
            else:
                config.remove_option("File paths", option[0])
        with open("config.ini", 'w') as out:
            config.write(out)

    # creates the output folder if it does not exist and writes its path to
    # the config
    if(args.set_paths):
        output_path = os.path.join(args.set_paths[1], "nlp_output")
        if not os.path.exists(output_path):
            os.mkdir(output_path)
        update_config.update_config("config.ini", "File paths",
                                    "nlp_output", output_path)
    else:
        output_path = config["File paths"]["nlp_output"]
        if not os.path.exists(output_path):
            os.mkdir(output_path)
        update_config.update_config("config.ini", "File paths",
                                    "nlp_output", output_path)
    # gets the file path list of input files and writes the input folder path
    # to the config
    if(args.set_paths):
        input_path = args.set_paths[0]
        update_config.update_config("config.ini", "File paths",
                                    "nlp_input", input_path)
    elif(args.calculate_n_grams):
        input_path = config["File paths"]["nlp_beuatiful_xml"]
    else:
        input_path = config["File paths"]["nlp_input"]
    files = FileGetter(input_path, "*.xml")
    files = files.get_files()
    # if statements deciding which script will be executed
    if(args.lemmatize is True and args.no_stop_words is True):
        print("Starting lemmatization excluding stop words.")
        lemmatization.lemmatization(files, True)
        print("Finished lemmatization excluding stop words.")
    elif(args.lemmatize is True and args.no_stop_words is False):
        print("Starting lemmatization including stop words.")
        lemmatization.lemmatization(files)
        print("Finished lemmatization including stop words.")

    if(args.tokenize is True and args.no_stop_words is True):
        print("Starting tokenization excluding stop words.")
        tokenize.tokenize(files, True)
        print("Finished tokenization excluding stop words.")
    elif(args.tokenize is True and args.no_stop_words is False):
        print("Starting tokenization including stop words.")
        tokenize.tokenize(files)
        print("Finished tokenization including stop words.")

    if(args.calculate_n_grams):
        print("Starting calculation of n-grams for input files.")
        n_grams.n_grams(files, args.calculate_n_grams[0], args.calculate_n_grams[1])
        print("Finished calculation of n-grams for input files.")

    if(args.skip_beautify_xml is not True and (args.lemmatize is True
                                               or args.tokenize is True)):
        print("Starting to prettify the XMLs.")
        beautify_markup.beautify_xml("nlp", True, 80)
        print("Prettified the XMLs.")
    elif(args.skip_beautify_xml is True):
        print("Skipping script beautify_markup.py.")

    end_time = datetime.now()
    print("End time of script is:", str(end_time))
    logger.info("End time of script is: " + str(end_time))
    duration = end_time - start_time
    print("Duration of script is:", duration)
    logger.info("Script duration is: " + str(duration))


if __name__ == '__main__':
    main()

47 bundesdata_markup_nlp/config.ini Executable file
@@ -0,0 +1,47 @@
[Regular expressions time extraction]
session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)
session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))

[Regular expressions splits]
session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)
attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))

[Regular expressions speakers]
speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt

[Additional name features]
academic_titles = Dr. Dr. h. c. ; Dr. h. c.
parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.

[Regular expressions speeches]
comments = \B\([^\(\)]*\)\B ; kommentar
date_string = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]*|[\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]* ; metadata

[Multiline entities]
multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar

[File paths]
nlp_output = /home/stephan/Desktop/tmp_test/nlp_output
nlp_input = /home/stephan/Desktop/tmp_test/nlp_output/nlp_beuatiful_xml/
nlp_lemmatized_tokenized = /home/stephan/Desktop/tmp_test/nlp_output/lemmatized
tmp_path = /home/stephan/Desktop/tmp_test/nlp_output/lemmatized/tmp
nlp_beuatiful_xml = /home/stephan/Desktop/tmp_test/nlp_output/nlp_beuatiful_xml
input_folder_xmls = /home/stephan/Desktop/tmp_test/protocols/
output_folder = /home/stephan/Desktop/tmp_test/output
new_metadata = /home/stephan/Desktop/tmp_test/output/new_metadata
new_simple_markup = /home/stephan/Desktop/tmp_test/output/simple_xml
complex_markup = /home/stephan/Desktop/tmp_test/output/complex_markup
clear_speech_markup = /home/stephan/Desktop/tmp_test/output/clear_speech_markup
beautiful_xml = /home/stephan/Desktop/tmp_test/output/beautiful_xml
fixed_markup = /home/stephan/Repos/master_thesis/data/working_data/id_fixed/fixed_markup

46 bundesdata_markup_nlp/config_(backup).ini Executable file
@@ -0,0 +1,46 @@
[Regular expressions time extraction]
session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)
session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))

[Regular expressions splits]
session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)
attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))

[Regular expressions speakers]
speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt

[Additional name features]
academic_titles = Dr. Dr. h. c. ; Dr. h. c.
parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.

[Regular expressions speeches]
comments = \B\([^\(\)]*\)\B ; kommentar
date_string = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]*|[\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]* ; metadata

[Multiline entities]
multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar

[File paths]
nlp_output = /home/stephan/Desktop/nlp_output
nlp_input = /home/stephan/Repos/master_thesis_data/data/outputs/outputs_markup/development_data/beautiful_xml/
nlp_lemmatized_tokenized = /home/stephan/Desktop/nlp_output/lemmatized
tmp_path = /home/stephan/Repos/master_thesis/data/working_data/output/clear_speech_markup/tmp
nlp_beuatiful_xml = /home/stephan/Desktop/nlp_output/nlp_beuatiful_xml
input_folder_xmls = /home/stephan/Repos/master_thesis/data/working_data/sub_set/
output_folder = /home/stephan/Repos/master_thesis/data/working_data/output
new_metadata = /home/stephan/Repos/master_thesis/data/working_data/output/new_metadata
new_simple_markup = /home/stephan/Repos/master_thesis/data/working_data/output/simple_xml
complex_markup = /home/stephan/Repos/master_thesis/data/working_data/output/complex_markup
clear_speech_markup = /home/stephan/Repos/master_thesis/data/working_data/output/clear_speech_markup
beautiful_xml = /home/stephan/Repos/master_thesis/data/working_data/output/beautiful_xml

105 bundesdata_markup_nlp/config_readme.md Executable file
@@ -0,0 +1,105 @@
[Regular expressions time extraction]
# These regular expressions are used to extract the start and end time of one
# session. The regular expressions are fairly complex because they have to
# catch a lot of human errors. To catch those errors, the expression is
# repeatedly "chained" with the or operator, with only minor differences
# between the alternatives. This is the easiest way to catch as many times as
# possible. The expressions match the partial strings where the start or end
# time is mentioned. The hours and minutes are then extracted from different
# match groups.

# START TIME: Matches the start time.
session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)

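# For illustration (an assumed sample sentence, not taken from the config): on
# "Die Sitzung wird um 9 Uhr eröffnet." the expression's last alternative
# matches and its group captures the hour "9"; the minute groups stay empty.
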
# END TIME: Matches the end time.
session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))


[Regular expressions splits]
# These expressions are used for splitting the protocols at the matched
# location.
# All match groups are non-capturing except the group catching the entire
# regex, so it can be inserted again later on. This is the main difference to
# the time extractions.
# These splits are needed to automatically separate the actual session content
# from the table of contents and the attachments.

# Split at the first president occurrence.
session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)

# Split at the end time of the session.
attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))


[Regular expressions speakers]
# These are the regular expressions for matching the speakers in the protocols.
# They consist of tuples with three values.
# The first element of the tuple is the regex.
# The second element is a case that tells if this regex should be used as a
# first, middle, or last element/match during the markup process.
# The third element describes, in German, the type of speech the speaker is
# holding, to use it as an attribute later on.
# The value tuple is divided with " ; " to convert it into a list later on,
# similar to csv syntax. If needed, the user can add more key/value pairs
# following the same pattern to automatically identify even more speaker
# roles.

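# For illustration (not part of the config): in Python such a value is read
# roughly as regex, position, role = value.split(" ; "), which for the first
# entry below yields the position "first" and the role "Präsident".
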
speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt

[Additional name features]
# In this section the user can add additional strings which are not part of
# the Stammdatenbank but are used inside the protocols.
academic_titles = Dr. Dr. h. c. ; Dr. h. c.
parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.

[Regular expressions speeches]
# These regular expressions are used to mark up some entities inside of the
# actual speeches.
# The value of any given key is a tuple with two values split by " ; " like in
# the section [Regular expressions speakers]. The first value is the regex and
# the second value is the tagname written as a string. This list of key/value
# pairs can also be extended by the user to identify even more entities inside
# of the speeches. Just add key/value pairs following the same pattern.
# These expressions are only used to identify entities which are present in
# one <p> without linebreaks.

comments = \B\([^\(\)]*\)\B ; kommentar
date_string_with_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]* ; metadata
date_string_without_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezmber) \d{4}[\d\t ]* ; metadata

[Multiline entities]
# These regular expressions are used to identify entities in speeches which
# span over multiple <p> elements. The value of any given key is a tuple with
# three values split by " ; " like in the section [Regular expressions
# speakers]. The first value is a regex describing what the start of the
# supposed multiline entity looks like. The second value is a regex describing
# what its end looks like. The third value is the tagname written as a normal
# string.
multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar

[File paths]
# This is where the paths for input and output folders are set. The input
# folder path should contain the XML protocols that will be processed.
# The output folder path specifies the place where all the intermediate files
# and the final new XML protocols with the new automatically created markup
# will be saved.

input_folder_xmls = /home/stephan/Repos/master_thesis/data/working_data/development_data_xml
output_folder = /home/stephan/Repos/master_thesis/data/working_data/

# These paths will be set while running the program.
nlp_output = /home/stephan/Desktop/nlp_output
nlp_input = /home/stephan/Desktop/protocols/
nlp_lemmatized_tokenized = /home/stephan/Desktop/nlp_output/lemmatized
tmp_path = /home/stephan/Desktop/nlp_output/lemmatized/tmp
nlp_beuatiful_xml = /home/stephan/Desktop/nlp_output/nlp_beuatiful_xml
input_folder_xmls = /home/stephan/Repos/master_thesis_data/inputs/excluded_periods/
output_folder = /home/stephan/Desktop/output
new_metadata = /home/stephan/Desktop/output/new_metadata
new_simple_markup = /home/stephan/Desktop/output/simple_xml
complex_markup = /home/stephan/Desktop/output/complex_markup
clear_speech_markup = /home/stephan/Desktop/output/clear_speech_markup
beautiful_xml = /home/stephan/Desktop/output/beautiful_xml
fixed_markup = /home/stephan/Repos/master_thesis/data/working_data/id_fixed/fixed_markup

225 bundesdata_markup_nlp/markup/EntityMarkup.py Executable file
@@ -0,0 +1,225 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from markup.MetadataMarkup import MetadataMarkup
from lxml import etree
from xml.etree import ElementTree
from xml.sax.saxutils import escape
import logging
import os
import re


class EntityMarkup(MetadataMarkup):
    """Class for getting an XML node in which entities will be marked.
    In practice this class and its methods can be used to get the text of a
    given node and mark every speaker in this text string.
    Also passes methods and fields to the more specific
    SimpleSpeakersMarkup."""

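    # Typical flow, as a sketch (not enforced anywhere in this class):
    # get_element_text() collects the node text, inject_element() wraps a
    # regex match in a new tag, and replace_string() writes the rebuilt
    # string back into the tree.
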
    def __init__(self, file_path, element_name=".//sitzungsverlauf"):
        super().__init__()
        self.file_path = file_path
        self.element_name = element_name
        self.xml_tree = None
        self.current_string = str()
        self.filename = os.path.basename(file_path)
        self.logger = logging.getLogger(__name__)

    def get_element_text(self):
        """
        Gets the strings of all elements matched by an element XPath. The
        element name will be passed when the class is instanced. Distinguishes
        between one string or several strings.
        """
        self.all_elements = self.xml_tree.iterfind(self.element_name)
        len_all_elements = len(list(self.all_elements))
        self.current_strings = []
        if(len_all_elements == 1):
            self.all_elements = self.xml_tree.iterfind(self.element_name)
            self.current_string = escape(list(self.all_elements)[0].text)
            self.current_strings.append(self.current_string)
        elif(len_all_elements > 1):
            self.current_strings = []
            self.all_elements = self.xml_tree.iterfind(self.element_name)
            for element in self.all_elements:
                string = escape(element.text)
                self.current_strings.append(string)
            self.all_elements = self.xml_tree.iterfind(self.element_name)

    def replace_string(self, replacement_string, element_name):
        """
        This function takes the newly manipulated XML string and overwrites
        the old string with it.
        """
        replacement_string = (
            "<" + element_name + ">"
            + replacement_string
            + "</" + element_name + ">"
        )
        for element in self.xml_tree.xpath("//%s" % element_name):
            element.getparent().remove(element)
        replacement_element = etree.fromstring(replacement_string)
        self.xml_tree.insert(1, replacement_element)

    def simple_check_xml(self, xml_string, file_name, save_valid, node=True):
        """
        Checks if a given XML element is well-formed XML. If it is checking a
        partial string, it adds a root element. If node is False, it is
        checking a document as a string.
        """
        try:
            if(node is True):
                folder_path = "logs/well-formed_strings/"
                file_path = os.path.join(folder_path, os.path.basename(file_name))
                xml_string = "<root>" + xml_string + "</root>"
                tree = etree.fromstring(xml_string)
                self.logger.info(("The node string is well-formed. Simple markup is"
                                  " correct. Node string can be found in "
                                  + folder_path))
                self.logger.info(tree)
                if(save_valid is True):
                    self.logger.info("Node string can be found in " + folder_path)
                    if not os.path.exists(folder_path):
                        os.mkdir(folder_path)
                    with open(file_path, "w") as text_file:
                        text_file.write(xml_string)
            else:
                folder_path = "logs/well-formed_files/"
                file_path = os.path.join(folder_path, os.path.basename(file_name))
                tree = etree.fromstring(xml_string)
                self.logger.info("The XML file is well-formed.")
                self.logger.info(tree)
                if(save_valid is True):
                    self.logger.info("File can be found in " + folder_path)
                    if not os.path.exists(folder_path):
                        os.mkdir(folder_path)
                    with open(file_path, "w") as text_file:
                        text_file.write(xml_string.decode("utf-8"))
        except Exception as e:
            if(node is True):
                folder_path = "logs/not_well-formed_strings/"
                file_path = os.path.join(folder_path, os.path.basename(file_name))
                if not os.path.exists(folder_path):
                    os.mkdir(folder_path)
                with open(file_path, "w") as text_file:
                    text_file.write(xml_string)
                self.logger.error(("XML node string is not well-formed. XML can be"
                                   " found in " + folder_path))
                self.logger.error(e)
            else:
                folder_path = "logs/not_well-formed_files/"
                file_path = os.path.join(folder_path, os.path.basename(file_name))
                if not os.path.exists(folder_path):
                    os.mkdir(folder_path)
                with open(file_path, "w") as text_file:
                    text_file.write(xml_string.decode("utf-8"))
                self.logger.error(("XML file is not well-formed. XML can be"
                                   " found in " + folder_path))
                self.logger.error(e)
            return False

    def inject_element(self, current_element, regex, tagname,
                       strip_newlines=False):
        """
        Injects new XML elements into the selected element text. The new
        element will be created by using a regular expression which matches a
        partial string in the current_element text string. The match will be
        the new_element text string. The tagname sets the tagname of the
        new_element. Optionally, attributes can be set as well.
        """
        element_string = ElementTree.tostring(current_element, encoding="unicode", method="xml")
        match = re.search(regex, element_string)
        if(match):
            index_shift = 0
            if(strip_newlines is True):
                counter = match.group().count("\n")
                match_str = re.sub(r"\n", "", match.group())
            else:
                counter = 0
                match_str = match.group()
            index_start = match.start() + index_shift - counter
            index_end = match.end() + index_shift - counter
            new_element = etree.Element(tagname)
            new_element.text = match_str
            new_element_str = ElementTree.tostring(new_element, encoding="unicode", method="xml")
            element_string = (element_string[:index_start]
                              + new_element_str
                              + element_string[index_end:])
            index_shift += len(new_element_str) - len(match_str)
            replacement_element = etree.fromstring(element_string.encode("utf8"))
            current_element.getparent().replace(current_element, replacement_element)

    def markup_speech_lines(self, current_element):
        """
        Inserts markup in every speech that marks every line <p> with the
        attribute klasse="J". J is set for every line, even if it is O. In the
        early protocols (period 1 to 10) one line is most of the time a
        sentence. In the later periods one line is capped at around 80
        characters.
        """
        lines = current_element.xpath("text()")
        if(len(lines) > 0):
            lines = lines[0].splitlines()
            current_element.xpath(".//redner")[0].tail = ""
            for line in lines:
                part_element = etree.Element("p")
                part_element.set("klasse", "J")
                part_element.text = line
                current_element.append(part_element)

    def get_multiline_entities(self, elements, start_of_str, end_of_str,
                               tagname):
        """
        This function identifies multiline entities (i.e. Kommentare/comments)
        which are split over multiple elements that have been marked with the
        markup_speech_lines() function.
        Gets the text of those and joins them together into one string. The
        first element's text will be set to the newly created string,
        surrounded by new XML tags with the tagname set to the input tagname.
        All other elements with the rest of the string will be deleted.
        start_of_str should be a regex that describes the pattern of how the
        start of the supposed multiline entity looks. end_of_str describes the
        pattern of how the end of the supposed multiline entity looks.
        """
        self.multiline_text = []
        self.multiline_elements = []
        start_found = False
        end_found = False
        for element in elements:
            if(start_found is False and end_found is False
               and element.text is not None):
                start_match = re.search(start_of_str, element.text)
                if(start_match is not None):
                    self.multiline_text.append(start_match.group())
                    self.multiline_elements.append(element)
                    start_found = True
                    continue
            elif(start_found is True and end_found is False
                 and element.text is not None):
                end_match = re.search(end_of_str, element.text)
                if(end_match):
                    self.multiline_text.append(end_match.group())
                    self.multiline_elements.append(element)
                    end_found = True
                    continue
                else:
                    self.multiline_text.append(element.text)
                    self.multiline_elements.append(element)
                    continue
            elif(start_found is True and end_found is True):
                new_element_text = re.sub(r"- ", "", " ".join(self.multiline_text))  # joins the string parts and also removes hyphenation
                part_element = etree.Element("p")
                part_element.set("klasse", "J")
                comment_element = etree.Element(tagname)
                comment_element.text = new_element_text
                part_element.append(comment_element)
                self.multiline_elements[0].getparent().replace(self.multiline_elements[0], part_element)
                for element in self.multiline_elements[1:]:
                    element.getparent().remove(element)
                start_found = False
                end_found = False
                self.multiline_text = []
                self.multiline_elements = []
                continue

22 bundesdata_markup_nlp/markup/MdBData.py Executable file
@@ -0,0 +1,22 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.XMLProtocol import XMLProtocol
import logging


class MdBData(XMLProtocol):
    """Class to handle operations on the Stammdatenbank."""

    def __init__(self):
        super(XMLProtocol, self).__init__()
        self.logger = logging.getLogger(__name__)

    def get_set(self, element_path, element_tree):
        """
        Creates sets from the input path on element_tree.
        """
        tmp_list = [element.text for element in
                    element_tree.iterfind(element_path) if element is not None]
        set_of_elements = set(tmp_list)
        return set_of_elements
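
# Illustrative use (the XPath and the variable names are assumed examples,
# not taken from this repository):
#   parties = MdBData().get_set(".//PARTEI_KURZ", stammdaten_tree)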

267 bundesdata_markup_nlp/markup/MetadataMarkup.py Executable file
@@ -0,0 +1,267 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.XMLProtocol import XMLProtocol
from utility import update_config
from lxml import etree
from datetime import datetime
from babel.dates import format_date
import os
import re
import logging
import configparser


class MetadataMarkup(XMLProtocol):
    """
    This class is for opening one XML protocol, extracting the included
    metadata and creating a new valid metadata head.
    """

    def __init__(self):
        super().__init__()
        self.plenarprotokoll_string = str()  # will be extracted with extract_metadata()
        self.wahlperiode = int()  # will be extracted with extract_metadata()
        self.sitzungsnr = int()  # will be extracted with extract_metadata()
        self.herausgeber = "Deutscher Bundestag"  # always the same in every protocol
        self.berichtart = "Stenografischer Bericht"  # always the same in every protocol
        self.sitzungstitel_string = ". Sitzung"  # always the same in every protocol
        self.ort = "Berlin"  # always the same in every protocol
        self.datum_ger_non_iso = str()  # will be extracted with extract_metadata()
        self.datum_iso = str()  # ISO date will be built from self.datum_ger_non_iso
        self.datum_string = str()  # will be built from self.datum_iso
        self.attachment = str()  # will be extracted from a split. Will not work
        # all the time, but will not break the XML.
        self.logger = logging.getLogger(__name__)

    def extract_metadata(self, etree_element_object):
        """
        Extracts metadata from the given XML tags and writes them into the
        instance variables.
        """
        root = etree_element_object
        metadata_list = []
        for element in root.iter():
            if(element.tag != "TEXT"):
                metadata_list.append(element.text)
        metadata_list = metadata_list[1:]
        self.wahlperiode = metadata_list[0]
        self.plenarprotokoll_string = metadata_list[1].lower().title()
        self.sitzungsnr = metadata_list[2].split("/")[1]
        self.datum_ger_non_iso = metadata_list[3]
        self.logger.info("Metadata successfully extracted.")
        self.logger.info("Wahlperiode is: " + self.wahlperiode)
        self.logger.info("Plenarprotokoll is: " + self.plenarprotokoll_string)
        self.logger.info("Sitzungsnummer is: " + self.sitzungsnr)
        self.logger.info("German non-ISO date is: " + self.datum_ger_non_iso)

    def built_iso_date(self, ger_date):
        """
        Gets the German date and converts it to an ISO standard date.
        """
        self.datum_iso = datetime.strptime(ger_date, "%d.%m.%Y").date()
        self.logger.info("ISO date created: " + str(self.datum_iso))

    def built_date_string(self, iso_date):
        """
        Gets the ISO date and creates a full German date string from it.
        """
        date_string = format_date(iso_date, format="full", locale="de_DE")
        date_string = re.sub(r",", ", den", date_string)
        self.datum_string = date_string
        self.logger.info("Date string created: " + self.datum_string)

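    # For illustration (an assumed example date): built_iso_date("01.02.1990")
    # sets datum_iso to 1990-02-01, and built_date_string(self.datum_iso) then
    # yields "Donnerstag, den 1. Februar 1990".
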
def delete_old_metadata(self, etree_element_object):
|
||||
"""
|
||||
Deletes old metadata tags and text. Renames root tag.
|
||||
"""
|
||||
for element in etree_element_object.iter():
|
||||
if(element.tag != "TEXT" and element.tag != "DOKUMENT"):
|
||||
element.getparent().remove(element)
|
||||
elif(element.tag == "DOKUMENT"):
|
||||
element.tag = "dbtplenarprotokoll"
|
||||
elif(element.tag == "TEXT"):
|
||||
self.full_content = element.text
|
||||
element.getparent().remove(element)
|
||||
self.logger.info("Old metadata deleted.")
|
||||
|
||||
    def insert_new_metadata(self, etree_element_object):
        """
        Inserts the extracted metadata and the split content into newly
        created, valid XML tags according to the official schema.
        """
        vorspann_element = etree.Element("vorspann")
        xml_string = """
        <kopfdaten>
        <plenarprotokoll-nummer>{} <wahlperiode>{}</wahlperiode>/<sitzungsnr>{}</sitzungsnr>
        (neu)</plenarprotokoll-nummer>
        <herausgeber>{}</herausgeber>
        <berichtart>{}</berichtart>
        <sitzungstitel><sitzungsnr>{}</sitzungsnr>. Sitzung</sitzungstitel>
        <veranstaltungsdaten><ort>{}</ort>, <datum date="{}">{}</datum></veranstaltungsdaten>
        </kopfdaten>"""\
            .format(self.plenarprotokoll_string, self.wahlperiode,
                    self.sitzungsnr, self.herausgeber, self.berichtart,
                    self.sitzungsnr, self.ort, self.datum_ger_non_iso,
                    self.datum_string)
        etree_from_str = etree.fromstring(xml_string)
        etree_element_object.insert(0, vorspann_element)
        vorspann_element.append(etree_from_str)
        toc_element = etree.Element("inhaltsverzeichnis")
        toc_element.text = self.toc
        vorspann_element.append(toc_element)
        content_element = etree.Element("sitzungsverlauf")
        content_element.text = self.president + self.content
        etree_element_object.insert(2, content_element)
        anlagen_element = etree.Element("anlagen")
        anlagen_element.text = self.attachment
        etree_element_object.insert(3, anlagen_element)
        rednerliste_element = etree.Element("rednerliste",
                                            sitzungsdatum=self.datum_ger_non_iso)
        etree_element_object.insert(4, rednerliste_element)
        self.xml_tree = etree_element_object
        self.logger.info("New metadata XML-head inserted." + xml_string)

    def split_content(self, etree_element_object):
        """Splits the full content into the table of contents, the speeches
        and, in some cases, attachments."""
        config = configparser.ConfigParser()
        config.read("config.ini")

        session_start_split = config["Regular expressions splits"]["session_start_president_split"]
        regex_start = re.compile(session_start_split)
        tmp_list = regex_start.split(self.full_content, maxsplit=1)
        self.toc = tmp_list[0]
        self.president = tmp_list[1]
        self.content = tmp_list[2]

        attachment_split = config["Regular expressions splits"]["attachment_split"]
        regex_att = re.compile(attachment_split)
        tmp_list = regex_att.split(self.content)
        tmp_list = [element for element in tmp_list if element is not None]
        if(tmp_list[-1] == ""):  # if the split does not match anything the last item is an empty string
            self.content = "".join(tmp_list[0:-1])
            self.attachment = "Keine Anlage extrahiert."
            self.logger.warning("There is no attachment.")
        else:
            self.content = "".join(tmp_list[0:-1])
            self.attachment = tmp_list[-1]
            self.logger.info("Attachment found.")
        self.logger.info("Content split at: " + str(regex_start))
        self.logger.info("Content split at: " + str(regex_att))

    def get_session_times(self):
        """Looks through the entire protocol content to extract the starting
        time and the last closing time. If only one of them or none is found,
        the missing time is set to xx:xx."""
        config = configparser.ConfigParser()
        config.read("config.ini")
        regex_conf_values = config.items("Regular expressions time extraction")
        regex_conf_values = [regex[1] for regex in regex_conf_values]
        tmp_list = []
        identifier = 0
        start_time_found = True
        end_time_found = True

        for regex in (regex_conf_values):
            identifier += 1
            regex = re.compile(regex)
            if(identifier == 1):
                # Start time regex; if it matches more than once the last hit is used.
                matches = list(regex.finditer(self.full_content))
                if(len(matches) > 1):
                    match = matches[-1]
                elif(len(matches) == 0):
                    match = None
                else:
                    match = matches[0]
            elif(identifier == 2):
                # Closing time regex; always takes the last closing time.
                matches = list(regex.finditer(self.full_content))
                if(len(matches) > 1):
                    match = matches[-1]
                elif(len(matches) == 0):
                    match = None
                else:
                    match = matches[0]

            if(match is None and identifier == 1):
                self.logger.warning("No start time found for " + str(regex))
                start_time_found = False
            elif(match is None and identifier == 2):
                self.logger.warning("No end time found for " + str(regex))
                end_time_found = False
            elif(match):
                session_time = [group for group in match.groups()
                                if group is not None]
                session_time = ["0" + group if len(group) == 1 else group for
                                group in session_time]  # pads one-digit values with a leading 0
                if(len(session_time) == 2):
                    tmp_list.append(":".join(session_time))
                elif(len(session_time) == 1):
                    tmp_list.append(session_time[0] + ":00")

        if(len(tmp_list) == 2):
            self.session_start_time = tmp_list[0]
            self.session_end_time = tmp_list[1]
            self.logger.info("Start time found: " + self.session_start_time)
            self.logger.info("End time found: " + self.session_end_time)
            self.logger.info("Successfully matched start and end times.")
        elif(len(tmp_list) == 1 and start_time_found is True and end_time_found
             is False):
            self.session_start_time = tmp_list[0]
            self.session_end_time = "xx:xx"
            self.logger.warning("Only start time found: "
                                + self.session_start_time)
            self.logger.warning("End time set to: "
                                + self.session_end_time)
        elif(len(tmp_list) == 1 and start_time_found is False and end_time_found
             is True):
            self.session_end_time = tmp_list[0]
            self.session_start_time = "xx:xx"
            self.logger.warning("Only end time found: "
                                + self.session_end_time)
            self.logger.warning("Start time set to: "
                                + self.session_start_time)
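
    # Editor's sketch: the normalisation above turns a match with the groups
    # ("9", "5") into "09:05" and a single-group match ("14",) into "14:00";
    # the protocol root later carries these values as attributes, e.g.
    # sitzung-start-uhrzeit="09:00" and sitzung-ende-uhrzeit="17:32".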

    def write_to_attr(self, element, attr_key, attr_value):
        """
        Writes two strings as an attribute key-value pair to a given
        element.
        """
        elements = self.xml_tree.findall(element)
        if(elements == []):
            element = self.tree.getroot()
            elements.append(element)
        for element in elements:
            element.set(attr_key, attr_value)
        self.logger.info("Wrote attribute "
                         + attr_key
                         + "="
                         + "\""
                         + attr_value
                         + "\"")

    def save_to_file(self, output_path, file_path, subfolder, config_section,
                     config_key):
        """
        Writes the new markup to a new XML file. Takes the output path and
        creates a new folder there. Also updates the config file with the new
        path.
        """
        self.filename = os.path.basename(file_path)
        save_path = os.path.join(output_path, subfolder)
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        tree = etree.ElementTree(self.xml_tree)
        new_filename = self.filename
        save_file_path = os.path.join(save_path, new_filename)
        tree.write(save_file_path,
                   pretty_print=True,
                   xml_declaration=True,
                   encoding="utf8",
                   doctype="<!DOCTYPE dbtplenarprotokoll SYSTEM 'dbtplenarprotokoll_minimal.dtd'>")
        self.logger.info("New XML saved to: " + save_file_path)
        update_config.update_config("config.ini", config_section, config_key,
                                    save_path)
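
Two details of `MetadataMarkup` are easy to miss, so here is a minimal sketch of both, assuming `format_date` is imported from `babel.dates` (which the call signature above implies) and using a stand-in pattern, since the real split expressions live in `config.ini`: the German-to-ISO date round trip, and why `split_content` can unpack three parts from a single `maxsplit=1` split (`re.split` also returns the separator when the pattern contains a capturing group).

```python
import re
from datetime import datetime
from babel.dates import format_date  # assumed import; matches the calls above

# German date -> ISO date -> full German date string, as in
# built_iso_date and built_date_string:
iso = datetime.strptime("14.03.2018", "%d.%m.%Y").date()
print(iso)                       # 2018-03-14
s = format_date(iso, format="full", locale="de_DE")
print(s)                         # Mittwoch, 14. März 2018
print(re.sub(r",", ", den", s))  # Mittwoch, den 14. März 2018

# A capturing group makes re.split keep the separator, which is how
# split_content unpacks toc, president and content from one split:
parts = re.split(r"(Präsident(?:in)? .+?:)",
                 "Inhalt ... Präsident Dr. Norbert Lammert: Die Sitzung ist eröffnet.",
                 maxsplit=1)
print(parts)
# ['Inhalt ... ', 'Präsident Dr. Norbert Lammert:', ' Die Sitzung ist eröffnet.']
```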
161
bundesdata_markup_nlp/markup/SpeakerMarkup.py
Executable file
@ -0,0 +1,161 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from markup.EntityMarkup import EntityMarkup
import re
import logging


class SpeakerMarkup(EntityMarkup):
    """
    Class for the specific markup of different speakers, identified by
    different regular expressions from the config file.
    """

    def __init__(self, string, regex):
        super(SpeakerMarkup).__init__()
        self.string_to_search = string
        self.regex_string = regex
        self.logger = logging.getLogger(__name__)

    def identify_speaker(self):
        """
        Gets the match objects for the speakers in the given text node,
        counts them and puts the matches into a list.
        """
        self.matches = re.finditer(self.regex_compiled, self.string_to_search)
        tmp_list = []
        for match in self.matches:
            tmp_list.append(match)
        self.matches_count = len(tmp_list)
        self.matches = tmp_list

    def markup_speaker(self, case="middle"):
        """
        This is where the first simple markup happens. It takes the matches
        and wraps them in simple markup for further processing. The 'first'
        case uses re.sub; the other two cases work on the string itself.
        """

        def markup_logging():
            """Helper function for creating log file output."""
            if(self.matches_count == 0):
                self.logger.warning("0 matches for given expression: "
                                    + self.regex_string)
            elif(self.matches_count == 1):
                self.logger.info(str(self.matches_count)
                                 + " match for given expression: "
                                 + self.regex_string)
            elif(self.matches_count > 1):
                self.logger.info(str(self.matches_count)
                                 + " matches for given expression: "
                                 + self.regex_string)

        if(case == "first"):
            # Uses re.sub because it only handles one match.
            start_tags = "<rede><redner>"
            end_tags = "</redner>"
            self.matches_count = 1  # only the first match is marked up
            markup_logging()
            first_match = self.matches[0]
            start_xml = start_tags + first_match.group() + end_tags
            if(len(first_match.group().split()) <= 10):
                self.string_to_search = self.regex_compiled.sub(start_xml,
                                                                self.string_to_search,
                                                                count=1)
            self.markuped_string = self.string_to_search

        elif(case == "middle"):
            """
            Does not use re.sub because it is faster to work on the string.
            It also avoids looping twice to get the specific match.group(),
            which caused some errors.
            """
            index_shift = 0
            start_tags = "\n</rede><rede><redner>"
            end_tags = "</redner>"
            markup_logging()
            for match in self.matches:
                index_start = match.start() + index_shift
                index_end = match.end() + index_shift
                whole_match_len = len(match.group())
                # Handles cases where lots of text before the actual speaker is matched.
                linebrks_in_match = len(match.group().split("\n"))
                if(linebrks_in_match >= 2):
                    last_part_match = "".join(match.group().split("\n")[1:])
                    first_line_of_match = match.group().split("\n")[0]
                    if(len(first_line_of_match.split()) <= 10):
                        match = first_line_of_match + last_part_match
                    else:
                        match = last_part_match

                    delta_start_index = whole_match_len - len(match)
                    index_start = index_start + delta_start_index

                    self.string_to_search = (self.string_to_search[:index_start]
                                             + start_tags
                                             + match
                                             + end_tags
                                             + self.string_to_search[index_end:]
                                             )
                    index_shift += len(start_tags) + len(end_tags)

                else:
                    self.string_to_search = (self.string_to_search[:index_start]
                                             + start_tags
                                             + match.group()
                                             + end_tags
                                             + self.string_to_search[index_end:]
                                             )
                    index_shift += len(start_tags) + len(end_tags)

            self.markuped_string = self.string_to_search

        elif(case == "last"):
            """
            Matches the end of the session to add the last closing </rede> tag
            to the last speech for well-formed XML. Uses re.sub because it is
            only one operation.
            """
            index_shift = 0
            end_tag = "</rede>"
            session_close_time_tag = ('<sitzungsende/>')
            # The created end tags will be inserted into the protocol.
            if(len(self.matches) == 1):
                self.logger.info("Last speech successfully tagged.")
                markup_logging()
                for match in self.matches:
                    end_xml = end_tag + match.group() + session_close_time_tag
                    if(len(match.group().split()) <= 15):
                        self.string_to_search = self.regex_compiled.sub(end_xml,
                                                                        self.string_to_search,
                                                                        count=1)
                    self.markuped_string = self.string_to_search

            elif(len(self.matches) == 0):
                self.logger.warning(("No end of session found! Last tag " + end_tag
                                     + " will be added to the end of the protocol."
                                     " This might add some unrelated text to the"
                                     " last speech."))
                markup_logging()
                self.markuped_string = self.string_to_search + end_tag

            else:
                markup_logging()
                self.logger.warning(("There are " + str(len(self.matches))
                                     + " session endings. Ignoring the endings"
                                     " before the last final ending of the"
                                     " session."))
                match = self.matches[-1]
                end_xml = end_tag + match.group() + session_close_time_tag
                whole_match_len = len(match.group())
                index_start = match.start() + index_shift
                index_end = match.end() + index_shift
                # Always takes the last line of a match, avoiding lots of text before the actual speaker.
                last_line = match.group().split("\n")[-1]
                delta_start_index = whole_match_len - len(last_line)
                index_start = index_start + delta_start_index
                self.string_to_search = (self.string_to_search[:index_start]
                                         + end_xml
                                         + self.string_to_search[index_end:])
                index_shift += len(end_tag)
                self.markuped_string = self.string_to_search
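
The index arithmetic in the 'middle' case is the core trick of this class, so here is a self-contained toy sketch of it (invented text and pattern; the real expressions come from `config.ini`). The matches are located in the untouched string first, and every insertion shifts all later indices by the length of the tags inserted so far:

```python
import re

text = "A (SPD): Guten Tag. B (CDU/CSU): Hallo."
start_tags, end_tags = "\n</rede><rede><redner>", "</redner>"
# Locate all matches in the untouched string first, as identify_speaker does.
matches = list(re.finditer(r"\b\w+ \([A-Z/]+\):", text))
index_shift = 0
for match in matches:
    start = match.start() + index_shift
    end = match.end() + index_shift
    text = text[:start] + start_tags + match.group() + end_tags + text[end:]
    index_shift += len(start_tags) + len(end_tags)
print(text)
# Both speaker lines are now wrapped in <rede><redner>...</redner> tags.
```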
554
bundesdata_markup_nlp/markup/SpeakerNameMarkup.py
Executable file
@ -0,0 +1,554 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from markup.SpeakerMarkup import SpeakerMarkup
from xml.etree import ElementTree
from lxml import etree
from tqdm import tqdm
from itertools import combinations
import copy
import logging
import re
import os


class SpeakerNameMarkup(SpeakerMarkup):
    """
    This class handles the complex markup of the speakers in one given
    protocol. Creates the name tag with all needed information from the
    Stammdatenbank and has to cross-reference every speaker with it.
    """
    known_redner_dicts = dict()
    last_wahlperiode = int()

    def __init__(self, file_path, element_name=".//redner"):
        super(SpeakerNameMarkup).__init__()
        self.file_path = file_path
        self.filename = os.path.basename(self.file_path)[:-4]
        self.element_name = element_name
        self.redner_dict = dict()
        self.all_speakers = []
        self.logger = logging.getLogger(__name__)

    def cross_reference_markup(self, strings, feature_set_dict,
                               MdB_etree):
        """
        Checks whether features like first name, surname, academic title and
        city are present in the input string. Consists of a main function and
        several helper functions. First the string is split into tokens.
        Every token is checked against sets of valid first names, surnames,
        academic titles and fractions. On a match the corresponding
        dictionary entry is set.
        In a second step the helper function add_missing_MdB_feature adds
        features which are not present in the string or have been identified
        wrongly.
        The function creates a dictionary containing all features of one
        speaker, from which a valid XML element is created later on.
        """

        def initiate_dict(keys, extra_keys):
            """
            Creates a dictionary with the given keys and sets them to None.
            Some specific keys are set to specific values.
            """
            for key in keys:
                redner_dict[key] = None
            for key in extra_keys:
                redner_dict[key] = None
            redner_dict["feature_complete"] = False
            redner_dict["original_string"] = string
            redner_dict["identified"] = False
            redner_dict["damalige_fraktion"] = None

        def get_names(keys, dict, token):
            """
            Checks if the token is in the vorname or nachname set. If so, the
            dictionary values are set accordingly. Avoids overwriting the
            surname with a first name which is also a valid surname.
            """
            for key in keys[0:2]:  # only for vorname, nachname in written order
                if(token in feature_set_dict[key][0] and redner_dict[key]
                   is None):
                    redner_dict[key] = token
                elif(token in feature_set_dict["nachname"][0]
                     and redner_dict["nachname"] is not None):
                    redner_dict["nachname"] = token
                else:
                    continue

        def get_feature(key, string, set):
            """
            Checks if a token is a valid feature (like a name affix, academic
            title, ortszusatz or namenszusatz) and adds it to the dictionary.
            Does not check for names.
            """
            for feature in set:
                if(key == "titel"):
                    regex = r"(\b{}\B)".format(re.escape(feature))  # could be "Dr." and "." is not a word boundary
                elif(key == "namenszusatz"):
                    regex = r"\b({})\b".format(re.escape(feature))  # no "." in the word, so word boundaries at start and end
                elif(key == "fraktion"):
                    regex = r"\B(\({}\))\B".format(re.escape(feature))  # always in parentheses; matching them avoids matching e.g. "CDU" in "CDU/CSU"
                elif(key == "ortszusatz"):
                    regex = r"\B{}\B".format(re.escape(feature))  # always surrounded by parentheses
                else:
                    regex = r"(\b{}\b)".format(re.escape(feature))
                match = re.search(regex, string)
                if(match):
                    if(key == "fraktion"):
                        redner_dict[key] = match.group()[1:-1]  # removes the parentheses
                        break
                    else:
                        redner_dict[key] = match.group()
                        break
                else:
                    redner_dict[key] = None

        def get_role(string):
            """Checks the redner string for a role. Identifies e.g.
            'Bundesministerin für Familie, Senioren, Frauen und Jugend'."""
            if("Staatssekretär" in string or "Staatssekretärin" in string):
                regex = r"(Staatssekretär(in)?)"
                splits = re.split(regex, string, maxsplit=1)
                role_long = splits[1] + splits[-1]
                redner_dict["rolle_lang"] = role_long
                role_short = [word[0] for word in role_long.split()
                              if word[0].isupper()]
                role_short = splits[1] + " " + "".join(role_short)
                redner_dict["rolle_kurz"] = role_short
            elif("Bundesminister" in string or "Bundesministerin" in string):
                regex = r"(Bundesminister(in)?)"
                splits = re.split(regex, string, maxsplit=1)
                role_long = splits[1] + splits[-1]
                redner_dict["rolle_lang"] = role_long
                role_short = [word[0] for word in role_long.split()
                              if word[0].isupper()]
                role_short = splits[1] + " " + "".join(role_short)
                redner_dict["rolle_kurz"] = role_short

        def check_name(redner_dict):
            """
            Checks if vorname and nachname are the same. Sets vorname to None
            if so; it will be set later on with add_missing_MdB_feature.
            """
            if(redner_dict["nachname"] == redner_dict["vorname"]):
                redner_dict["vorname"] = None

        def get_party(redner_dict):
            """
            Creates a party key in the dictionary containing the party of the
            speaker. Party is not the same as fraction. This is mainly done
            because CDU/CSU is the fraction in the Bundestag while speakers
            belong to either the CDU or the CSU. If the fraction is not
            CDU/CSU the party is set to the fraction. Also handles problems
            with GRÜNE.
            """
            if(redner_dict["fraktion"] != "CDU/CSU"
               and redner_dict["fraktion"] != "CDU"
               and redner_dict["fraktion"] != "CSU"):
                redner_dict["partei"] = redner_dict["fraktion"]
            elif(redner_dict["fraktion"] == "CDU"
                 or redner_dict["fraktion"] == "CSU"):
                redner_dict["partei"] = redner_dict["fraktion"]
                redner_dict["fraktion"] = "CDU/CSU"
            if(redner_dict["fraktion"] == "GRÜNE"):
                redner_dict["fraktion"] = "BÜNDNIS 90/DIE GRÜNEN"

        def check_party_and_fraction():
            """
            Checks if party and fraction have been set correctly. Used after
            add_missing_MdB_feature to correct some errors with CDU/CSU.
            """
            if(redner_dict["fraktion"] is not None
               and (redner_dict["partei"] == "CDU"
                    or redner_dict["partei"] == "CSU")):
                redner_dict["fraktion"] = "CDU/CSU"

            if(redner_dict["partei"] is None
               and redner_dict["fraktion"] is not None
               and redner_dict["fraktion"] != "CDU"
               and redner_dict["fraktion"] != "CSU"):
                redner_dict["partei"] = redner_dict["fraktion"]

        def get_match_in_str(key, string, regex):
            """
            Matches a regex in the current string and adds the match as the
            value for the given key in the dictionary.
            """
            match = re.search(regex, string)
            if(match):
                redner_dict[key] = match.group()
            else:
                redner_dict[key] = None

        def add_missing_MdB_feature(string, redner_dict, feature_set_dict,
                                    MdB_etree, conditions_key_list,
                                    feature_lookup, feature_to_add,
                                    logging_state=False, multi_ids=False):
            """
            Tries to get missing features for one speaker. Input is a list of
            features (conditions_key_list) which are used as parameters in an
            XPath expression. The XPath is built dynamically from that list.
            If the XPath matches one unique entry, the feature feature_to_add
            is set to the match of feature_lookup in the matched element.
            """
            ###
            # XPath creation from conditions_key_list
            ###
            xpath_parts = []
            conds = conditions_key_list
            len_conds = len(conds)
            if(len_conds == 1):
                for condition in conds:
                    xpath_part = ".//MDB[.//{}/text()='{}']" \
                        .format(feature_set_dict[condition][1],
                                redner_dict[condition])
                    xpath_parts.append(xpath_part)
                xpath = "".join(xpath_parts)
                if("None" in xpath):
                    xpath = None
            elif(len_conds == 2):
                xpath_first_part = ".//MDB[.//{}/text()='{}'" \
                    .format(feature_set_dict[conds[0]][1],
                            redner_dict[conds[0]])
                xpath_parts.insert(0, xpath_first_part)
                xpath_last_part = ".//{}/text()='{}']" \
                    .format(feature_set_dict[conds[-1]][1],
                            redner_dict[conds[-1]])
                xpath_parts.append(xpath_last_part)
                xpath = " and ".join(xpath_parts)
                if("None" in xpath):
                    xpath = None
            elif(len_conds > 2):
                xpath_first_part = ".//MDB[.//{}/text()='{}'" \
                    .format(feature_set_dict[conds[0]][1],
                            redner_dict[conds[0]])
                xpath_parts.insert(0, xpath_first_part)
                for condition in conds[1:-1]:
                    xpath_inner_part = ".//{}/text()='{}'" \
                        .format(feature_set_dict[condition][1],
                                redner_dict[condition])
                    xpath_parts.append(xpath_inner_part)
                xpath_last_part = ".//{}/text()='{}']" \
                    .format(feature_set_dict[conds[-1]][1],
                            redner_dict[conds[-1]])
                xpath_parts.append(xpath_last_part)
                xpath = " and ".join(xpath_parts)
                if("None" in xpath):  # sets the XPath to None if it uses a feature which is None
                    xpath = None
            xpath_parts = []  # empties the xpath_parts list
            try:  # tries every XPath
                matches = MdB_etree.xpath(xpath)
            except TypeError:  # handles XPaths that are None
                matches = []
            # If the XPath has one unique match, the new feature value is set.
            if(len(matches) == 1):
                matches = matches[0]
                feature_lookup = ".//" + feature_lookup
                new_feature = matches.xpath(feature_lookup)[0].text
                self.logger.info((" There is one unique match "
                                  + " for this speaker: "
                                  + str(redner_dict)
                                  + " Extracted feature "
                                  + feature_lookup + ": "
                                  + str(new_feature)
                                  + " with: "
                                  + str(conds)))
                redner_dict[feature_to_add] = new_feature
                self.logger.info(("New speaker features are: "
                                  + str(redner_dict)))
            # Handles matches that are not unique, for multi ids and logging.
            elif(len(matches) > 1 and multi_ids is True):
                ids = matches
                for i, id in enumerate(ids):
                    key = "id" + str(i)
                    redner_dict[key] = id
                return matches
            elif(len(matches) > 1):
                self.logger.warning((" There are "
                                     + str(len(matches))
                                     + " matches for this speaker: "
                                     + str(redner_dict)
                                     + " .Could not extract: "
                                     + feature_lookup
                                     + " Features used are: "
                                     + str(conds)))

        def get_periode(MdB_etree):
            periode = self.xml_tree.xpath(".//wahlperiode")
            if(periode):
                redner_dict["wahlperiode"] = periode[0].text
                return periode[0].text

        ###
        # Start of the main function cross_reference_markup
        ###

        # Initiates an empty dict and gets the keys for it.
        redner_dict = dict()
        features = list(feature_set_dict.keys())

        # Counters to calculate how successful the identification of speakers is.
        identified_speakers = 0
        unidentified_speakers = 0
        multiple_identified_speakers = 0

        # Cross-references every <redner> string.
        for string in tqdm(strings, desc="Cross reference name markup for speakers in strings"):
            self.logger.info("\nStarting name markup process for new speaker:")
            # Sets the values in redner_dict to None or to a specific value.
            initiate_dict(features, [feature for feature in features])
            # ":" and "," are stripped because some tokens would otherwise be
            # "name:" or contain a ",".
            tokens = string.replace(":", "").replace(",", "").split()
            for token in tokens:
                get_names(features, feature_set_dict, token)
            self.logger.info("nachname is: " + str(redner_dict["nachname"]))
            feature_keys = [key for key in features if key not in ["vorname",
                                                                   "nachname"]]
            for f_key in feature_keys:
                get_feature(f_key, string, feature_set_dict[f_key][0])
            get_party(redner_dict)
            check_name(redner_dict)
            regex_p = r"^\w*(?:P|p)räsident\w*"
            get_match_in_str("präsident", string, regex_p)
            get_role(string)

            ###
            # Checks if the script is still running on the same Wahlperiode.
            # If not, known_redner_dicts is emptied.
            ###
            current_wahlperiode = get_periode(MdB_etree)
            if(current_wahlperiode != SpeakerNameMarkup.last_wahlperiode):
                SpeakerNameMarkup.known_redner_dicts = dict()
                SpeakerNameMarkup.last_wahlperiode = current_wahlperiode

            ###
            # Creates the possible combinations of features which are used in
            # add_missing_MdB_feature to identify missing features like
            # vorname or nachname.
            ###

            combination_features = [feature for feature in features if feature
                                    not in ["namenszusatz",
                                            "feature_complete",
                                            "id",
                                            "titel",
                                            "rolle_kurz",
                                            "rolle_lang",
                                            "original_string",
                                            "identified",
                                            "damalige_fraktion"]]
            subsets = []
            for length in range(0, 5):
                for subset in combinations(combination_features, length):
                    subsets.append(list(subset))
            subsets = subsets[1:]
            combination_features.remove("wahlperiode")
            combination_features.remove("nachname")
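
            # Editor's sketch: combinations(["vorname", "nachname",
            # "fraktion"], 2) yields ("vorname", "nachname"),
            # ("vorname", "fraktion") and ("nachname", "fraktion"); each
            # subset becomes one XPath attempt in add_missing_MdB_feature.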

            ###
            # First while loop, trying to identify every feature for one
            # speaker. Uses the combinations from above. Before calling
            # add_missing_MdB_feature there is a check whether the speaker
            # has already been identified before. If so, the features are set
            # to the already identified features, which saves a lot of time.
            ###

            counter_feats = 0
            while(redner_dict["feature_complete"] is False):
                redner_dict["damalige_fraktion"] = redner_dict["fraktion"]
                # Checks if the speaker has already been identified before.
                if(string in SpeakerNameMarkup.known_redner_dicts):
                    redner_dict = SpeakerNameMarkup.known_redner_dicts[string].copy()
                    redner_dict["identified"] = True
                    self.logger.info(("Speaker has already been identified "
                                      + "once."))
                    self.logger.info(("Speaker features are set to: "
                                      + str(SpeakerNameMarkup.known_redner_dicts[string])))
                    if(SpeakerNameMarkup.known_redner_dicts[string]["feature_complete"] is not False):
                        identified_speakers += 1
                    break
                else:
                    for feature in combination_features:
                        for subset in subsets:
                            add_missing_MdB_feature(string,
                                                    redner_dict,
                                                    feature_set_dict,
                                                    MdB_etree,
                                                    subset,
                                                    feature_set_dict[feature][1],
                                                    feature)
                        check_party_and_fraction()
                        if(redner_dict["vorname"] is not None
                           and redner_dict["nachname"] is not None
                           and redner_dict["fraktion"] is not None
                           and redner_dict["partei"] is not None):
                            redner_dict["feature_complete"] = True
                        counter_feats += 1
                        if(counter_feats == len(combination_features)):
                            redner_dict["feature_complete"] = False
                            break

            ###
            # Second while loop: uses four features to identify the unique ID
            # for one speaker with add_missing_MdB_feature. Also tries to
            # identify speakers with fewer known features. In that case there
            # can be multiple possible ids for one speaker; these are saved
            # in special dictionary entries. Rare case.
            ###

            counter_ids = 0
            while(redner_dict["id"] is None):
                if(redner_dict["feature_complete"] is True):
                    add_missing_MdB_feature(string,
                                            redner_dict,
                                            feature_set_dict,
                                            MdB_etree,
                                            ["vorname", "nachname", "partei",
                                             "wahlperiode"],
                                            feature_set_dict["id"][1],
                                            "id")
                    key_original_string = redner_dict["original_string"]
                    SpeakerNameMarkup.known_redner_dicts.update(
                        {key_original_string: redner_dict.copy()})
                    redner_dict["identified"] = True
                    if(counter_ids == 1):
                        redner_dict["id"] = None
                        redner_dict["feature_complete"] = False
                        redner_dict["identified"] = False
                        self.logger.warning(("Unique ID could not be assigned. "
                                             + "Feature complete: True "
                                             + "Features are: "
                                             + str(redner_dict)))
                        SpeakerNameMarkup.known_redner_dicts.update(
                            {key_original_string: redner_dict.copy()})
                        unidentified_speakers += 1
                        identified_speakers -= 1  # because identified_speakers was incremented before
                        break
                    identified_speakers += 1
                elif(redner_dict["feature_complete"] is not True):
                    redner_dict["id"] = None
                    ids = add_missing_MdB_feature(string,
                                                  redner_dict,
                                                  feature_set_dict,
                                                  MdB_etree,
                                                  ["nachname", "partei",
                                                   "wahlperiode"],
                                                  feature_set_dict["id"][1],
                                                  "id", False, True)
                    if(ids is not None and len(ids) > 1):
                        redner_dict["identified"] = "Multiple"
                        multiple_identified_speakers += 1
                        identified_speakers -= 1
                        break
                    elif(ids is None):
                        self.logger.warning(("Unique ID could not be assigned. "
                                             + "Feature complete: False "
                                             + "Features are: "
                                             + str(redner_dict)))
                        redner_dict["identified"] = False
                        unidentified_speakers += 1
                        break
                counter_ids += 1
self.logger.info(("Number of identified speakers with valid id and"
|
||||
+ " name markup is: "
|
||||
+ str(identified_speakers)))
|
||||
self.logger.info(("Number of unidentified speakers without valid"
|
||||
+ " id and name markup is: "
|
||||
+ str(unidentified_speakers)))
|
||||
self.logger.info(("Number of speakers with possible multiple ids: "
|
||||
+ str(multiple_identified_speakers)))
|
||||
self.logger.info(("Number of all speaker entitiys in current"
|
||||
+ " protocoll is: "
|
||||
+ str(len(strings))))
|
||||
redner_dict_final = copy.deepcopy(redner_dict)
|
||||
self.redner_dict = redner_dict_final
|
||||
self.all_speakers.append(self.redner_dict)
|
||||
for key in features:
|
||||
redner_dict[key] = None
|
||||
|
||||
# print("Speaker features after whole cross reference markup:",
|
||||
# redner_dict_final)
|
||||
self.logger.info(("Saved speakers (identfied and not identified): "
|
||||
+ str(len(self.all_speakers))))
|
||||
|
||||

    def create_speaker_elements(self):
        """
        Creates a valid redner XML element for every redner_dict entry in the
        list self.all_speakers. Has to be done step by step because the
        dictionary is not sorted and the name sub-elements have to be in a
        specific order.
        """
        self.all_speaker_elements = []
        for redner_entry in tqdm(self.all_speakers, desc="Creating speaker element"):
            redner_element = etree.Element("redner")
            redner_element.set("id", str(redner_entry["id"]))
            name_element = etree.Element("name")
            titel_element = etree.Element("titel")
            titel_element.text = redner_entry["titel"]
            vorname_element = etree.Element("vorname")
            vorname_element.text = redner_entry["vorname"]
            namenszusatz_element = etree.Element("namenszusatz")
            namenszusatz_element.text = redner_entry["namenszusatz"]
            nachname_element = etree.Element("nachname")
            nachname_element.text = redner_entry["nachname"]
            damalige_fraktion_element = etree.Element("damalige_fraktion")
            damalige_fraktion_element.text = redner_entry["damalige_fraktion"]
            fraktion_element = etree.Element("fraktion")
            fraktion_element.text = redner_entry["fraktion"]
            partei_element = etree.Element("partei")
            partei_element.text = redner_entry["partei"]
            ortszusatz_element = etree.Element("ortszusatz")
            ortszusatz_element.text = redner_entry["ortszusatz"]
            rolle_lang_element = etree.Element("rolle_lang")
            rolle_lang_element.text = redner_entry["rolle_lang"]
            rolle_kurz_element = etree.Element("rolle_kurz")
            rolle_kurz_element.text = redner_entry["rolle_kurz"]
            original_string_element = etree.Element("original_string")
            original_string_element.text = redner_entry["original_string"]

            if(redner_entry["titel"] is not None):
                name_element.append(titel_element)
            name_element.append(vorname_element)
            if(redner_entry["namenszusatz"] is not None):
                name_element.append(namenszusatz_element)
            name_element.append(nachname_element)
            name_element.append(damalige_fraktion_element)
            name_element.append(fraktion_element)
            name_element.append(partei_element)
            if(redner_entry["ortszusatz"] is not None):
                name_element.append(ortszusatz_element)
            if(redner_entry["rolle_lang"] is not None):
                name_element.append(rolle_lang_element)
                name_element.append(rolle_kurz_element)
            name_element.append(original_string_element)
            name_element.tail = original_string_element.text
            redner_element.append(name_element)
            self.all_speaker_elements.append(redner_element)
            self.logger.info(("Speaker element is: "
                              + ElementTree.tostring(redner_element).decode("utf-8")))

    def set_speech_ids(self):
        """
        Sets a unique rede id for every rede element in one protocol. The id
        is a ten-digit integer preceded by the string "ID", for example
        ID1809900000.
        The first two digits are the Wahlperiode, the following three digits
        are the sitzungsnr (session number). The remaining digits count the
        speeches: the first speech is 00100, the second 00200, the eleventh
        01100 and so on (ID1809901100 --> eleventh speech). The last two
        digits are reserved for corrections.
        """

        id_counter = 0
        speeches = self.xml_tree.xpath(".//sitzungsbeginn | .//rede")
        for speech in tqdm(speeches, desc="Creating speech ids"):
            id_counter_str = str(id_counter).zfill(5)
            id = "ID" + self.filename + id_counter_str
            speech.set("id", id)
            id_counter += 100
            self.logger.info(("Speech id is: " + id))
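
To make the cross-reference markup concrete, here is a minimal sketch of the dynamically built XPath from `add_missing_MdB_feature`, run against a tiny stand-in for the MdB Stammdatenbank (the nesting of VORNAME/NACHNAME under MDB is an assumption for illustration; the real data is `data/MdB_data/MdB_Stammdaten.xml`):

```python
from lxml import etree

# Tiny stand-in for the Stammdatenbank.
mdb_etree = etree.fromstring(
    "<DOCUMENT><MDB><ID>11001478</ID>"
    "<NAMEN><NAME><VORNAME>Max</VORNAME>"
    "<NACHNAME>Mustermann</NACHNAME></NAME></NAMEN>"
    "</MDB></DOCUMENT>"
)
# Two conditions produce ".//MDB[...x... and ...y...]", as built above.
xpath = (".//MDB[.//VORNAME/text()='Max'"
         " and .//NACHNAME/text()='Mustermann']")
matches = mdb_etree.xpath(xpath)
if(len(matches) == 1):  # unique hit: read the missing feature
    print(matches[0].xpath(".//ID")[0].text)  # -> 11001478
```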
0
bundesdata_markup_nlp/markup/__init__.py
Executable file
BIN
bundesdata_markup_nlp/markup/__pycache__/MdBData.cpython-37.pyc
Normal file
BIN
bundesdata_markup_nlp/markup/__pycache__/__init__.cpython-37.pyc
Normal file
BIN
bundesdata_markup_nlp/markup/__pycache__/metadata.cpython-37.pyc
Normal file
BIN
bundesdata_markup_nlp/markup/__pycache__/speakers.cpython-37.pyc
Normal file
BIN
bundesdata_markup_nlp/markup/__pycache__/speeches.cpython-37.pyc
Normal file
49
bundesdata_markup_nlp/markup/beautify_markup.py
Executable file
@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.FileGetter import FileGetter
from utility.XMLProtocol import XMLProtocol
import configparser
from tqdm import tqdm


def beautify_xml(case, alter_lines=False, line_width=0):
    """
    Beautifies the XML protocols so that they are easily readable by humans.
    Uses .beautify_xml_part() and .beautify_xml() to be able to format lines
    for specific parts of an XML file. alter_lines can be set to False or
    True. The line width used when alter_lines is True can be set to any
    value between 0 and 160.
    """
    config = configparser.ConfigParser()
    config.read("config.ini")
    if(case == "markup"):
        output_path = config["File paths"]["output_folder"]
        input_path = config["File paths"]["clear_speech_markup"]
        key_name = "beautiful_xml"
    elif(case == "nlp"):
        output_path = config["File paths"]["nlp_output"]
        input_path = config["File paths"]["nlp_lemmatized_tokenized"]
        key_name = "nlp_beuatiful_xml"
    files = FileGetter(input_path, "*.xml")
    files = files.get_files()
    for file_path in tqdm(sorted(files), desc="First beautification steps"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        xml.beautify_xml_part(file_path, ".//vorspann")
        xml.replace_elements(".//vorspann", [xml.beautified_part])
        xml.beautify_xml_part(file_path, ".//sitzungsverlauf", alter_lines,
                              line_width)
        xml.replace_elements(".//sitzungsverlauf", [xml.beautified_part])
        xml.save_to_file(output_path, file_path, key_name,
                         "File paths", key_name)
    config.read("config.ini")
    beautiful_xmls_path = config["File paths"][key_name]
    files = FileGetter(beautiful_xmls_path, "*.xml")
    files = files.get_files()
    for file_path in tqdm(files, desc="Second beautification steps"):
        xml.beautify_xml(file_path, False)


if __name__ == '__main__':
    beautify_xml()
57
bundesdata_markup_nlp/markup/metadata.py
Executable file
@ -0,0 +1,57 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.FileGetter import FileGetter
from markup.MetadataMarkup import MetadataMarkup
from tqdm import tqdm
import os
import configparser
import logging


def get_metadata():
    """
    This script creates a valid metadata head and a first-level XML tag
    structure for all files in one directory (including subdirectories). It
    needs the file paths of all files to consider; these are collected with
    the FileGetter class.
    After that it extracts the given metadata for each file and writes it as
    valid XML according to the new official schema into a new file at the
    given output path.
    """
    logger = logging.getLogger(__name__)
    print("Running metadata creation for original XML protocols.")
    config = configparser.ConfigParser()
    config.read("config.ini")
    input_path = config["File paths"]["input_folder_xmls"]
    output_path = config["File paths"]["output_folder"]
    Files = FileGetter(input_path, "*.xml")
    file_list = Files.get_files()
    metadata = MetadataMarkup()
    for file in tqdm(sorted(file_list), desc="Metadata status:"):
        logger.info("\nCreating metadata for: " + str(os.path.basename(file)))
        root = metadata.read_protcol(file)
        metadata.extract_metadata(root)
        metadata.built_iso_date(metadata.datum_ger_non_iso)
        metadata.built_date_string(metadata.datum_iso)
        metadata.delete_old_metadata(root)
        metadata.split_content(root)
        metadata.insert_new_metadata(root)
        metadata.get_session_times()
        metadata.write_to_attr("dbtplenarprotokoll", "sitzung-datum",
                               metadata.datum_ger_non_iso)
        metadata.write_to_attr("dbtplenarprotokoll", "sitzung-start-uhrzeit",
                               metadata.session_start_time)
        metadata.write_to_attr("dbtplenarprotokoll", "sitzung-ende-uhrzeit",
                               metadata.session_end_time)
        metadata.write_to_attr("dbtplenarprotokoll", "sitzungs-nr",
                               metadata.sitzungsnr)
        metadata.write_to_attr("dbtplenarprotokoll", "wahlperiode",
                               metadata.wahlperiode)
        metadata.save_to_file(output_path, file, "new_metadata", "File paths", "new_metadata")
        logger.info("New metadata created for: " + str(os.path.basename(file)))
    print("Successfully extracted and wrote new metadata to the XML protocols.")


if __name__ == '__main__':
    get_metadata()
122
bundesdata_markup_nlp/markup/speaker_names.py
Executable file
@ -0,0 +1,122 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from markup.SpeakerNameMarkup import SpeakerNameMarkup
from markup.MdBData import MdBData
from utility.FileGetter import FileGetter
from xml.etree import ElementTree
from tqdm import tqdm
import os
import configparser
import logging


def get_names():
    """
    This script takes the identified speaker elements and analyses their text
    to determine <vorname>, <nachname>, @id etc. for every speaker. Also
    creates a speech id for every speech.
    """
    ###
    # Sets paths from the config and starts logging.
    ###
    logger = logging.getLogger(__name__)
    config = configparser.ConfigParser()
    config.read("config.ini")
    xml_path = config["File paths"]["new_simple_markup"]
    output_path = config["File paths"]["output_folder"]
    parent_path = os.path.dirname(os.getcwd())
    stammdatenbank_full_path = os.path.join(parent_path,
                                            "data/MdB_data/MdB_Stammdaten.xml")
    ###
    # Opens and reads the Stammdatenbank.
    ###
    stammdatenbank = MdBData()
    stammdatenbank.read_xml(stammdatenbank_full_path)
    ###
    # Gets sets of the different name/MdB features.
    ###
    # first names
    first_names = stammdatenbank.get_set(".//VORNAME", stammdatenbank.xml_tree)
    first_names.discard(None)
    # last names
    last_names = stammdatenbank.get_set(".//NACHNAME", stammdatenbank.xml_tree)
    last_names.discard(None)
    # academic titles
    academic_titles = stammdatenbank.get_set(".//AKAD_TITEL",
                                             stammdatenbank.xml_tree)
    academic_titles_short = stammdatenbank.get_set(".//ANREDE_TITEL",
                                                   stammdatenbank.xml_tree)
    additional_academic_titles = [title for title in config["Additional name features"]["academic_titles"].split()]
    for title in additional_academic_titles:
        academic_titles.add(title)
    academic_titles = academic_titles.union(academic_titles_short)
    academic_titles.discard(None)
    # parties
    parties = stammdatenbank.get_set(".//PARTEI_KURZ", stammdatenbank.xml_tree)
    additional_parties = [party for party in config["Additional name features"]["parties"].split()]
    for party in additional_parties:
        parties.add(party)
    parties.discard(None)
    # name affixes
    name_affixes = stammdatenbank.get_set(".//PRAEFIX", stammdatenbank.xml_tree)
    name_affixes.discard(None)
    # cities
    cities = stammdatenbank.get_set(".//ORTSZUSATZ", stammdatenbank.xml_tree)
    cities.discard(None)
    # empty sets, later combined with XML node names for the XPaths
    party = set()
    periode = set()
    feature_complete = set()
    speaker_id = set()
    role_long = set()
    role_short = set()
    ###
    # Creates a dict with tuples of sets and the corresponding XML node name.
    ###
    sets = [(first_names, "VORNAME"), (last_names, "NACHNAME"),
            (academic_titles, "AKAD_TITEL"), (parties, "PARTEI_KURZ"),
            (name_affixes, "PRAEFIX"), (cities, "ORTSZUSATZ"),
            (party, "PARTEI_KURZ"), (periode, "WP"), (feature_complete, "None"),
            (speaker_id, "ID"), (role_long, "None"), (role_short, "None")]
    features = ["vorname", "nachname", "titel", "fraktion", "namenszusatz",
                "ortszusatz", "partei", "wahlperiode", "feature_complete",
                "id", "rolle_lang", "rolle_kurz"]
    feature_set_dict = dict(zip(features, sets))
    ###
    # Opens the XML protocols and starts the speaker markup for the features.
    ###
    files = FileGetter(xml_path, "*.xml")
    files = files.get_files()
    for file_path in tqdm(sorted(files),
                          desc="File status"):
        complex_speaker = SpeakerNameMarkup(file_path, ".//redner")
        complex_speaker.read_xml(file_path)
        complex_speaker.get_element_text()
        logger.info(("Doing cross reference markup for names to get redner ids."
                     + " For file: "
                     + os.path.basename(file_path)))
        complex_speaker.cross_reference_markup(complex_speaker.current_strings,
                                               feature_set_dict,
                                               stammdatenbank.xml_tree)
        complex_speaker.create_speaker_elements()
        complex_speaker.replace_elements(".//redner",
                                         complex_speaker.all_speaker_elements,
                                         True)
        xml_string = ElementTree.tostring(complex_speaker.xml_tree)
        well_formed = complex_speaker.simple_check_xml(xml_string, file_path,
                                                       False, False)
        if(well_formed is False):
            logger.error(("This XML file is not well-formed. Program stopped."
                          " Fix or remove this file and run the program again."
                          ))
            print("Program has stopped. See logs for more info.")
            break
        complex_speaker.set_speech_ids()
        complex_speaker.save_to_file(output_path, file_path, "complex_markup",
                                     "File paths", "complex_markup")


if __name__ == '__main__':
    get_names()
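
The `dict(zip(features, sets))` above pairs every feature name with a tuple of (value set, XML node name). A minimal sketch of the resulting shape, with invented values:

```python
# Toy values; the real sets come from MdB_Stammdaten.xml.
sets = [({"Max"}, "VORNAME"), ({"Mustermann"}, "NACHNAME")]
features = ["vorname", "nachname"]
feature_set_dict = dict(zip(features, sets))
print(feature_set_dict["vorname"][0])  # {'Max'}, used for token lookups
print(feature_set_dict["vorname"][1])  # 'VORNAME', used in the XPaths
```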
114
bundesdata_markup_nlp/markup/speakers.py
Executable file
@ -0,0 +1,114 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.FileGetter import FileGetter
from utility.XMLProtocol import XMLProtocol
from markup.EntityMarkup import EntityMarkup
from markup.SpeakerMarkup import SpeakerMarkup
from tqdm import tqdm
import configparser
import logging
import os


def get_speakers():
    """
    This script identifies the speakers in one XML file with the new metadata
    structure created by metastructure.py and applies well-formed XML markup
    to them and their speeches. The markup tries to follow the official
    guideline of the Deutsche Bundesregierung but is more simplistic and
    deviates from it when it comes to marking up the president of a session.
    This decision was made to guarantee that every speaker's speech only
    contains what he or she is saying. Thus the markup follows its own
    minimal markup defined in the DTD 'minimal_markup.dtd', which tries to
    mimic the official one as closely as possible. The full official markup
    cannot be applied to the XML protocols automatically. The script uses
    classes and subclasses from EntityMarkup.py.
    """
    logger = logging.getLogger(__name__)
    print("Running simple markup for first speaker identification.")
    config = configparser.ConfigParser()
    config.read("config.ini")
    regex_conf_triples = config.items("Regular expressions speakers")
    regex_conf_triples = [regex[1].split(" ; ") for regex in regex_conf_triples]
    input_path = config["File paths"]["new_metadata"]
    output_path = config["File paths"]["output_folder"]
    files = FileGetter(input_path, "*.xml")
    file_list = files.get_files()
    sum_matches = 0

    for file_path in tqdm(sorted(file_list), desc="Speaker markup status"):

        identified = EntityMarkup(file_path)
        logger.info("Doing simple markup for: " + str(os.path.basename(file_path)))
        logger.info("\nMarkup status for: " + str(os.path.basename(file_path)))
        with open(file_path, 'r') as f:
            xml_as_string = f.read()
        xml_as_bytes = xml_as_string.encode("utf-8")
        well_formed = identified.simple_check_xml(xml_as_bytes, file_path,
                                                  False, False)
        if(well_formed is False):
            logger.error(("This XML file is not well-formed. Program stopped."
                          " Fix or remove this file and run the program again."
                          ))
            print("Program has stopped. See logs for more info.")
            break
        identified.read_xml(file_path)
        identified.get_element_text()
        string_for_markup = identified.current_string
        # Start of the simple markup.
        for regex_conf_triplet in regex_conf_triples:
            regex = regex_conf_triplet[0]
            case = regex_conf_triplet[1]
            speaker = SpeakerMarkup(string_for_markup, regex)
            speaker.compile_regex(regex)
            speaker.identify_speaker()
            speaker.markup_speaker(case)
            string_for_markup = speaker.markuped_string
            sum_matches += speaker.matches_count

        logger.info(str(sum_matches) + " total matches in the protocol.")
        sum_matches = 0
        speaker.simple_check_xml(string_for_markup, file_path, False)
        # Saves the simple markuped string to the XML file.
        speaker.read_xml(file_path)
        speaker.replace_string(string_for_markup, "sitzungsverlauf")
        speaker.save_to_file(output_path, file_path, "simple_xml", "File paths",
                             "new_simple_markup")

    print("Simple markup finished.")

    config.read("config.ini")
    new_simple_xml_path = config["File paths"]["new_simple_markup"]
    # Start of the president replacer.
    new_files = FileGetter(new_simple_xml_path, "*.xml")
    new_file_list = new_files.get_files()
    print("Replacing some XML elements in the protocols.")
    for file_path in tqdm(sorted(new_file_list), desc="Files replacement status"):
        logger.info("Replacing some xml elements for: " + str(os.path.basename(file_path)))
        for regex_conf_triplet in regex_conf_triples:
            if(regex_conf_triplet[1] != "first"
               and regex_conf_triplet[1] != "last"):
                regex = regex_conf_triplet[0]
                speaker_rolle_value = regex_conf_triplet[2]
                replacements = XMLProtocol()
                replacements.read_xml(file_path)
                replacements.compile_regex(regex)
                replacements.expand_element(".//rede", "typ",
                                            speaker_rolle_value)
                replacements.save_to_file(output_path, file_path, "simple_xml",
                                          "File paths", "new_simple_markup")
        start_time_attr_value = replacements.xml_tree.get("sitzung-start-uhrzeit")
        replacements.replace_tag_attr(".//sitzungsverlauf/rede[1]",
                                      "sitzungsbeginn",
                                      "sitzung-start-uhrzeit",
                                      start_time_attr_value,
                                      False)
        end_time_attr_value = replacements.xml_tree.get("sitzung-ende-uhrzeit")
        replacements.expand_element(".//sitzungsende", "sitzung-ende-uhrzeit",
                                    end_time_attr_value, False)
        replacements.save_to_file(output_path, file_path, "simple_xml",
                                  "File paths", "new_simple_markup")


if __name__ == '__main__':
    get_speakers()
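
The `" ; "`-separated config entries drive this whole pass. A sketch of the assumed shape of one entry in the 'Regular expressions speakers' section (the real patterns in `config.ini` are not shown in this commit):

```python
# Hypothetical entry value: "pattern ; case ; rolle".
value = r"^Präsident(?:in)? .+?: ; first ; präsident"
regex, case, rolle = value.split(" ; ")
print(regex)  # pattern handed to SpeakerMarkup
print(case)   # "first", "middle" or "last", selects the markup path
print(rolle)  # later written to the typ attribute of <rede>
```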
76
bundesdata_markup_nlp/markup/speeches.py
Executable file
@ -0,0 +1,76 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility.FileGetter import FileGetter
from markup.EntityMarkup import EntityMarkup
import configparser
from tqdm import tqdm
import logging


def markup_speeches():
    """
    Marks up different entities, for example comments, in the speech strings.
    First it marks the speech parts (<p>) line by line.
    """
    logger = logging.getLogger(__name__)
    config = configparser.ConfigParser()
    config.read("config.ini")
    complex_xmls = config["File paths"]["complex_markup"]
    output_path = config["File paths"]["output_folder"]
    regex_conf_pairs = config.items("Regular expressions speeches")
    regex_conf_pairs = [regex[1].split(" ; ") for regex in regex_conf_pairs]
    multiline_entities = config.items("Multiline entities")
    multiline_entities = [regex[1].split(" ; ") for regex in multiline_entities]
    files = FileGetter(complex_xmls, "*.xml")
    file_list = files.get_files()
    for file_path in tqdm(sorted(file_list), desc="File status speech markup"):
        entity = EntityMarkup(file_path)
        entity.read_xml(file_path)
        speeches = entity.xml_tree.xpath(".//rede")
        session_start = entity.xml_tree.xpath(".//sitzungsbeginn")[0]
        for speech in speeches:
            entity.markup_speech_lines(speech)
        entity.markup_speech_lines(session_start)

        session_lines = entity.xml_tree.xpath(".//p")
        for line in tqdm(session_lines, desc="Marking single line entities"):
            for pair in regex_conf_pairs:
                entity.inject_element(line, pair[0], pair[1])

        session_lines = entity.xml_tree.xpath(".//p")  # gets the new, altered session lines (<p>)
        for pair in tqdm(multiline_entities, desc="Marking multiline entities:"):
            entity.get_multiline_entities(session_lines, pair[0], pair[1], pair[2])
        # For logging:
        all_entities = 0
        only_single_line_entities = 0
        for pair in regex_conf_pairs:
            element_path = ".//" + pair[1]
            nr_entities = len(entity.xml_tree.xpath(element_path))
            logger.info(("Number of identified " + pair[1] + " elements is: "
                         + str(nr_entities)
                         + " (single line)"))
            all_entities += nr_entities
            only_single_line_entities += nr_entities

        for pair in multiline_entities:
            element_path = ".//" + pair[2]
            nr_entities = len(entity.xml_tree.xpath(element_path))
            logger.info(("Number of identified " + pair[2] + " elements is: "
                         + str(nr_entities)
                         + " (multi line)"))
            all_entities += nr_entities

        logger.info(("Number of all identified single line entities: "
                     + str(only_single_line_entities)))

        logger.info(("Number of all identified entities is: " + str(all_entities)
                     + " Also includes multiline matches. The number can be"
                     + " higher than the real one if multiline matches match"
                     + " the same text as the single line entity regexes."))

        entity.save_to_file(output_path, file_path, "clear_speech_markup",
                            "File paths", "clear_speech_markup")


if __name__ == '__main__':
    markup_speeches()
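
The 'Regular expressions speeches' and 'Multiline entities' sections are split the same way as the speaker triples, just with fewer fields. A hedged sketch of an assumed single-line entry:

```python
# Hypothetical entry value: "pattern ; element name".
value = r"\(Beifall[^)]*\) ; kommentar"
regex, element_name = value.split(" ; ")
# inject_element presumably wraps every hit inside a <p> line in
# <kommentar>...</kommentar>; its exact behaviour is defined in
# EntityMarkup.py, which is not part of this excerpt.
```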
BIN
bundesdata_markup_nlp/nlp/__pycache__/n_grams.cpython-37.pyc
Normal file
BIN
bundesdata_markup_nlp/nlp/__pycache__/tokenize.cpython-37.pyc
Normal file
84
bundesdata_markup_nlp/nlp/lemmatization.py
Executable file
@ -0,0 +1,84 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import de_core_news_sm
import configparser
from utility.XMLProtocol import XMLProtocol
from lxml import etree
from tqdm import tqdm
import re


def lemmatization(files, no_stop_words=False):
    """
    Lemmatizes the speeches of the input XML protocols with the built-in spaCy
    lookup-table function. Can include or exclude stop words.
    The lemmatized text will be written into a new element named
    <rede_lemmatisiert>. Always removes punctuation. Joins hyphenated strings
    before they are lemmatized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Lemmatization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if(part.text is not None):
                    tmp_list.append(re.sub(r"_", " ", str(part.text + "\n")))
                    """
                    Replaces "_" with " ". This is needed because a string
                    like "Treffsicherheit einer Schrotflinte;_Sie haben
                    nämlich kaum den Punkt getroffen" will not be lemmatized
                    correctly by spaCy: "Schrotflinte;_Sie" will be recognized
                    as one token. It also messes up the sorted n-gram
                    calculation. Additionally adds "\n" at the end of every
                    line to help identify hyphenated words.
                    """
                part.getparent().remove(part)
            new_text = "".join(tmp_list)
            new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[a-zßäüö])", r"\g<wordend>\g<wordstart>", new_text)
            """
            Joins hyphenated words together:
            'Länderfinanz- ausgleich' --> Länderfinanzausgleich.
            Better to do it here because most of the comments and metadata
            have already been marked. Ignores strings like 'Finanz-, Handels-
            und Sicherheitspolitik', but not when they occur at a line break.
            This is a rare occasion, though.
            """
            new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[A-ZÄÜÖ])", r"\g<wordend>-\g<wordstart>", new_text)
            """
            Removes all line breaks again. This way compound names with a
            line break in between, like "Sütterlin-\nWaack", will be
            recognized as one string by spaCy. --> Sütterlin-Waack
            """
            lemmatized_speech = etree.Element("rede_lemmatisiert")
            doc = nlp(new_text)
            if(no_stop_words is False):
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.pos_ != "PUNCT" and token.text != "_"])
                """
                Removes "_" from the text. It has to be removed because it is
                a special character in spaCy.
                """
                filename_sufix = "_lemmatized_with_stopwords.xml"
            elif(no_stop_words is True):
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.is_stop is False
                                       and token.pos_ != "PUNCT" and token.text != "_"])
                filename_sufix = "_lemmatized_without_stopwords.xml"
            lemmatized_speech.text = lemmatized
            speech.append(lemmatized_speech)
        xml.save_to_file(output_path, file_path, "lemmatized", "File paths",
                         "nlp_lemmatized_tokenized", filename_sufix)


if __name__ == '__main__':
    lemmatization()
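Since the two re.sub calls above carry the core of the dehyphenation logic, here is a small self-contained check of their behavior; the sample string is invented for illustration:

import re

text = "Der Länderfinanz-\nausgleich und Frau Sütterlin-\nWaack"
# Lowercase letter after hyphen + line break: an ordinary hyphenated word, join it.
text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[a-zßäüö])",
              r"\g<wordend>\g<wordstart>", text)
# Uppercase letter after hyphen + line break: a compound name, keep the hyphen.
text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[A-ZÄÜÖ])",
              r"\g<wordend>-\g<wordstart>", text)
print(text)  # Der Länderfinanzausgleich und Frau Sütterlin-Waack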
142
bundesdata_markup_nlp/nlp/n_grams.py
Executable file
@@ -0,0 +1,142 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import configparser
import csv
import os
import gc
from utility.XMLProtocol import XMLProtocol
from collections import Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from itertools import groupby, chain
from operator import itemgetter
import locale
locale.setlocale(locale.LC_COLLATE, "C")  # Sets locale to portable "C" locale.


def n_grams(files, group_by_feature="year",
            input_type_name="lemmatized_without_stopwords"):
    """
    Calculates 1- to 5-grams for the given input protocols. Can handle either
    lemmatized or non-lemmatized files. Writes the n-grams to tab-separated
    CSV files. One row includes the n-gram, its match count, and the year,
    date, rede_id or redner_id. One file per unigram, bigram, trigram etc. per
    group key will be created. (There will be one file for unigrams starting
    with the letter 'A', one for unigrams starting with 'B', etc.)
    The third parameter is a string set by the user which will be added to the
    file names to help distinguish lemmatized and non-lemmatized n-grams etc.
    The more protocols are used as input, the more RAM the script needs.
    For all 4106 protocols, 32GB of RAM with a 32GB swap file was used!
    """
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    output_path = os.path.join(output_path, "n-grams")
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    for step in tqdm(range(1, 6), desc="Calculating n-grams"):
        N_GRAMS = []
        file_name_prefix = str(step) + "_grams"
        counter_vectorizer = CountVectorizer(ngram_range=(step, step),
                                             lowercase=False)
        for file_path in tqdm(sorted(files), desc="File status"):
            xml = XMLProtocol()
            xml.read_xml(file_path)
            feature_year = xml.xml_tree.xpath("@sitzung-datum")[0][-4:]
            feature_month_year = xml.xml_tree.xpath("@sitzung-datum")[0][-7:]
            speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
            for speech in speeches:
                # gets id of current speech
                feature_rede_id = speech.xpath("@id")
                if(len(feature_rede_id) == 0):
                    feature_rede_id = "sitzungsbeginn"
                else:
                    feature_rede_id = feature_rede_id[0]
                # gets id of current speaker
                feature_redner_id = speech.xpath(".//redner/@id")[0]
                # gets speech text from tokenized or lemmatized protocol
                speech_text = speech.xpath("node()[2]")[0]  # gets second child of speech
                if(speech_text.text is not None):
                    tmp_str = speech_text.text

                ngrams = counter_vectorizer.build_analyzer()
                ngrams_list = ngrams(tmp_str)

                if(group_by_feature == "year"):
                    pairs = [(pair,) + (feature_year,) for pair
                             in ngrams_list]
                elif(group_by_feature == "month_year"):
                    pairs = [(pair,) + (feature_month_year,) for pair
                             in ngrams_list]
                elif(group_by_feature == "speaker"):
                    pairs = [(pair,) + (feature_redner_id,) for pair
                             in ngrams_list]
                elif(group_by_feature == "speech"):
                    pairs = [(pair,) + (feature_rede_id,) for pair
                             in ngrams_list]
                N_GRAMS.extend(pairs)
            speeches = None
        # Prefixes each line with the uppercased first character of the ngram
        # to sort and group by it; the prefix is stripped again when writing
        # the CSVs.
        print("Start counting ngrams.")
        N_GRAMS = Counter(N_GRAMS)
        print("Finished counting ngrams.")
        print("Start sorting ngrams.")
        N_GRAMS = [item[0][0][0].upper()
                   + "||"
                   + item[0][0]
                   + "||"
                   + str(item[0][1])
                   + "||"
                   + str(item[1])
                   for item in N_GRAMS.items()]
        N_GRAMS = sorted(N_GRAMS, key=locale.strxfrm)
        print("Finished sorting ngrams.")
        # Sorts all ngrams into groups: one group for each German uppercase
        # letter except ß, and one group for every decimal digit from 0 to 9.
        # Other non-ASCII or non-decimal ngrams are sorted into groups of
        # their own; these will be joined into one non-ASCII group later on.
        alphabetically = []
        tmp_list = []
        for letter, entries in tqdm(groupby(N_GRAMS, key=itemgetter(0)),
                                    desc="Grouping ngrams alphabetically"):
            if(letter):
                print(letter)
                for entry in entries:
                    tmp_list.append(entry)
                alphabetically.append(tmp_list)
                tmp_list = []
        N_GRAMS = None
        gc.collect()  # frees RAM
        key_list = ([i for i in range(10)]
                    + "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split()
                    + ["_Non_ASCII"])
        # groups all non-ASCII ngrams into one list to save them into one CSV
        if(len(alphabetically) > 37):
            joined_tail = alphabetically[36:]
            joined_tail = chain.from_iterable(list(joined_tail))
            del alphabetically[36:]
            alphabetically.append(joined_tail)
        # save groups to individual files
        for group, key in tqdm(zip(alphabetically, key_list),
                               desc="Writing ngrams to files"):
            group_ngrams = [entry.split("||")[1:] for entry in group]
            file_name = (str(key)
                         + "_"
                         + file_name_prefix
                         + "_per_"
                         + group_by_feature
                         + "_"
                         + input_type_name
                         + ".csv")
            file_output_path = os.path.join(output_path, file_name)
            with open(file_output_path, "w", newline="", encoding="utf8") as file:
                writer = csv.writer(file, delimiter="\t")
                writer.writerows(group_ngrams)
        alphabetically = None


if __name__ == '__main__':
    n_grams()
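As a quick illustration of the build_analyzer/Counter combination used above, with a made-up sentence and step = 2 (bigrams):

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

counter_vectorizer = CountVectorizer(ngram_range=(2, 2), lowercase=False)
ngrams = counter_vectorizer.build_analyzer()
# Pair every bigram with a grouping feature, here a hypothetical year.
pairs = [(ngram, "2013") for ngram in ngrams("die Würde des Menschen ist unantastbar")]
print(Counter(pairs))
# Counter({('die Würde', '2013'): 1, ('Würde des', '2013'): 1, ...})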
78
bundesdata_markup_nlp/nlp/tokenize.py
Executable file
@@ -0,0 +1,78 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import de_core_news_sm
import configparser
from utility.XMLProtocol import XMLProtocol
from lxml import etree
from tqdm import tqdm
import re


def tokenize(files, no_stop_words=False):
    """
    Tokenizes the speeches of the input XML protocols. Can include or exclude
    stop words. Tokenized speeches will be written into a new element
    <rede_tokenisiert>. Always removes punctuation. Joins hyphenated strings
    before they are tokenized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Tokenization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if(part.text is not None):
                    tmp_list.append(re.sub(r"_", " ", str(part.text + "\n")))
                    """
                    Replaces "_" with " ". This is needed because a string
                    like "Treffsicherheit einer Schrotflinte;_Sie haben
                    nämlich kaum den Punkt getroffen" will not be tokenized
                    correctly by spaCy: "Schrotflinte;_Sie" will be recognized
                    as one token. It also messes up the sorted n-gram
                    calculation. Additionally adds "\n" at the end of every
                    line to help identify hyphenated words.
                    """
                part.getparent().remove(part)
            new_text = "".join(tmp_list)
            new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[a-zßäüö])", r"\g<wordend>\g<wordstart>", new_text)
            """
            Joins hyphenated words together:
            'Länderfinanz- ausgleich' --> Länderfinanzausgleich.
            Better to do it here because most of the comments and metadata
            have already been marked. Ignores strings like 'Finanz-, Handels-
            und Sicherheitspolitik', but not when they occur at a line break.
            This is a rare occasion, though.
            """
            new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[A-ZÄÜÖ])", r"\g<wordend>-\g<wordstart>", new_text)
            """
            Removes all line breaks again. This way compound names with a
            line break in between, like "Sütterlin-\nWaack", will be
            recognized as one string by spaCy. --> Sütterlin-Waack
            """
            tokenized_speech = etree.Element("rede_tokenisiert")
            doc = nlp(new_text)
            if(no_stop_words is False):
                tokenized = " ".join([token.text for token in doc
                                      if token.pos_ != "PUNCT"])
                filename_sufix = "_tokenized_with_stopwords.xml"
            elif(no_stop_words is True):
                tokenized = " ".join([token.text for token in doc
                                      if token.is_stop is False
                                      and token.pos_ != "PUNCT"])
                filename_sufix = "_tokenized_without_stopwords.xml"
            tokenized_speech.text = tokenized
            speech.append(tokenized_speech)
        xml.save_to_file(output_path, file_path, "tokenized", "File paths",
                         "nlp_lemmatized_tokenized", filename_sufix)


if __name__ == '__main__':
    tokenize()
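A possible invocation of this module, reusing FileGetter from this commit; the input directory is a placeholder, and the sketch assumes the scripts are run from the bundesdata_markup_nlp directory with a config.ini providing the nlp_output path:

from utility.FileGetter import FileGetter
from nlp.tokenize import tokenize

files = FileGetter("/path/to/clear_speech_markup", "*.xml").get_files()
tokenize(files, no_stop_words=True)  # writes *_tokenized_without_stopwords.xml files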
0
bundesdata_markup_nlp/samples/__init__.py
Executable file
95
bundesdata_markup_nlp/samples/create_samples.py
Executable file
@@ -0,0 +1,95 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import fnmatch
import argparse
import random
import shutil

"""
This is just a quick script to get randomized samples from the protocols.
"""


def parse_arguments():
    """Argument Parser"""
    parser = argparse.ArgumentParser(description="Creates samples from a \
                                     given directory with a given size. The \
                                     samples do not overlap.")
    parser.add_argument("-p",
                        "--path",
                        help="Path to the data files to create samples from.",
                        required=True,
                        type=str,
                        metavar="")
    parser.add_argument("-s",
                        "--size",
                        help="Size of one sample.",
                        required=True,
                        type=int,
                        metavar="")
    parser.add_argument("-n", "--number_of_samples",
                        help="How many samples should be created?",
                        required=True,
                        type=int,
                        metavar="")
    parser.add_argument("-t",
                        "--file_type",
                        help="Which file types should be used as the base \
                              for the samples? Accepts wildcards.",
                        required=True,
                        type=str)
    args = parser.parse_args()
    return args


def get_files(path, file_type):
    """Creates a file list with full paths of all files in the given directory
    and its subdirectories and returns it."""
    list_of_files = []
    for path, subdirs, files in os.walk(path):
        for name in files:
            if fnmatch.fnmatch(name, file_type):
                list_of_files.append(os.path.join(path, name))
    return list_of_files


def get_files_to_copy(list_of_files, sample_size):
    """Gets random file paths from all file paths to create a sample out of
    those. File paths that have already been used are removed from the file
    list to create independent samples."""
    counter = 0
    sample_list = []
    while counter < sample_size:
        counter += 1
        random_index = random.randint(0, len(list_of_files)-1)
        sample_list.append(list_of_files[random_index])
        del list_of_files[random_index]
    return list_of_files, sample_list


def copy_files(path, sample_list, step_int):
    """Copies the given files to new directories."""
    sample_path = os.path.join(path, str(step_int))
    print(sample_path)
    os.mkdir(sample_path)
    for file in sample_list:
        shutil.copy2(file, sample_path)


def main():
    args = parse_arguments()
    path = args.path
    file_list = get_files(path, args.file_type)
    for step in range(1, args.number_of_samples + 1):
        # get_files_to_copy removes the drawn paths from file_list, so one
        # call per step is enough and the samples stay disjoint.
        file_list, sample_list = get_files_to_copy(file_list, args.size)
        copy_files(path, sample_list, step)


if __name__ == '__main__':
    main()
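With the arguments defined above, a hypothetical invocation such as `python samples/create_samples.py -p ../data/protocols -s 50 -n 2 -t "*.xml"` would draw two non-overlapping samples of 50 XML files each and copy them into the subfolders 1 and 2 of the given data directory (the path here is a placeholder).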
35
bundesdata_markup_nlp/utility/FileGetter.py
Executable file
@@ -0,0 +1,35 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import fnmatch

"""
This class is for getting the file paths of all files in a given directory.
Also gets files in subdirectories.
"""


class FileGetter(object):
    """
    Class for getting the file paths under a given path, which will be opened
    and/or further processed later on.
    """

    def __init__(self, path, file_type):
        super(FileGetter, self).__init__()
        self.path = path
        self.file_type = file_type

    def get_files(self):
        """
        Creates a file list with full paths of all files in the given
        directory and its subdirectories and returns it.
        """
        list_of_files = []
        for path, subdirs, files in os.walk(self.path):
            for name in files:
                if fnmatch.fnmatch(name, self.file_type):
                    list_of_files.append(os.path.join(path, name))
        self.list_of_files = list_of_files
        return list_of_files
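Usage mirrors the call sites elsewhere in this commit; the path is a placeholder:

from utility.FileGetter import FileGetter

files = FileGetter("/path/to/protocols", "*.xml")
file_list = files.get_files()  # recursive list of all matching file paths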
209
bundesdata_markup_nlp/utility/XMLProtocol.py
Executable file
@@ -0,0 +1,209 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utility import delete_folder
from utility import update_config
from xml.etree import ElementTree
from os import path
from lxml import etree
import os
import logging
import re


class XMLProtocol(object):
    """Class for standard operations on/with the XML protocols. Has functions
    for reading, saving and manipulating an XML protocol. All other classes
    inherit from this one.
    """

    def __init__(self):
        super().__init__()
        self.logger = logging.getLogger(__name__)

    def read_protcol(self, file_path):
        """
        Takes a file path, parses the file as XML and returns the root
        element.
        """
        self.file_path = file_path
        self.filename = os.path.basename(self.file_path)
        parser = etree.XMLParser(remove_blank_text=True)
        self.tree = etree.parse(file_path, parser)  # for better xml indentation
        root = self.tree.getroot()
        self.logger.info("File successfully parsed as XML.")
        return root

    def read_xml(self, file_path):
        """Takes a file path and parses the file as XML."""
        parser = etree.XMLParser(encoding='utf-8', remove_blank_text=True)
        tree = etree.parse(file_path, parser)  # for better xml indentation
        self.xml_tree = tree.getroot()

    def save_to_file(self, output_path, file_path, subfolder, config_section,
                     config_key, filename_sufix=""):
        """
        Writes the new markup to a new XML file. Takes the output path and
        creates a new folder there. Also updates the config file with the new
        path.
        """
        if(filename_sufix == ""):
            self.filename = path.basename(file_path)
        elif(filename_sufix != ""):
            self.filename = path.basename(file_path)[:-4] + filename_sufix
        save_path = os.path.join(output_path, subfolder)
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        tree = etree.ElementTree(self.xml_tree)
        new_filename = self.filename
        save_file_path = os.path.join(save_path, new_filename)
        tree.write(save_file_path,
                   pretty_print=True,
                   xml_declaration=True,
                   encoding="utf8",
                   doctype="<!DOCTYPE dbtplenarprotokoll SYSTEM 'dbtplenarprotokoll_minimal.dtd'>")
        self.logger.info("New XML saved to: " + save_file_path)
        update_config.update_config("config.ini", config_section, config_key,
                                    save_path)

    def beautify_xml_part(self, file_path, xpath, alter_lines=False,
                          line_width=80):
        """
        Beautifies part (element node) of an input XML.
        """
        tmp_path = os.path.join(os.path.dirname(file_path), "tmp")
        tree = etree.ElementTree(self.xml_tree)
        self.beautified_part = tree.find(xpath)
        self.beautified_part = ElementTree.tostring(self.beautified_part)
        self.beautified_part = etree.fromstring(self.beautified_part)
        self.beautified_part = etree.ElementTree(self.beautified_part)
        if not os.path.exists(tmp_path):
            os.mkdir(tmp_path)
        tmp_file_path = os.path.join(tmp_path, "tmp.xml")
        self.beautified_part.write(tmp_file_path,
                                   pretty_print=True,
                                   xml_declaration=True,
                                   encoding="utf8")
        if(alter_lines is True):
            os.system("html-beautify -r -q -w {} --no-preserve-newlines {}".format(line_width, tmp_file_path))
            self.beautified_part = etree.parse(tmp_file_path).getroot()
        elif(alter_lines is False):
            os.system("html-beautify -r -q {}".format(tmp_file_path))
            self.beautified_part = etree.parse(tmp_file_path).getroot()
        update_config.update_config("config.ini", "File paths", "tmp_path",
                                    tmp_path)
        delete_folder.delete_folder(tmp_path)

    def beautify_xml(self, file_path, alter_lines=False, line_width=80):
        if(alter_lines is True):
            os.system("html-beautify -r -q -w {} --no-preserve-newlines {}".format(line_width, file_path))
        elif(alter_lines is False):
            os.system("html-beautify -r -q {}".format(file_path))

    def expand_element(self, element_to_expand, expand_attr_key,
                       expand_attr_value, check_child=True):
        """
        This function takes an XPath expression for an XML element.
        The tag of this element will be expanded with the given
        expand_attr_key and expand_attr_value. Also needs a regex (set via
        compile_regex) to determine whether the currently selected element is
        one that should be expanded. For this, the text of the first child of
        the current element is checked against the given regex. By default
        the child element text is checked; set check_child to False to skip
        this check and just expand the current element.
        """
        elements = self.xml_tree.findall(element_to_expand)
        for element in elements:
            if(check_child is True):
                first_child = element.getchildren()[0]
                match = self.regex_compiled.search(first_child.text)
                if(match):
                    element.set(expand_attr_key, expand_attr_value)
            else:
                element.set(expand_attr_key, expand_attr_value)

    def replace_tag_name(self, element_to_replace, tag_name, check_child=True):
        """
        Replaces a given element tag name (as XPath) with a new tag name.
        """
        elements = self.xml_tree.findall(element_to_replace)
        for element in elements:
            if(check_child is True):
                first_child = element.getchildren()[0]
                match = self.regex_compiled.search(first_child.text)
                if(match):
                    element.tag = tag_name
            else:
                element.tag = tag_name

    def replace_tag_attr(self, element_to_replace, tag_name, attr_key,
                         attr_value, check_child=True):
        """
        Replaces the tag name of a given element (as XPath) with a new name
        and adds an attribute. Can also check whether the child of the
        current element contains some specific text, like in the
        expand_element function.
        """
        elements = self.xml_tree.findall(element_to_replace)
        for element in elements:
            if(check_child is True):
                first_child = element.getchildren()[0]
                match = self.regex_compiled.search(first_child.text)
                if(match):
                    element.tag = tag_name
                    element.set(attr_key, attr_value)
            else:
                element.tag = tag_name
                element.set(attr_key, attr_value)

    def replace_elements(self, elements_to_replace, replacment_elements,
                         keep_parent_text=False):
        """
        Replaces elements identified by XPath with new elements. Can either
        keep the text of the parent element or not.
        """
        elements = self.xml_tree.findall(elements_to_replace)
        parents_text_xpath = elements_to_replace + "/" + "parent::node()" + "/" + "text()"
        elements_text = self.xml_tree.xpath(parents_text_xpath)
        if(len(elements) == len(replacment_elements)):
            if(keep_parent_text is False):
                for element, replacement_element in zip(elements, replacment_elements):
                    element.getparent().replace(element, replacement_element)
            else:
                for element, replacement_element in zip(elements, replacment_elements):
                    element.getparent().replace(element, replacement_element)
                elements = self.xml_tree.findall(elements_to_replace)
                for element, text in zip(elements, elements_text):
                    element.tail = text
        else:
            self.logger.warning(("Element mismatch. There are "
                                 + str(len(elements))
                                 + " elements that should be replaced, but "
                                 + str(len(replacment_elements))
                                 + " replacement elements are present."
                                 + " No elements have been replaced."))

    def compile_regex(self, regex):
        """
        Takes the input regex string and compiles it for better performance
        and readability.
        """
        self.regex_string = regex
        self.regex_compiled = re.compile(self.regex_string, re.MULTILINE)

    def clean_text(self, regex, xpath, replacement_string=""):
        """
        Replaces regex matches with nothing by default, or with a replacement
        string, for every element matched by the XPath in the xml_tree. Works
        with match groups.
        """
        elements = self.xml_tree.xpath(xpath)
        for element in elements:
            replaced = re.sub(regex, replacement_string, element.text)
            element.text = replaced
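A minimal sketch of the typical call sequence, modeled on how the markup and NLP scripts use this class; the file paths, regex, and config key are placeholders, and the last call assumes a config.ini with a "File paths" section is present:

from utility.XMLProtocol import XMLProtocol

xml = XMLProtocol()
xml.read_xml("/path/to/protocol.xml")
# Example: strip trailing whitespace from every <p> that has text.
xml.clean_text(r"\s+$", ".//p[text()]")
xml.save_to_file("/path/to/output", "/path/to/protocol.xml", "cleaned",
                 "File paths", "clear_speech_markup")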
0
bundesdata_markup_nlp/utility/__init__.py
Executable file
Binary files not shown.
15
bundesdata_markup_nlp/utility/delete_folder.py
Executable file
@@ -0,0 +1,15 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import shutil


def delete_folder(folder_path):
    """
    Deletes the folder identified by the input folder path string.
    """
    shutil.rmtree(folder_path)


if __name__ == '__main__':
    delete_folder()
22
bundesdata_markup_nlp/utility/move_ngrams.py
Executable file
@@ -0,0 +1,22 @@
import os

"""
Helper script to move the n-gram CSVs to separate folders. Just copy this into
the folder containing the n-grams and execute it. Change n to the number of N
in N-grams.
"""
current_path = os.getcwd()
files = []
n = 5
for file in os.listdir(current_path):
    if file.endswith(".csv"):
        files.append(file)
files = sorted(files)

dir_list = ["1_grams", "2_grams", "3_grams", "4_grams", "5_grams"][:n]
for dir in dir_list:
    os.system("mkdir {}".format(dir))

# The sorted file names cycle through the n-gram sizes for every letter key,
# so every n-th file belongs to the same n-gram size.
for step, dir in zip(range(0, n), dir_list):
    for file in files[step::n]:
        print(file)
        os.system("mv {} {}".format(file, dir))
21
bundesdata_markup_nlp/utility/update_config.py
Executable file
@@ -0,0 +1,21 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import configparser


def update_config(file_name, section, key, value):
    """
    Updates a config file identified by file_name. Updates the value of one
    key in a specific section.
    """
    config = configparser.ConfigParser()
    config.read(file_name)
    config.set(section, key, value)
    with open(file_name, "w") as file:
        config.write(file)


if __name__ == '__main__':
    update_config()
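This is how the other scripts persist newly created output folders, for example (the value here is a placeholder):

from utility.update_config import update_config

update_config("config.ini", "File paths", "nlp_output", "/path/to/nlp_output")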
BIN
docs/BT-PP_DTD_kommentiert_20150519.pdf
Executable file
Binary file not shown.
5
docs/metadaten.md
Executable file
@@ -0,0 +1,5 @@
# Metadata

Source of the structure definition: https://www.bundestag.de/blob/577234/f9159cee3e045cbc37dcd6de6322fcdd/dbtplenarprotokoll_kommentiert-data.pdf
Downloaded on: 06.11.2018
7
requirements.txt
Executable file
@@ -0,0 +1,7 @@
# Bundesdata
lxml==4.2.5
Babel==2.6.0
tqdm==4.28.1
spacy==2.0.18
https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.0.0/de_core_news_sm-2.0.0.tar.gz
scikit-learn[alldeps]==0.20.2