2019-02-21 19:29:44 +01:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import configparser
|
|
|
|
import os
|
|
|
|
import logging
|
|
|
|
from utility.FileGetter import FileGetter
|
|
|
|
from utility import update_config
|
|
|
|
from utility import delete_folder
|
|
|
|
from markup import beautify_markup
|
|
|
|
from nlp import tokenize, lemmatization, n_grams
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
"""
|
|
|
|
This script handles the tokenization, lemmatization and n-gram calculation of
the input protocols. Needs some user input specified in parse_arguments().
|
|
|
|
"""
|
|
|
|
|
2019-03-03 18:41:12 +01:00
|
|
|
|
2019-02-21 19:29:44 +01:00
|
|
|
def parse_arguments(argv=None):
    """
    Build the command line parser and parse the given arguments.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse falls back to ``sys.argv[1:]``, so existing callers
            that pass nothing behave as before.

    Returns:
        argparse.Namespace with the attributes ``set_paths``, ``fresh_run``,
        ``skip_beautify_xml``, ``no_stop_words``, ``lemmatize``, ``tokenize``
        and ``calculate_n_grams``. ``set_paths`` and ``calculate_n_grams``
        are two-element lists of strings when given and None otherwise; the
        remaining attributes are booleans.

    Raises:
        SystemExit: Raised by argparse on invalid input, e.g. when more than
            one of --lemmatize, --tokenize and --calculate_n_grams is given.
    """
    # Help texts use implicit string concatenation instead of backslash
    # continuations so that no source indentation leaks into the strings.
    parser = argparse.ArgumentParser(
        description="Starts the nlp analysis of the newly created "
                    "XML-protocols")
    parser.add_argument("-sp",
                        "--set_paths",
                        nargs=2,
                        help="User can set the input and output paths for "
                             "the files created during the nlp process. The "
                             "paths will be written to the config file.",
                        required=False,
                        type=str,
                        metavar=("input_path", "output_path"))
    parser.add_argument("-fr",
                        "--fresh_run",
                        help="Deletes all temporary folders and output "
                             "folders created during a previous nlp run "
                             "before this one starts.",
                        action="store_true",
                        required=False)
    parser.add_argument("-sb",
                        "--skip_beautify_xml",
                        help="Skips the script creating beautiful xml files.",
                        action="store_true",
                        required=False)
    parser.add_argument("-ns",
                        "--no_stop_words",
                        help="If this is used the lemmatization or "
                             "tokenization of the input protocols will "
                             "exclude stop words.",
                        required=False,
                        action="store_true")

    # Only one processing mode may be selected per invocation.
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument("-lm",
                       "--lemmatize",
                       help="Lemmatizes the XML protocols in the input "
                            "directory and saves them into the output "
                            "directory.",
                       action="store_true",
                       required=False)
    group.add_argument("-tn",
                       "--tokenize",
                       help="Tokenizes the XML protocols in the input "
                            "directory and saves them into the output "
                            "directory.",
                       action="store_true",
                       required=False)
    group.add_argument("-cn",
                       "--calculate_n_grams",
                       nargs=2,
                       help="Calculates n_grams for any tokenized or "
                            "lemmatized XML protocol created by this script. "
                            "feature_to_group_n_grams_by can be set to the "
                            "following: 'year','month_year', 'speaker' or "
                            "'speech'.",
                       required=False,
                       type=str,
                       metavar=("feature_to_group_n_grams_by",
                                "input_type_name"))
    return parser.parse_args(argv)
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """
    Run the nlp pipeline selected on the command line.

    Steps, in order:
        1. Configure logging to "logs/bundesdata_nlp.log" and record the
           start time.
        2. Parse the command line via parse_arguments() and read
           "config.ini".
        3. On --fresh_run, delete the folder stored under "nlp_output" and
           drop the other entries of the "File paths" section.
        4. Resolve and create the output folder and persist its path.
        5. Resolve the input folder and collect its "*.xml" files.
        6. Run lemmatization, tokenization or n-gram calculation.
        7. Beautify the resulting XML files unless this is skipped.
        8. Print and log the end time and the duration.

    Raises:
        KeyError: If a required path is neither given via --set_paths nor
            present in the "File paths" section of "config.ini".
    """
    # logging and start time
    # basicConfig cannot create missing directories for its log file.
    os.makedirs("logs", exist_ok=True)
    logging.basicConfig(filename="logs/bundesdata_nlp.log", level=logging.INFO,
                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
                        datefmt='%Y/%m/%d %H:%M:%S',
                        filemode="w")
    logger = logging.getLogger(__name__)
    start_time = datetime.now()
    print("Start time of script is:", start_time)
    print("Info and status about the nlp process can be found in:",
          "logs/bundesdata_nlp.log")
    logger.info("Start time of script is: " + str(start_time))

    # get arguments
    args = parse_arguments()

    # reads config
    config = configparser.ConfigParser()
    config.read("config.ini")

    # if fresh_run is set the nlp_output directory is deleted and every
    # other entry of the "File paths" section is removed from the config
    # NOTE(review): the available copy of this file lost its indentation, so
    # the else branch below is assumed to belong to the if statement and not
    # to the try statement - confirm against the repository history.
    if args.fresh_run:
        for name, value in config.items("File paths"):
            if name == "nlp_output":
                try:
                    delete_folder.delete_folder(value)
                except FileNotFoundError:
                    # Nothing to clean up if the folder never existed.
                    pass
            else:
                config.remove_option("File paths", name)
        with open("config.ini", 'w') as out:
            config.write(out)

    # creates the output folder if it does not exist and writes its path
    # to the config
    if args.set_paths:
        output_path = os.path.join(args.set_paths[1], "nlp_output")
    else:
        output_path = config["File paths"]["nlp_output"]
    # makedirs also creates missing parent folders and tolerates an
    # already existing folder.
    os.makedirs(output_path, exist_ok=True)
    # Always persisted, so a user supplied path is stored even when the
    # folder already existed.
    update_config.update_config("config.ini", "File paths",
                                "nlp_output", output_path)

    # gets the list of input files and writes the input folder path to
    # the config
    if args.set_paths:
        input_path = args.set_paths[0]
        update_config.update_config("config.ini", "File paths",
                                    "nlp_input", input_path)
    elif args.calculate_n_grams:
        # The spelling of this key is part of the config contract and is
        # therefore left untouched.
        input_path = config["File paths"]["nlp_beuatiful_xml"]
    else:
        input_path = config["File paths"]["nlp_input"]
    files = FileGetter(input_path, "*.xml").get_files()

    # decides which processing step will be executed; the parser makes
    # lemmatize, tokenize and calculate_n_grams mutually exclusive
    if args.lemmatize:
        if args.no_stop_words:
            print("Starting lemmatization excluding stop words.")
            lemmatization.lemmatization(files, True)
            print("Finished lemmatization excluding stop words.")
        else:
            print("Starting lemmatization including stop words.")
            lemmatization.lemmatization(files)
            print("Finished lemmatization including stop words.")

    if args.tokenize:
        if args.no_stop_words:
            print("Starting tokenization excluding stop words.")
            tokenize.tokenize(files, True)
            print("Finished tokenization excluding stop words.")
        else:
            print("Starting tokenization including stop words.")
            tokenize.tokenize(files)
            print("Finished tokenization including stop words.")

    if args.calculate_n_grams:
        print("Starting calculation of n-grams for input files.")
        n_grams.n_grams(files, args.calculate_n_grams[0],
                        args.calculate_n_grams[1])
        print("Finished calculation of n-grams for input files.")

    # The parentheses matter: without them "and" binds tighter than "or"
    # and --tokenize would trigger beautifying even with --skip_beautify_xml.
    if not args.skip_beautify_xml and (args.lemmatize or args.tokenize):
        print("Starting to prettyfy the xmls.")
        beautify_markup.beautify_xml("nlp", True, 80)
        print("Prettyfied the xmls.")
    elif args.skip_beautify_xml:
        print("Skipping script beautify_markup.py.")

    end_time = datetime.now()
    print("End time of script is:", str(end_time))
    logger.info("End time of script is: " + str(end_time))
    duration = end_time - start_time
    print("Duration of script is:", duration)
    logger.info("Script duration is: " + str(duration))
|
|
|
|
|
|
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if "__main__" == __name__:
    main()
|