#!/usr/bin/env python # -*- coding: utf-8 -*- import argparse import configparser import os import logging from utility.FileGetter import FileGetter from utility import update_config from utility import delete_folder from markup import beautify_markup from nlp import tokenize, lemmatization, n_grams from datetime import datetime """ This script handles the tokenization, lemmatization and ngramm calculation of the input protocols. Needs some user input specfied int parse_arguments(). """ def parse_arguments(): """ Argument Parser """ parser = argparse.ArgumentParser(description="Starts the nlp analysis of \ the newly created XML-protocols") parser.add_argument("-sp", "--set_paths", nargs=2, help="User can set the input and output paths for the \ files created during the nlp process. The paths will be\ written to the config file.", required=False, type=str, metavar=("input_path", "output_path")) parser.add_argument("-fr", "--fresh_run", help="Deltes all temporary folders and output folders \ created during a previously nlp run before this one \ starts.", action="store_true", required=False) parser.add_argument("-sb", "--skip_beautify_xml", help="Skips the script creating beautiful xml files.", action="store_true", required=False) parser.add_argument("-ns", "--no_stop_words", help="If this is used the lemmatization or tokenization\ of the input protocols will exculde stop words.", required=False, action="store_true") group = parser.add_mutually_exclusive_group(required=False) group.add_argument("-lm", "--lemmatize", help="Lemmatizes the XML protocols in the input directory\ and saves them into the output directory.", action="store_true", required=False) group.add_argument("-tn", "--tokenize", help="Tokenizes the XML protocols in the input directory\ and saves them into the output directory.", action="store_true", required=False) group.add_argument("-cn", "--calculate_n_grams", nargs=2, help="Calculates n_grams for any tokenized or leammtized\ XML protocol created by this script. \ feature_to_group_n_grams_by can be set to the following:\ 'year','month_year', 'speaker' or 'speech'.", required=False, type=str, metavar=("feature_to_group_n_grams_by", "input_type_name")) args = parser.parse_args() return args def main(): # logging and start time logging.basicConfig(filename="logs/bundesdata_nlp.log", level=logging.INFO, format="%(asctime)s %(name)s %(levelname)s:%(message)s", datefmt='%Y/%m/%d %H:%M:%S', filemode="w") logger = logging.getLogger(__name__) start_time = datetime.now() print("Start time of script is:", start_time) print("Info and status about the nlp process can be found in:", "logs/bundesdata_nlp.log") logger.info("Start time of script is: " + str(start_time)) # get arguments args = parse_arguments() # reads config config = configparser.ConfigParser() config.read("config.ini") # if fresh_run is true directory nlp_output will be deleted if(args.fresh_run is True): config = configparser.ConfigParser() config.read("config.ini") options = config.items("File paths") for option in options: if(option[0] == "nlp_output"): try: delete_folder.delete_folder(option[1]) except FileNotFoundError: pass else: config.remove_option("File paths", option[0]) with open("config.ini", 'w') as out: config.write(out) # create outputfolder if it does not exists and wirtes path to config if(args.set_paths): output_path = os.path.join(args.set_paths[1], "nlp_output") if not os.path.exists(output_path): os.mkdir(output_path) update_config.update_config("config.ini", "File paths", "nlp_output", output_path) else: output_path = config["File paths"]["nlp_output"] if not os.path.exists(output_path): os.mkdir(output_path) update_config.update_config("config.ini", "File paths", "nlp_output", output_path) # gets file_path list of input files and wirtes inputfolder path to config if(args.set_paths): input_path = args.set_paths[0] update_config.update_config("config.ini", "File paths", "nlp_input", input_path) elif(args.calculate_n_grams): input_path = config["File paths"]["nlp_beuatiful_xml"] else: input_path = config["File paths"]["nlp_input"] files = FileGetter(input_path, "*.xml") files = files.get_files() # if statements deciding which script will be executed if(args.lemmatize is True and args.no_stop_words is True): print("Starting lemmatization excluding stop words.") lemmatization.lemmatization(files, True) print("Finished lemmatization excluding stop words.") elif(args.lemmatize is True and args.no_stop_words is False): print("Starting lemmatization including stop words.") lemmatization.lemmatization(files) print("Finished lemmatization including stop words.") if(args.tokenize is True and args.no_stop_words is True): print("Starting tokenization excluding stop words.") tokenize.tokenize(files, True) print("Finished tokenization excluding stop words.") elif(args.tokenize is True and args.no_stop_words is False): print("Starting tokenization including stop words.") tokenize.tokenize(files) print("Finished tokenization including stop words.") if(args.calculate_n_grams): print("Starting calculation of n-grams for input files.") n_grams.n_grams(files, args.calculate_n_grams[0], args.calculate_n_grams[1]) print("Finished calculation of n-grams for input files.") if(args.skip_beautify_xml is not True and args.lemmatize is True or args.tokenize is True): print("Starting to prettyfy the xmls.") beautify_markup.beautify_xml("nlp", True, 80) print("Prettyfied the xmls.") elif(args.skip_beautify_xml is True): print("Skipping script beautify_markup.py.") end_time = datetime.now() print("End time of script is:", str(end_time)) logger.info("End time of script is: " + str(end_time)) duration = end_time - start_time print("Duration of script is:", duration) logger.info("Script duration is: " + str(duration)) if __name__ == '__main__': main()