bundesdata_markup_nlp_software/bundesdata_markup_nlp/bundesdata_nlp.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import configparser
import os
import logging
from utility.FileGetter import FileGetter
from utility import update_config
from utility import delete_folder
from markup import beautify_markup
from nlp import tokenize, lemmatization, n_grams
from datetime import datetime

"""
This script handles the tokenization, lemmatization and ngramm calculation of
the input protocols. Needs some user input specfied int parse_arguments().
"""

def parse_arguments(argv=None):
    """
    Builds the command line parser for the nlp script and parses the
    arguments.

    Args:
        argv: Optional list of argument strings to parse. If None (the
            default) argparse falls back to sys.argv[1:], which is the
            behaviour existing callers rely on.

    Returns:
        An argparse.Namespace with the attributes set_paths, fresh_run,
        skip_beautify_xml, no_stop_words, lemmatize, tokenize and
        calculate_n_grams. The three last-named options are mutually
        exclusive.
    """
    parser = argparse.ArgumentParser(
        description="Starts the nlp analysis of the newly created "
                    "XML-protocols")
    parser.add_argument("-sp",
                        "--set_paths",
                        nargs=2,
                        help="User can set the input and output paths for the "
                             "files created during the nlp process. The paths "
                             "will be written to the config file.",
                        required=False,
                        type=str,
                        metavar=("input_path", "output_path"))
    parser.add_argument("-fr",
                        "--fresh_run",
                        help="Deletes all temporary folders and output folders "
                             "created during a previous nlp run before this "
                             "one starts.",
                        action="store_true",
                        required=False)
    parser.add_argument("-sb",
                        "--skip_beautify_xml",
                        help="Skips the script creating beautiful xml files.",
                        action="store_true",
                        required=False)
    parser.add_argument("-ns",
                        "--no_stop_words",
                        help="If this is used the lemmatization or "
                             "tokenization of the input protocols will exclude "
                             "stop words.",
                        required=False,
                        action="store_true")
    # Only one of the processing modes may be selected per run.
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument("-lm",
                       "--lemmatize",
                       help="Lemmatizes the XML protocols in the input "
                            "directory and saves them into the output "
                            "directory.",
                       action="store_true",
                       required=False)
    group.add_argument("-tn",
                       "--tokenize",
                       help="Tokenizes the XML protocols in the input "
                            "directory and saves them into the output "
                            "directory.",
                       action="store_true",
                       required=False)
    group.add_argument("-cn",
                       "--calculate_n_grams",
                       nargs=2,
                       help="Calculates n_grams for any tokenized or "
                            "lemmatized XML protocol created by this script. "
                            "feature_to_group_n_grams_by can be set to the "
                            "following: 'year','month_year', 'speaker' or "
                            "'speech'.",
                       required=False,
                       type=str,
                       metavar=("feature_to_group_n_grams_by",
                                "input_type_name"))
    return parser.parse_args(argv)


def main():
    """
    Runs the nlp step selected on the command line.

    Sets up logging to logs/bundesdata_nlp.log, parses the command line
    arguments, reads config.ini, resolves the input and output paths and then
    starts lemmatization, tokenization or the n-gram calculation for all
    *.xml files found in the input path. After lemmatization or tokenization
    the xml files are beautified unless --skip_beautify_xml is set. Start
    time, end time and duration are printed and logged.
    """
    # logging and start time
    # basicConfig cannot open the log file if its folder is missing
    os.makedirs("logs", exist_ok=True)
    logging.basicConfig(filename="logs/bundesdata_nlp.log", level=logging.INFO,
                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
                        datefmt='%Y/%m/%d %H:%M:%S',
                        filemode="w")
    logger = logging.getLogger(__name__)
    start_time = datetime.now()
    print("Start time of script is:", start_time)
    print("Info and status about the nlp process can be found in:",
          "logs/bundesdata_nlp.log")
    logger.info("Start time of script is: %s", start_time)
    # get arguments
    args = parse_arguments()
    # reads config
    config = configparser.ConfigParser()
    config.read("config.ini")
    # if fresh_run is true directory nlp_output will be deleted and every
    # other option of the section "File paths" is removed from the config
    # NOTE: this also removes nlp_input, so a fresh run without --set_paths
    # fails at the nlp_input lookup further below.
    if args.fresh_run:
        for name, value in config.items("File paths"):
            if name == "nlp_output":
                try:
                    delete_folder.delete_folder(value)
                except FileNotFoundError:
                    # nothing to delete, e.g. no previous run
                    logger.info("Output folder %s does not exist, "
                                "nothing to delete.", value)
            else:
                config.remove_option("File paths", name)
        with open("config.ini", 'w') as out:
            config.write(out)

    # create output folder if it does not exist and writes path to config
    if args.set_paths:
        output_path = os.path.join(args.set_paths[1], "nlp_output")
        # makedirs also creates a missing parent folder given by the user
        os.makedirs(output_path, exist_ok=True)
        update_config.update_config("config.ini", "File paths",
                                    "nlp_output", output_path)
    else:
        output_path = config["File paths"]["nlp_output"]
        if not os.path.exists(output_path):
            os.makedirs(output_path, exist_ok=True)
            update_config.update_config("config.ini", "File paths",
                                        "nlp_output", output_path)
    # gets file_path list of input files and writes input folder path to config
    if args.set_paths:
        input_path = args.set_paths[0]
        update_config.update_config("config.ini", "File paths",
                                    "nlp_input", input_path)
    elif args.calculate_n_grams:
        input_path = config["File paths"]["nlp_beuatiful_xml"]
    else:
        input_path = config["File paths"]["nlp_input"]
    files = FileGetter(input_path, "*.xml").get_files()

    # if statements deciding which script will be executed
    if args.lemmatize:
        if args.no_stop_words:
            print("Starting lemmatization excluding stop words.")
            lemmatization.lemmatization(files, True)
            print("Finished lemmatization excluding stop words.")
        else:
            print("Starting lemmatization including stop words.")
            lemmatization.lemmatization(files)
            print("Finished lemmatization including stop words.")

    if args.tokenize:
        if args.no_stop_words:
            print("Starting tokenization excluding stop words.")
            tokenize.tokenize(files, True)
            print("Finished tokenization excluding stop words.")
        else:
            print("Starting tokenization including stop words.")
            tokenize.tokenize(files)
            print("Finished tokenization including stop words.")

    if args.calculate_n_grams:
        feature, input_type_name = args.calculate_n_grams
        print("Starting calculation of n-grams for input files.")
        n_grams.n_grams(files, feature, input_type_name)
        print("Finished calculation of n-grams for input files.")

    # The parentheses matter: "and" binds tighter than "or", so without them
    # --skip_beautify_xml would be ignored whenever --tokenize is set.
    if not args.skip_beautify_xml and (args.lemmatize or args.tokenize):
        print("Starting to prettyfy the xmls.")
        beautify_markup.beautify_xml("nlp", True, 80)
        print("Prettyfied the xmls.")
    elif args.skip_beautify_xml:
        print("Skipping script beautify_markup.py.")

    end_time = datetime.now()
    print("End time of script is:", str(end_time))
    logger.info("End time of script is: %s", end_time)
    duration = end_time - start_time
    print("Duration of script is:", duration)
    logger.info("Script duration is: %s", duration)

# Run the nlp pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()