bundesdata_markup_nlp_software/bundesdata_markup_nlp/bundesdata_nlp.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import configparser
import os
import logging
from utility.FileGetter import FileGetter
from utility import update_config
from utility import delete_folder
from markup import beautify_markup
from nlp import tokenize, lemmatization, n_grams
from datetime import datetime
"""
This script handles the tokenization, lemmatization and n-gram calculation of
the input protocols. It needs some user input specified in parse_arguments().
"""
def parse_arguments():
"""
Argument Parser
"""
parser = argparse.ArgumentParser(description="Starts the nlp analysis of \
the newly created XML protocols")
parser.add_argument("-sp",
"--set_paths",
nargs=2,
help="User can set the input and output paths for the \
files created during the nlp process. The paths will be \
written to the config file.",
required=False,
type=str,
metavar=("input_path", "output_path"))
parser.add_argument("-fr",
"--fresh_run",
help="Deltes all temporary folders and output folders \
created during a previously nlp run before this one \
starts.",
action="store_true",
required=False)
parser.add_argument("-sb",
"--skip_beautify_xml",
help="Skips the script creating beautiful xml files.",
action="store_true",
required=False)
parser.add_argument("-ns",
"--no_stop_words",
help="If this is used the lemmatization or tokenization\
of the input protocols will exculde stop words.",
required=False,
action="store_true")
group = parser.add_mutually_exclusive_group(required=False)
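# Only one of --lemmatize, --tokenize and --calculate_n_grams can be used per run.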
group.add_argument("-lm",
"--lemmatize",
help="Lemmatizes the XML protocols in the input directory\
and saves them into the output directory.",
action="store_true",
required=False)
group.add_argument("-tn",
"--tokenize",
help="Tokenizes the XML protocols in the input directory\
and saves them into the output directory.",
action="store_true",
required=False)
group.add_argument("-cn",
"--calculate_n_grams",
nargs=2,
help="Calculates n_grams for any tokenized or leammtized\
XML protocol created by this script. \
feature_to_group_n_grams_by can be set to the following:\
'year','month_year', 'speaker' or 'speech'.",
required=False,
type=str,
metavar=("feature_to_group_n_grams_by", "input_type_name"))
args = parser.parse_args()
return args
def main():
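"""Runs the selected nlp steps: optional cleanup, path setup, lemmatization,
tokenization or n-gram calculation, and optional XML beautification."""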
# logging and start time
logging.basicConfig(filename="logs/bundesdata_nlp.log", level=logging.INFO,
format="%(asctime)s %(name)s %(levelname)s:%(message)s",
datefmt='%Y/%m/%d %H:%M:%S',
filemode="w")
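# Note: logging.basicConfig does not create the logs/ directory, so it has to
# exist before the script is started.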
logger = logging.getLogger(__name__)
start_time = datetime.now()
print("Start time of script is:", start_time)
print("Info and status about the nlp process can be found in:",
"logs/bundesdata_nlp.log")
logger.info("Start time of script is: " + str(start_time))
# get arguments
args = parse_arguments()
# reads config
config = configparser.ConfigParser()
config.read("config.ini")
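# config.ini is resolved relative to the current working directory, so the
# script is expected to be started from the directory containing it.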
# if fresh_run is true the nlp_output directory is deleted and the other file path entries are removed from the config
if(args.fresh_run is True):
config = configparser.ConfigParser()
config.read("config.ini")
options = config.items("File paths")
for option in options:
if(option[0] == "nlp_output"):
try:
delete_folder.delete_folder(option[1])
except FileNotFoundError:
pass
else:
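# remove the remaining file path entries from the config for a clean start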
config.remove_option("File paths", option[0])
with open("config.ini", 'w') as out:
config.write(out)
# creates the output folder if it does not exist and writes its path to the config
if(args.set_paths):
output_path = os.path.join(args.set_paths[1], "nlp_output")
if not os.path.exists(output_path):
os.mkdir(output_path)
update_config.update_config("config.ini", "File paths",
"nlp_output", output_path)
else:
output_path = config["File paths"]["nlp_output"]
if not os.path.exists(output_path):
os.mkdir(output_path)
update_config.update_config("config.ini", "File paths",
"nlp_output", output_path)
# gets the file path list of the input files and writes the input folder path to the config
if(args.set_paths):
input_path = args.set_paths[0]
update_config.update_config("config.ini", "File paths",
"nlp_input", input_path)
elif(args.calculate_n_grams):
input_path = config["File paths"]["nlp_beuatiful_xml"]
else:
input_path = config["File paths"]["nlp_input"]
files = FileGetter(input_path, "*.xml")
files = files.get_files()
# if statements deciding which script will be executed
if(args.lemmatize is True and args.no_stop_words is True):
print("Starting lemmatization excluding stop words.")
lemmatization.lemmatization(files, True)
print("Finished lemmatization excluding stop words.")
elif(args.lemmatize is True and args.no_stop_words is False):
print("Starting lemmatization including stop words.")
lemmatization.lemmatization(files)
print("Finished lemmatization including stop words.")
if(args.tokenize is True and args.no_stop_words is True):
print("Starting tokenization excluding stop words.")
tokenize.tokenize(files, True)
print("Finished tokenization excluding stop words.")
elif(args.tokenize is True and args.no_stop_words is False):
print("Starting tokenization including stop words.")
tokenize.tokenize(files)
print("Finished tokenization including stop words.")
if(args.calculate_n_grams):
print("Starting calculation of n-grams for input files.")
n_grams.n_grams(files, args.calculate_n_grams[0], args.calculate_n_grams[1])
print("Finished calculation of n-grams for input files.")
if(args.skip_beautify_xml is not True
and (args.lemmatize is True or args.tokenize is True)):
print("Starting to prettyfy the xmls.")
beautify_markup.beautify_xml("nlp", True, 80)
print("Prettyfied the xmls.")
elif(args.skip_beautify_xml is True):
print("Skipping script beautify_markup.py.")
end_time = datetime.now()
print("End time of script is:", str(end_time))
logger.info("End time of script is: " + str(end_time))
duration = end_time - start_time
print("Duration of script is:", duration)
logger.info("Script duration is: " + str(duration))
if __name__ == '__main__':
main()