bundesdata_markup_nlp_software/bundesdata_markup_nlp/bundesdata_markup.py

215 lines
8.9 KiB
Python
Raw Normal View History

2019-02-21 18:29:44 +00:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from markup import metadata, speakers, speaker_names, speeches
from utility import update_config
from markup import beautify_markup
from utility import delete_folder
import argparse
import time
import configparser
from datetime import datetime
import logging
import os
"""
This is the mains script handeling the automatic markup of the protocols. Needs
some user Input specified in parse-arguments().
"""
def parse_arguments(argv=None):
    """
    Build the command line parser of the markup script and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. With the default of None argparse falls
            back to sys.argv[1:], so existing callers are unaffected.

    Returns:
        The argparse.Namespace holding set_paths (a list of two strings or
        None) and the boolean switches skip_metadata, skip_simple_speakers,
        skip_name_markup, skip_speeches, skip_beautify_xml, keep_tmp_files,
        fresh_run and log_all.
    """
    # All texts are assembled from adjacent string literals. A backslash
    # continuation inside a literal would pull the indentation of the source
    # file into the text, or fuse two words when a line is not indented.
    parser = argparse.ArgumentParser(
        description=(
            "Starts the markup process of the XML protocols. Uses either "
            "the input and output paths currently specified in the config "
            "file or the paths set when calling the script from the "
            "terminal with the flag argument '-sp' or '--set_paths'. Using "
            "this parameter writes the given paths into the config file. "
            "Some steps of the markup process can be skipped if they "
            "already have been executed once while using the -kt option by "
            "using the corresponding parameters."
        )
    )
    parser.add_argument(
        "-sp",
        "--set_paths",
        nargs=2,
        help=(
            "User can set the input and output paths for the files created "
            "during the markup. The paths will be written to the config "
            "file."
        ),
        required=False,
        type=str,
        metavar=("input_path", "output_path"),
    )

    # Every remaining option is an optional on/off switch, so they are
    # registered from one table; the order matches the --help output.
    switches = (
        ("-sm", "--skip_metadata",
         "Skips the script creating metadata and first xml structure."),
        ("-ss", "--skip_simple_speakers",
         "Skips the script creating the first simple speaker markup."),
        ("-sn", "--skip_name_markup",
         "Skips the script creating the name markup."),
        ("-ssp", "--skip_speeches",
         "Skips the script creating markup inside of speeches."),
        ("-sb", "--skip_beautify_xml",
         "Skips the script creating beautiful xml files."),
        ("-kt", "--keep_tmp_files",
         "Keeps all temporary xml files being created during the entire "
         "markup process. Using this flag is needed when skipping steps of "
         "the entire markup during a rerun of the script. If this is not "
         "set temporary files will always be deleted."),
        ("-fr", "--fresh_run",
         "Deletes all temporary folders in output folder also deletes all "
         "paths saved in the config file before starting the markup "
         "process. The user has to set the paths again with -sp."),
        ("-la", "--log_all",
         "If set the program will log all information about the markup "
         "process (statistics etc.). Otherwise it only logs errors and "
         "warnings."),
    )
    for short_flag, long_flag, help_text in switches:
        parser.add_argument(short_flag,
                            long_flag,
                            help=help_text,
                            action="store_true",
                            required=False)

    return parser.parse_args(argv)
def main():
    """
    Main function calling all other scripts for the automatic markup of the
    protocols.

    Order of work:
        1. Parse the command line and configure logging to
           logs/bundesdata.log (INFO with -la, WARNING otherwise).
        2. With -fr: hand the stored output folder to delete_folder and
           remove every other option of the config section "File paths".
        3. With -sp: create the output folder and store the input and
           output paths through update_config.
        4. Run every markup step that has not been skipped.
        5. Without -kt: hand the temporary folders named in the config to
           delete_folder.
        6. Print and log end time and duration.
    """
    args = parse_arguments()

    level = logging.INFO if args.log_all else logging.WARNING
    log_file = "logs/bundesdata.log"
    # The file handler opens the log file right away and fails if the folder
    # is missing, so the folder is created first.
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    logging.basicConfig(filename=log_file, level=level,
                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
                        datefmt='%Y/%m/%d %H:%M:%S',
                        filemode="w")
    logger = logging.getLogger(__name__)

    start_time = datetime.now()
    print("Start time of script is:", start_time)
    print("Info and status about the markup process can be found in:",
          log_file)
    logger.info("Start time of script is: %s", start_time)

    # Fresh run: the output folder itself goes to delete_folder, its config
    # entry is kept; all other options of "File paths" are removed.
    if args.fresh_run:
        config = configparser.ConfigParser()
        config.read("config.ini")
        for name, value in config.items("File paths"):
            if name == "output_folder":
                try:
                    delete_folder.delete_folder(value)
                except FileNotFoundError:
                    # Nothing to clean up; recorded instead of swallowed.
                    logger.info("Output folder %s does not exist, nothing "
                                "to delete.", value)
            else:
                config.remove_option("File paths", name)
        with open("config.ini", 'w') as out:
            config.write(out)

    # Sets paths and creates the output folder.
    if args.set_paths:
        input_path = args.set_paths[0]
        output_path = os.path.join(args.set_paths[1], "output")
        # makedirs also creates missing parent folders and tolerates an
        # already existing output folder.
        os.makedirs(output_path, exist_ok=True)
        update_config.update_config("config.ini", "File paths",
                                    "input_folder_xmls", input_path)
        update_config.update_config("config.ini", "File paths",
                                    "output_folder", output_path)

    # The one second pause after a skip message is kept from the original
    # flow; presumably it gives the user time to read it -- TODO confirm.
    if not args.skip_metadata:
        print("Starting metadata extraction and markup.")
        metadata.get_metadata()
        print("Metadata creation and content splits finished.")
    else:
        print("Skipping script metadata.py.")
        time.sleep(1)

    if not args.skip_simple_speakers:
        print("Starting first simple speeches and speaker markup.")
        speakers.get_speakers()
        print("Finished simple markup.")
    else:
        print("Skipping script speakers.py.")
        time.sleep(1)

    if not args.skip_name_markup:
        print("Starting complex markup of speaker names.")
        speaker_names.get_names()
        print("Finished complex name markup. (names etc.)")
    else:
        print("Skipping script speaker_names.py.")
        time.sleep(1)

    if not args.skip_speeches:
        print("Starting markup of comments etc. in speeches.")
        speeches.markup_speeches()
        print("Finished markup of comments etc. in speeches.")
    else:
        print("Skipping script speeches.py.")
        time.sleep(1)

    if not args.skip_beautify_xml:
        print("Starting to prettyfie the xmls.")
        beautify_markup.beautify_xml("markup")
        print("Prettyfied the xmls.")
    else:
        print("Skipping script beautify_markup.py.")

    # Temporary folders are removed unless the user asked to keep them. A
    # missing config entry or folder (e.g. because steps were skipped) is
    # logged instead of aborting the run after all the work is done.
    if not args.keep_tmp_files:
        config = configparser.ConfigParser()
        config.read("config.ini")
        tmp_options = ("new_metadata", "new_simple_markup",
                       "complex_markup", "clear_speech_markup")
        for option in tmp_options:
            folder_path = config.get("File paths", option, fallback=None)
            if folder_path is None:
                logger.warning("No path for '%s' found in config.ini, "
                               "skipping deletion.", option)
                continue
            try:
                delete_folder.delete_folder(folder_path)
            except FileNotFoundError:
                logger.warning("Temporary folder %s does not exist, "
                               "skipping deletion.", folder_path)

    end_time = datetime.now()
    print("End time of script is:", str(end_time))
    logger.info("End time of script is: %s", end_time)
    duration = end_time - start_time
    print("Duration of script is:", duration)
    logger.info("Script duration is: %s", duration)
# Run the markup pipeline only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()