215 lines
8.9 KiB
Python
215 lines
8.9 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from markup import metadata, speakers, speaker_names, speeches
|
||
|
from utility import update_config
|
||
|
from markup import beautify_markup
|
||
|
from utility import delete_folder
|
||
|
import argparse
|
||
|
import time
|
||
|
import configparser
|
||
|
from datetime import datetime
|
||
|
import logging
|
||
|
import os
|
||
|
|
||
|
"""
|
||
|
This is the main script handling the automatic markup of the protocols. Needs
|
||
|
some user input specified in parse_arguments().
|
||
|
"""
|
||
|
|
||
|
|
||
|
def parse_arguments(argv=None):
    """
    Builds the command line parser of the markup script and parses the
    arguments.

    Args:
        argv: Optional list of argument strings to parse. If None (the
            default) argparse reads the arguments from sys.argv, which is
            what happens when the script is started from the terminal.

    Returns:
        The argparse.Namespace holding set_paths (a list with the input and
        the output path, or None if -sp was not given) and the boolean flags
        skip_metadata, skip_simple_speakers, skip_name_markup, skip_speeches,
        skip_beautify_xml, keep_tmp_files, fresh_run and log_all.
    """
    # The texts are built with implicit string concatenation instead of
    # backslash continuations inside the literal, so the indentation of the
    # source code never ends up inside the help output.
    parser = argparse.ArgumentParser(
        description=(
            "Starts the markup process of the XML protocols. Uses either "
            "the input and output paths currently specified in the config "
            "file or the paths set when calling the script from the "
            "terminal with the flag argument '-sp' or '--set_paths'. Using "
            "this parameter writes the given paths into the config file. "
            "Some steps of the markup process can be skipped if they "
            "already have been executed once while using the -kt option by "
            "using the corresponding parameters."
        )
    )
    parser.add_argument(
        "-sp",
        "--set_paths",
        nargs=2,
        help=("User can set the input and output paths for the files "
              "created during the markup. The paths will be written to the "
              "config file."),
        required=False,
        type=str,
        metavar=("input_path", "output_path"),
    )

    # All remaining options are plain on/off switches. They are registered in
    # this order so the generated help lists them as before.
    switches = (
        ("-sm", "--skip_metadata",
         "Skips the script creating metadata and first xml structure."),
        ("-ss", "--skip_simple_speakers",
         "Skips the script creating the first simple speaker markup."),
        ("-sn", "--skip_name_markup",
         "Skips the script creating the name markup."),
        ("-ssp", "--skip_speeches",
         "Skips the script creating markup inside of speeches."),
        ("-sb", "--skip_beautify_xml",
         "Skips the script creating beautiful xml files."),
        ("-kt", "--keep_tmp_files",
         "Keeps all temporary xml files being created during the entire "
         "markup process. Using this flag is needed when skipping steps of "
         "the entire markup during a rerun of the script. If this is not "
         "set temporary files will always be deleted."),
        ("-fr", "--fresh_run",
         "Deletes all temporary folders in output folder, also deletes all "
         "paths saved in the config file before starting the markup "
         "process. The user has to set the paths again with -sp."),
        ("-la", "--log_all",
         "If set the program will log all information about the markup "
         "process (statistics etc.). Otherwise it only logs errors and "
         "warnings."),
    )
    for short_flag, long_flag, help_text in switches:
        parser.add_argument(short_flag,
                            long_flag,
                            help=help_text,
                            action="store_true",
                            required=False)

    return parser.parse_args(argv)
|
||
|
|
||
|
|
||
|
def _run_step(skip, start_message, step, done_message, skip_message):
    """
    Runs one step of the markup pipeline unless it is skipped and prints the
    matching status messages to the terminal.
    """
    if skip:
        print(skip_message)
        return
    print(start_message)
    step()
    print(done_message)


def main():
    """
    Main function calling all other scripts for the automatic markup of the
    protocols.

    Parses the command line, configures logging to logs/bundesdata.log,
    optionally resets the config (--fresh_run) and stores new paths
    (--set_paths), then runs the markup steps in order: metadata, simple
    speakers, speaker names, speeches and beautify. Unless --keep_tmp_files
    is given, the temporary folders listed in the config file are deleted
    afterwards. Start time, end time and duration are printed and logged.
    """
    args = parse_arguments()
    level = logging.INFO if args.log_all else logging.WARNING

    # basicConfig opens the log file right away and does not create missing
    # directories, so make sure the log folder exists first.
    os.makedirs("logs", exist_ok=True)
    logging.basicConfig(filename="logs/bundesdata.log", level=level,
                        format="%(asctime)s %(name)s %(levelname)s:%(message)s",
                        datefmt='%Y/%m/%d %H:%M:%S',
                        filemode="w")
    logger = logging.getLogger(__name__)
    start_time = datetime.now()
    print("Start time of script is:", start_time)
    print("Info and status about the markup process can be found in:",
          "logs/bundesdata.log")
    logger.info("Start time of script is: %s", start_time)

    # Deletes output folder and all folders inside that.
    # Also removes all path options from the section "File paths".
    # The "output_folder" option itself stays in the config file.
    if args.fresh_run:
        config = configparser.ConfigParser()
        config.read("config.ini")
        for name, value in config.items("File paths"):
            if name == "output_folder":
                try:
                    delete_folder.delete_folder(value)
                except FileNotFoundError:
                    # Nothing to clean up, a fresh run can simply continue.
                    logger.info("Output folder %s does not exist, nothing "
                                "to delete.", value)
            else:
                config.remove_option("File paths", name)
        with open("config.ini", 'w') as out:
            config.write(out)

    # Sets paths and creates output folder.
    if args.set_paths:
        input_path, output_root = args.set_paths
        output_path = os.path.join(output_root, "output")
        # makedirs also creates missing parent folders and does not fail if
        # the folder already exists.
        os.makedirs(output_path, exist_ok=True)
        update_config.update_config("config.ini", "File paths",
                                    "input_folder_xmls", input_path)
        update_config.update_config("config.ini", "File paths",
                                    "output_folder", output_path)

    _run_step(args.skip_metadata,
              "Starting metadata extraction and markup.",
              metadata.get_metadata,
              "Metadata creation and content splits finished.",
              "Skipping script metadata.py.")

    time.sleep(1)
    _run_step(args.skip_simple_speakers,
              "Starting first simple speeches and speaker markup.",
              speakers.get_speakers,
              "Finished simple markup.",
              "Skipping script speakers.py.")

    time.sleep(1)
    _run_step(args.skip_name_markup,
              "Starting complex markup of speaker names.",
              speaker_names.get_names,
              "Finished complex name markup. (names etc.)",
              "Skipping script speaker_names.py.")

    time.sleep(1)
    _run_step(args.skip_speeches,
              "Starting markup of comments etc. in speeches.",
              speeches.markup_speeches,
              "Finished markup of comments etc. in speeches.",
              "Skipping script speeches.py.")

    time.sleep(1)
    _run_step(args.skip_beautify_xml,
              "Starting to prettyfie the xmls.",
              lambda: beautify_markup.beautify_xml("markup"),
              "Prettyfied the xmls.",
              "Skipping script beautify_markup.py.")

    # Removes the temporary folders of the intermediate markup steps.
    if not args.keep_tmp_files:
        config = configparser.ConfigParser()
        config.read("config.ini")
        file_paths = config["File paths"]
        # All paths are looked up before anything is deleted, so a missing
        # option is reported before any folder is removed.
        folder_paths = [file_paths[key] for key in ("new_metadata",
                                                    "new_simple_markup",
                                                    "complex_markup",
                                                    "clear_speech_markup")]
        for folder_path in folder_paths:
            try:
                delete_folder.delete_folder(folder_path)
            except FileNotFoundError:
                # A folder that is already gone must not abort the cleanup
                # of the remaining folders.
                logger.warning("Temporary folder %s does not exist, "
                               "skipping deletion.", folder_path)

    end_time = datetime.now()
    print("End time of script is:", str(end_time))
    logger.info("End time of script is: %s", end_time)
    duration = end_time - start_time
    print("Duration of script is:", duration)
    logger.info("Script duration is: %s", duration)
|
||
|
|
||
|
|
||
|
# Only start the markup process when the file is executed as a script.
if __name__ == "__main__":
    main()
|