#!/usr/bin/env python
# -*- coding: utf-8 -*-

import configparser
import logging
import os

from tqdm import tqdm

from markup.MetadataMarkup import MetadataMarkup
from utility.FileGetter import FileGetter


def get_metadata():
    """
    Create a new metadata head for every XML protocol in the input folder.

    Reads ``config.ini`` from the current working directory and takes
    ``input_folder_xmls`` and ``output_folder`` from its ``File paths``
    section. All ``*.xml`` files reported by ``FileGetter`` for the input
    folder are processed in sorted order. For each file the existing
    metadata is extracted, the old metadata is removed, the new metadata
    structure is inserted, the session attributes are written to the root
    element and the result is saved via ``MetadataMarkup.save_to_file``.

    Raises:
        KeyError: if ``config.ini`` is missing or lacks the ``File paths``
            section or one of the two keys read from it.
    """
    logger = logging.getLogger(__name__)

    print("Running metadata creation for original XML-protocolls.")
    config = configparser.ConfigParser()
    config.read("config.ini")
    input_path = config["File paths"]["input_folder_xmls"]
    output_path = config["File paths"]["output_folder"]

    file_list = FileGetter(input_path, "*.xml").get_files()
    metadata = MetadataMarkup()

    # Bound once so every attribute write targets the same tag name. The
    # original spelled it "dbtplenarprotokol" (one "l") in two of the five
    # calls, addressing a different tag than the other three.
    root_tag = "dbtplenarprotokoll"

    for file_path in tqdm(sorted(file_list), desc="Metadata status:"):
        file_name = os.path.basename(file_path)
        logger.info("\nCreating metadata for: %s", file_name)

        root = metadata.read_protcol(file_path)
        metadata.extract_metadata(root)
        metadata.built_iso_date(metadata.datum_ger_non_iso)
        metadata.built_date_string(metadata.datum_iso)
        metadata.delete_old_metadata(root)
        metadata.split_content(root)
        metadata.insert_new_metadata(root)
        metadata.get_session_times()

        # The values are read only after the calls above have run; the
        # write order matches the original call sequence.
        # NOTE(review): "sitzungs-nr" may need to be "sitzung-nr" to match
        # the official schema -- confirm against the DTD before changing.
        root_attributes = (
            ("sitzung-datum", metadata.datum_ger_non_iso),
            ("sitzung-start-uhrzeit", metadata.session_start_time),
            ("sitzung-ende-uhrzeit", metadata.session_end_time),
            ("sitzungs-nr", metadata.sitzungsnr),
            ("wahlperiode", metadata.wahlperiode),
        )
        for attr_name, attr_value in root_attributes:
            metadata.write_to_attr(root_tag, attr_name, attr_value)

        metadata.save_to_file(output_path, file_path, "new_metadata",
                              "File paths", "new_metadata")
        logger.info("New metadata created for: %s", file_name)

    print("Succesfully extracted and wrote new metadata to XML-protocolls.")


if __name__ == '__main__':
    get_metadata()