58 lines
2.5 KiB
Python
58 lines
2.5 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from utility.FileGetter import FileGetter
|
||
|
from markup.MetadataMarkup import MetadataMarkup
|
||
|
from tqdm import tqdm
|
||
|
import os
|
||
|
import configparser
|
||
|
import logging
|
||
|
|
||
|
|
||
|
def get_metadata():
    """
    Create the metadata head and first-level XML tag structure for protocols.

    The input and output folders are read from the ``File paths`` section of
    ``config.ini`` (keys ``input_folder_xmls`` and ``output_folder``). The
    paths of all ``*.xml`` files to consider are collected with the
    FileGetter class. For every file, in sorted order, the given metadata is
    extracted, the old metadata is replaced by the new one, the session
    attributes are written to the root element and the result is saved as a
    new file at the configured output path.

    Raises:
        KeyError: if ``config.ini`` cannot be read or lacks the
            ``File paths`` section or one of the two keys
            (``ConfigParser.read`` silently skips missing files).
    """
    logger = logging.getLogger(__name__)
    print("Running metadata creation for original XML-protocolls.")

    config = configparser.ConfigParser()
    config.read("config.ini")
    input_path = config["File paths"]["input_folder_xmls"]
    output_path = config["File paths"]["output_folder"]

    file_getter = FileGetter(input_path, "*.xml")
    file_list = file_getter.get_files()

    # Name of the root element that receives all session attributes. It is
    # kept in one place because two of the five calls previously passed the
    # misspelled "dbtplenarprotokol" (single "l").
    root_tag = "dbtplenarprotokoll"
    # (XML attribute name, name of the MetadataMarkup attribute holding its
    # value). Values are looked up right before each write, exactly when the
    # individual calls used to read them.
    # NOTE(review): "sitzungs-nr" is kept as found; the official schema may
    # spell it "sitzung-nr" -- confirm before changing.
    root_attributes = (
        ("sitzung-datum", "datum_ger_non_iso"),
        ("sitzung-start-uhrzeit", "session_start_time"),
        ("sitzung-ende-uhrzeit", "session_end_time"),
        ("sitzungs-nr", "sitzungsnr"),
        ("wahlperiode", "wahlperiode"),
    )

    # A single MetadataMarkup instance is reused for every file.
    metadata = MetadataMarkup()
    for file in tqdm(sorted(file_list), desc="Metadata status:"):
        logger.info("\nCreating metadata for: %s", os.path.basename(file))
        root = metadata.read_protcol(file)
        metadata.extract_metadata(root)
        metadata.built_iso_date(metadata.datum_ger_non_iso)
        metadata.built_date_string(metadata.datum_iso)
        metadata.delete_old_metadata(root)
        metadata.split_content(root)
        metadata.insert_new_metadata(root)
        metadata.get_session_times()
        for xml_attribute, value_source in root_attributes:
            metadata.write_to_attr(root_tag, xml_attribute,
                                   getattr(metadata, value_source))
        metadata.save_to_file(output_path, file, "new_metadata",
                              "File paths", "new_metadata")
        logger.info("New metadata created for: %s", os.path.basename(file))

    print("Succesfully extracted and wrote new metadata to XML-protocolls.")
|
||
|
|
||
|
|
||
|
# Script entry point: run the metadata creation only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    get_metadata()
|