bundesdata_markup_nlp_software/bundesdata_markup_nlp/markup/metadata.py

58 lines
2.5 KiB
Python
Raw Normal View History

2019-02-21 18:29:44 +00:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from utility.FileGetter import FileGetter
from markup.MetadataMarkup import MetadataMarkup
from tqdm import tqdm
import os
import configparser
import logging
def get_metadata():
    """
    Create a metadata head and first-level XML tag structure for every
    protocol file found below the configured input folder.

    The input and output folders are read from the "File paths" section of
    "config.ini" (keys "input_folder_xmls" and "output_folder"), looked up
    relative to the current working directory. All "*.xml" file paths are
    collected with the FileGetter class and processed in sorted order.

    For each file the metadata is extracted, the old metadata is replaced
    by the new one, the session attributes are written to the
    "dbtplenarprotokoll" element and the result is saved via
    MetadataMarkup.save_to_file.

    Raises:
        KeyError: if "config.ini" is missing or lacks the "File paths"
            section or one of the two keys read here.
    """
    logger = logging.getLogger(__name__)
    print("Running metadata creation for original XML-protocolls.")
    config = configparser.ConfigParser()
    config.read("config.ini")
    input_path = config["File paths"]["input_folder_xmls"]
    output_path = config["File paths"]["output_folder"]
    file_getter = FileGetter(input_path, "*.xml")
    file_list = file_getter.get_files()
    # One MetadataMarkup instance is reused for all files, as before.
    metadata = MetadataMarkup()
    # Element that receives all session attributes. The original code
    # misspelled it as "dbtplenarprotokol" for two of the five attributes.
    root_tag = "dbtplenarprotokoll"
    for file_path in tqdm(sorted(file_list), desc="Metadata status:"):
        file_name = str(os.path.basename(file_path))
        logger.info("\nCreating metadata for: %s", file_name)
        # NOTE(review): the call order is kept exactly as in the original;
        # later steps presumably rely on attributes set by earlier ones
        # (e.g. datum_ger_non_iso, datum_iso) - confirm in MetadataMarkup.
        root = metadata.read_protcol(file_path)
        metadata.extract_metadata(root)
        metadata.built_iso_date(metadata.datum_ger_non_iso)
        metadata.built_date_string(metadata.datum_iso)
        metadata.delete_old_metadata(root)
        metadata.split_content(root)
        metadata.insert_new_metadata(root)
        metadata.get_session_times()
        session_attributes = (
            ("sitzung-datum", metadata.datum_ger_non_iso),
            ("sitzung-start-uhrzeit", metadata.session_start_time),
            ("sitzung-ende-uhrzeit", metadata.session_end_time),
            ("sitzungs-nr", metadata.sitzungsnr),
            ("wahlperiode", metadata.wahlperiode),
        )
        for attr_name, attr_value in session_attributes:
            metadata.write_to_attr(root_tag, attr_name, attr_value)
        metadata.save_to_file(output_path, file_path, "new_metadata",
                              "File paths", "new_metadata")
        logger.info("New metadata created for: %s", file_name)
    print("Succesfully extracted and wrote new metadata to XML-protocolls.")
# Run the metadata creation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    get_metadata()