Initial commit

2019-02-21 19:29:44 +01:00
commit 4263e5f41e
52 changed files with 3024 additions and 0 deletions
--- a/bundesdata_markup_nlp/utility/FileGetter.py
+++ b/bundesdata_markup_nlp/utility/FileGetter.py
@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import fnmatch
+
+"""
+This class is for getting filepaths of all files in a given directory. Also
+gets files in subdirectories.
+"""
+
+
+class FileGetter(object):
+    """
+    Collects the paths of files below a directory whose names match a
+    pattern, so that they can be opened and/or further processed later on.
+
+    path is the directory the search starts in. file_type is an fnmatch
+    style pattern that is compared against each bare file name.
+    """
+
+    def __init__(self, path, file_type):
+        super(FileGetter, self).__init__()
+        self.file_type = file_type
+        self.path = path
+
+    def get_files(self):
+        """
+        Walks the directory tree rooted at self.path and returns a list with
+        the joined path of every file whose name matches self.file_type.
+        The same list is also stored in self.list_of_files.
+        """
+        matching_paths = [
+            os.path.join(current_dir, file_name)
+            for current_dir, _, file_names in os.walk(self.path)
+            for file_name in fnmatch.filter(file_names, self.file_type)
+        ]
+        self.list_of_files = matching_paths
+        return matching_paths
--- a/bundesdata_markup_nlp/utility/XMLProtocol.py
+++ b/bundesdata_markup_nlp/utility/XMLProtocol.py
@ -0,0 +1,209 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from utility import delete_folder
+from utility import update_config
+from xml.etree import ElementTree
+from os import path
+from lxml import etree
+import os
+import logging
+import re
+
+
+class XMLProtocol(object):
+    """Class for standard operations on/with the XML protocols. Has functions
+    for reading, saving and manipulating an XML protocol. All other classes
+    inherit from this one.
+
+    Most methods work on self.xml_tree, which is set by read_xml(). The
+    methods taking check_child additionally use self.regex_compiled, which is
+    set by compile_regex().
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.logger = logging.getLogger(__name__)
+
+    def read_protcol(self, file_path):
+        """
+        Takes a file path, parses the file as XML and returns the root
+        element. Also stores the path, the file name and the parsed tree in
+        self.file_path, self.filename and self.tree.
+        """
+        self.file_path = file_path
+        self.filename = os.path.basename(self.file_path)
+        # Blank text is dropped for better xml indentation when writing.
+        parser = etree.XMLParser(remove_blank_text=True)
+        self.tree = etree.parse(file_path, parser)
+        root = self.tree.getroot()
+        self.logger.info("File successfully parsed as XML.")
+        return root
+
+    def read_xml(self, file_path):
+        """
+        Takes a file path, parses the file as XML and stores the root element
+        in self.xml_tree.
+        """
+        # Blank text is dropped for better xml indentation when writing.
+        parser = etree.XMLParser(encoding='utf-8', remove_blank_text=True)
+        self.xml_tree = etree.parse(file_path, parser).getroot()
+
+    def save_to_file(self, output_path, file_path, subfolder, config_section,
+                     config_key, filename_sufix=""):
+        """
+        Writes self.xml_tree to a new xml file inside output_path/subfolder
+        and creates that folder if it is missing. The file is named after
+        file_path; a non-empty filename_sufix replaces the file extension.
+        Also updates config_key in config_section of "config.ini" with the
+        folder path.
+        """
+        base_name = path.basename(file_path)
+        if filename_sufix == "":
+            self.filename = base_name
+        else:
+            # Strip the actual extension instead of a fixed four characters.
+            self.filename = path.splitext(base_name)[0] + filename_sufix
+        save_path = os.path.join(output_path, subfolder)
+        # makedirs also creates missing parent folders and tolerates an
+        # already existing target folder.
+        os.makedirs(save_path, exist_ok=True)
+        save_file_path = os.path.join(save_path, self.filename)
+        etree.ElementTree(self.xml_tree).write(
+            save_file_path,
+            pretty_print=True,
+            xml_declaration=True,
+            encoding="utf8",
+            doctype="<!DOCTYPE dbtplenarprotokoll SYSTEM 'dbtplenarprotokoll_minimal.dtd\'>")
+        self.logger.info("New XML saved to:%s", save_file_path)
+        update_config.update_config("config.ini", config_section, config_key,
+                                    save_path)
+
+    def _run_html_beautify(self, target_path, alter_lines, line_width):
+        """
+        Runs the external html-beautify tool on target_path. If alter_lines
+        is set, "-w line_width --no-preserve-newlines" is passed as well.
+        A missing tool or a failing run is logged and otherwise ignored.
+        """
+        # Local imports: only this helper needs them.
+        import shutil
+        import subprocess
+        executable = shutil.which("html-beautify")
+        if executable is None:
+            self.logger.warning("html-beautify not found, %s left unchanged.",
+                                target_path)
+            return
+        # -r is expected to make the tool rewrite the file in place; the
+        # callers read the same path again afterwards.
+        command = [executable, "-r", "-q"]
+        if alter_lines:
+            command += ["-w", str(line_width), "--no-preserve-newlines"]
+        command.append(target_path)
+        # An argument list (no shell) keeps paths with spaces or shell
+        # metacharacters intact.
+        result = subprocess.run(command)
+        if result.returncode != 0:
+            self.logger.warning("html-beautify exited with code %s for %s",
+                                result.returncode, target_path)
+
+    def beautify_xml_part(self, file_path, xpath, alter_lines=False,
+                          line_width=80):
+        """
+        Beautifies part (element node) of self.xml_tree. The first element
+        found for xpath is copied, written to tmp/tmp.xml next to file_path,
+        run through html-beautify and read back into self.beautified_part.
+        Raises ValueError if xpath finds no element.
+        """
+        part = etree.ElementTree(self.xml_tree).find(xpath)
+        if part is None:
+            raise ValueError("No element found for XPath: " + str(xpath))
+        # Serialising and parsing again yields a copy that is detached from
+        # self.xml_tree.
+        detached_part = etree.fromstring(ElementTree.tostring(part))
+        tmp_path = os.path.join(os.path.dirname(file_path), "tmp")
+        # NOTE(review): the whole tmp folder is deleted below, including
+        # anything that was in it before this call.
+        os.makedirs(tmp_path, exist_ok=True)
+        try:
+            tmp_file_path = os.path.join(tmp_path, "tmp.xml")
+            etree.ElementTree(detached_part).write(tmp_file_path,
+                                                   pretty_print=True,
+                                                   xml_declaration=True,
+                                                   encoding="utf8")
+            self._run_html_beautify(tmp_file_path, alter_lines, line_width)
+            self.beautified_part = etree.parse(tmp_file_path).getroot()
+            update_config.update_config("config.ini", "File paths",
+                                        "tmp_path", tmp_path)
+        finally:
+            # Remove the temporary folder even if one of the steps failed.
+            delete_folder.delete_folder(tmp_path)
+
+    def beautify_xml(self, file_path, alter_lines=False, line_width=80):
+        """
+        Runs html-beautify on the file at file_path. Set alter_lines to True
+        to also pass line_width and --no-preserve-newlines to the tool.
+        """
+        self._run_html_beautify(file_path, alter_lines, line_width)
+
+    def _first_child_matches(self, element):
+        """
+        Returns True if the text of the first child of element is matched by
+        self.regex_compiled. Elements without children or without text in
+        the first child do not match.
+        """
+        if len(element) == 0:
+            return False
+        first_child_text = element[0].text
+        if first_child_text is None:
+            return False
+        return self.regex_compiled.search(first_child_text) is not None
+
+    def expand_element(self, element_to_expand, expand_attr_key,
+                       expand_attr_value, check_child=True):
+        """
+        This function takes an XPath expression for an xml element.
+        The tag of every element found will be expanded with the given
+        expand_attr_key and expand_attr_value. Per default an element is only
+        expanded if the text of its first child is matched by the regex set
+        with compile_regex. Set check_child to False to avoid this and just
+        expand every element found.
+        """
+        for element in self.xml_tree.findall(element_to_expand):
+            if not check_child or self._first_child_matches(element):
+                element.set(expand_attr_key, expand_attr_value)
+
+    def replace_tag_name(self, element_to_replace, tag_name, check_child=True):
+        """
+        Replaces a given element tag(as XPath) name with a new tag name. Per
+        default only elements whose first child text is matched by the regex
+        set with compile_regex are renamed. Set check_child to False to
+        rename every element found.
+        """
+        for element in self.xml_tree.findall(element_to_replace):
+            if not check_child or self._first_child_matches(element):
+                element.tag = tag_name
+
+    def replace_tag_attr(self, element_to_replace, tag_name, attr_key,
+                         attr_value, check_child=True):
+        """
+        Replaces tag name of given element(as XPath) with new name and adds
+        an attribute. Can also check if the child of the current element
+        contains some specific text like in the expand_element function.
+        """
+        for element in self.xml_tree.findall(element_to_replace):
+            if not check_child or self._first_child_matches(element):
+                element.tag = tag_name
+                element.set(attr_key, attr_value)
+
+    def replace_elements(self, elements_to_replace, replacment_elements,
+                         keep_parent_text=False):
+        """
+        Replaces elements identified by XPath with new elements, pairing them
+        in order. Nothing is replaced and a warning is logged if the number
+        of found elements differs from the number of replacement elements.
+        If keep_parent_text is set, the text nodes of the parents are
+        collected first and assigned as tails to the elements found by the
+        same XPath after the replacement.
+        """
+        elements = self.xml_tree.findall(elements_to_replace)
+        if len(elements) != len(replacment_elements):
+            self.logger.warning("Elements missmatch. There are %s that should"
+                                " be repalced. There are %s present. No"
+                                " elements have been replaced.",
+                                len(elements), len(replacment_elements))
+            return
+        elements_text = []
+        if keep_parent_text:
+            # Collected before replacing and only when actually needed.
+            parents_text_xpath = elements_to_replace + "/parent::node()/text()"
+            elements_text = self.xml_tree.xpath(parents_text_xpath)
+        for element, replacement_element in zip(elements,
+                                                replacment_elements):
+            element.getparent().replace(element, replacement_element)
+        if keep_parent_text:
+            elements = self.xml_tree.findall(elements_to_replace)
+            for element, text in zip(elements, elements_text):
+                element.tail = text
+
+    def compile_regex(self, regex):
+        """
+        Takes the input regex string and compiles it for better performance
+        and readability. The string is stored in self.regex_string and the
+        compiled pattern (re.MULTILINE) in self.regex_compiled.
+        """
+        self.regex_string = regex
+        self.regex_compiled = re.compile(self.regex_string, re.MULTILINE)
+
+    def clean_text(self, regex, xpath, replacement_string=""):
+        """
+        Replaces regex matches with nothing by default or replacement string
+        in the text of every element matched by the xpath in the xml_tree.
+        Works with matchgroups. Elements without text are left untouched.
+        """
+        for element in self.xml_tree.xpath(xpath):
+            if element.text is None:
+                continue
+            element.text = re.sub(regex, replacement_string, element.text)
--- a/bundesdata_markup_nlp/utility/init.py
+++ b/bundesdata_markup_nlp/utility/init.py
--- a/bundesdata_markup_nlp/utility/pycache/FileGetter.cpython-37.pyc
+++ b/bundesdata_markup_nlp/utility/pycache/FileGetter.cpython-37.pyc
--- a/bundesdata_markup_nlp/utility/pycache/XMLProtocol.cpython-37.pyc
+++ b/bundesdata_markup_nlp/utility/pycache/XMLProtocol.cpython-37.pyc
--- a/bundesdata_markup_nlp/utility/pycache/init.cpython-37.pyc
+++ b/bundesdata_markup_nlp/utility/pycache/init.cpython-37.pyc
--- a/bundesdata_markup_nlp/utility/pycache/delete_folder.cpython-37.pyc
+++ b/bundesdata_markup_nlp/utility/pycache/delete_folder.cpython-37.pyc
--- a/bundesdata_markup_nlp/utility/pycache/update_config.cpython-37.pyc
+++ b/bundesdata_markup_nlp/utility/pycache/update_config.cpython-37.pyc
--- a/bundesdata_markup_nlp/utility/delete_folder.py
+++ b/bundesdata_markup_nlp/utility/delete_folder.py
@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import shutil
+
+
+def delete_folder(folder_path):
+    """
+    Removes the folder at folder_path together with everything inside it.
+    """
+    shutil.rmtree(path=folder_path)
+
+
+if __name__ == '__main__':
+    # delete_folder needs the folder path, so it is taken from the command
+    # line instead of calling the function without arguments.
+    import sys
+    if len(sys.argv) != 2:
+        sys.exit("Usage: delete_folder.py <folder_path>")
+    delete_folder(sys.argv[1])
--- a/bundesdata_markup_nlp/utility/move_ngrams.py
+++ b/bundesdata_markup_nlp/utility/move_ngrams.py
@ -0,0 +1,22 @@
+"""
+Helper script to move n_gram csvs to separate folders. Just copy this into the
+folder containing the n-grams and execute it. Change n to number of N in N-grams.
+"""
+import os
+
+current_path = os.getcwd()
+n = 5
+# Sorted, so the distribution by position below is reproducible.
+files = sorted(name for name in os.listdir(current_path)
+               if name.endswith(".csv"))
+
+# One target folder per n-gram size: "1_grams" up to "<n>_grams".
+dir_list = ["{}_grams".format(size) for size in range(1, n + 1)]
+for dir_name in dir_list:
+    os.makedirs(os.path.join(current_path, dir_name), exist_ok=True)
+
+# Every n-th file of the sorted list, starting at position step, goes into
+# the folder with index step.
+for step, dir_name in enumerate(dir_list):
+    for file_name in files[step::n]:
+        print(file_name)
+        # No shell involved, so names with spaces or special characters are
+        # moved correctly; an existing target file is overwritten.
+        os.replace(os.path.join(current_path, file_name),
+                   os.path.join(current_path, dir_name, file_name))
--- a/bundesdata_markup_nlp/utility/update_config.py
+++ b/bundesdata_markup_nlp/utility/update_config.py
@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import configparser
+
+
+def update_config(file_name, section, key, value):
+    """
+    Updates a config file identified by file_name. Updates the data of one
+    key value pair in a specific section and writes the config back to the
+    same file.
+
+    Raises configparser.NoSectionError if the section does not exist in the
+    config that was read. In that case the file is left untouched.
+    """
+    config = configparser.ConfigParser()
+    config.read(file_name)
+    # Apply the change before opening the file: opening with "w" truncates
+    # it, so a failing set() must not happen while the file is already empty.
+    config.set(section, key, value)
+    with open(file_name, "w") as config_file:
+        config.write(config_file)
+
+
+if __name__ == '__main__':
+    # update_config needs four values, so they are taken from the command
+    # line instead of calling the function without arguments.
+    import sys
+    if len(sys.argv) != 5:
+        sys.exit("Usage: update_config.py <file_name> <section> <key> <value>")
+    update_config(*sys.argv[1:])