Initial commit

This commit is contained in:
Stephan Porada
2019-02-21 19:29:44 +01:00
commit 4263e5f41e
52 changed files with 3024 additions and 0 deletions

View File

@ -0,0 +1,35 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import fnmatch
"""
This class is for getting filepaths of all files in a given directory. Also
gets files in subdirectories.
"""
class FileGetter(object):
    """
    Collects the paths of files below a directory so that they can be opened
    and/or processed further later on.

    path is the directory that is walked (including all of its sub
    directories). file_type is an fnmatch-style pattern (for example
    "*.xml") that a file name has to match to be collected.
    """

    def __init__(self, path, file_type):
        super(FileGetter, self).__init__()
        self.path = path
        self.file_type = file_type
        # Defined here so the attribute already exists (as an empty list)
        # before get_files() has been called for the first time.
        self.list_of_files = []

    def get_files(self):
        """
        Creates a list with the joined paths of all files in the given
        directory and its sub directories whose names match file_type.
        The list is stored in self.list_of_files and returned. Entries appear
        in the order in which os.walk yields them.
        """
        list_of_files = [
            os.path.join(current_dir, name)
            for current_dir, _subdirs, files in os.walk(self.path)
            for name in fnmatch.filter(files, self.file_type)
        ]
        self.list_of_files = list_of_files
        return list_of_files

View File

@ -0,0 +1,209 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from utility import delete_folder
from utility import update_config
from xml.etree import ElementTree
from os import path
from lxml import etree
import os
import logging
import re
class XMLProtocol(object):
    """Class for standard operations on/with the XML protocols. Has functions
    for reading, saving and manipulating an XML protocol. All other classes
    inherit from this one.

    Most methods work on self.xml_tree (the root element set by read_xml).
    Methods that are called with check_child enabled additionally need
    self.regex_compiled, which is set by compile_regex.
    """

    def __init__(self):
        super().__init__()
        self.logger = logging.getLogger(__name__)

    def read_protcol(self, file_path):
        """
        Takes a file path, parses the file as XML and returns the root
        element. Also stores the path (self.file_path), the base name of the
        file (self.filename) and the parsed tree (self.tree).
        """
        self.file_path = file_path
        self.filename = os.path.basename(self.file_path)
        parser = etree.XMLParser(remove_blank_text=True)
        self.tree = etree.parse(file_path, parser)  # for better xml indentation
        root = self.tree.getroot()
        self.logger.info("File successfully parsed as XML.")
        return root

    def read_xml(self, file_path):
        """
        Takes a file path, parses the file as XML and stores the root element
        in self.xml_tree.
        """
        parser = etree.XMLParser(encoding='utf-8', remove_blank_text=True)
        tree = etree.parse(file_path, parser)  # for better xml indentation
        self.xml_tree = tree.getroot()

    def save_to_file(self, output_path, file_path, subfolder, config_section,
                     config_key, filename_sufix=""):
        """
        Writes self.xml_tree to a new XML file inside the folder
        output_path/subfolder, which is created if it does not exist yet.
        The file is named like the base name of file_path; if filename_sufix
        is given, the extension of that base name is replaced by the suffix.
        Afterwards the entry config_key in section config_section of
        "config.ini" is updated with the folder the file was saved to.
        """
        base_name = os.path.basename(file_path)
        if filename_sufix == "":
            self.filename = base_name
        else:
            # splitext instead of a fixed [:-4] slice, so that extensions
            # which are not exactly four characters long are handled as well.
            self.filename = os.path.splitext(base_name)[0] + filename_sufix
        save_path = os.path.join(output_path, subfolder)
        # Also creates missing parent folders and tolerates an existing one.
        os.makedirs(save_path, exist_ok=True)
        tree = etree.ElementTree(self.xml_tree)
        save_file_path = os.path.join(save_path, self.filename)
        tree.write(save_file_path,
                   pretty_print=True,
                   xml_declaration=True,
                   encoding="utf8",
                   doctype="<!DOCTYPE dbtplenarprotokoll SYSTEM 'dbtplenarprotokoll_minimal.dtd\'>")
        self.logger.info("New XML saved to:%s", save_file_path)
        update_config.update_config("config.ini", config_section, config_key,
                                    save_path)

    def beautify_xml_part(self, file_path, xpath, alter_lines=False,
                          line_width=80):
        """
        Beautifies part (element node) of the XML held in self.xml_tree.

        The first element found for xpath is copied, written to the
        temporary file "tmp/tmp.xml" next to file_path, run through
        html-beautify and read back. The result (a root element) is stored in
        self.beautified_part. The path of the temporary folder is written to
        "config.ini" and the folder is deleted again at the end.

        Raises a ValueError if xpath does not match any element.
        """
        tmp_path = os.path.join(os.path.dirname(file_path), "tmp")
        part = etree.ElementTree(self.xml_tree).find(xpath)
        if part is None:
            raise ValueError("No element found for XPath: {}".format(xpath))
        # Round trip through lxml's own serializer to get a detached copy.
        # The tail is left out because text after the closing tag of the
        # root element could not be parsed again.
        detached = etree.fromstring(etree.tostring(part, with_tail=False))
        self.beautified_part = etree.ElementTree(detached)
        os.makedirs(tmp_path, exist_ok=True)
        tmp_file_path = os.path.join(tmp_path, "tmp.xml")
        try:
            self.beautified_part.write(tmp_file_path,
                                       pretty_print=True,
                                       xml_declaration=True,
                                       encoding="utf8")
            self._run_html_beautify(tmp_file_path, alter_lines, line_width)
            self.beautified_part = etree.parse(tmp_file_path).getroot()
            update_config.update_config("config.ini", "File paths",
                                        "tmp_path", tmp_path)
        finally:
            # Remove the temporary folder even if one of the steps failed.
            delete_folder.delete_folder(tmp_path)

    def beautify_xml(self, file_path, alter_lines=False, line_width=80):
        """
        Runs html-beautify on the file identified by file_path. If
        alter_lines is set, line_width is passed on with the "-w" option
        together with "--no-preserve-newlines".
        """
        self._run_html_beautify(file_path, alter_lines, line_width)

    def _run_html_beautify(self, target_path, alter_lines, line_width):
        """
        Calls the external html-beautify command for target_path. A missing
        executable or a non-zero exit status is logged as a warning instead
        of raising an exception.
        """
        import subprocess
        command = ["html-beautify", "-r", "-q"]
        if alter_lines:
            command += ["-w", str(line_width), "--no-preserve-newlines"]
        # The path is handed over as its own argument (no shell involved), so
        # blanks or shell metacharacters in it cannot break the call.
        command.append(target_path)
        try:
            result = subprocess.run(command, check=False)
        except OSError as error:
            self.logger.warning("html-beautify could not be started: %s",
                                error)
            return
        if result.returncode != 0:
            self.logger.warning("html-beautify exited with status %s for %s",
                                result.returncode, target_path)

    def _first_child_matches(self, element):
        """
        Returns True if the text of the first child of element matches
        self.regex_compiled. Elements without children or with a first child
        without text never match.
        """
        if len(element) == 0:
            return False
        first_child_text = element[0].text
        if first_child_text is None:
            return False
        return self.regex_compiled.search(first_child_text) is not None

    def _select_elements(self, element_path, check_child):
        """
        Returns the elements found for element_path in self.xml_tree. If
        check_child is set, only those whose first child matches the compiled
        regex are returned.
        """
        elements = self.xml_tree.findall(element_path)
        if not check_child:
            return elements
        return [element for element in elements
                if self._first_child_matches(element)]

    def expand_element(self, element_to_expand, expand_attr_key,
                       expand_attr_value, check_child=True):
        """
        This function takes a path expression for xml elements. Every element
        found gets the attribute expand_attr_key set to expand_attr_value.
        Per default an element is only expanded if the text of its first
        child matches the regex set with compile_regex. Set check_child to
        False to avoid this check and expand every element found.
        """
        for element in self._select_elements(element_to_expand, check_child):
            element.set(expand_attr_key, expand_attr_value)

    def replace_tag_name(self, element_to_replace, tag_name, check_child=True):
        """
        Replaces the tag name of the elements found for the given path with a
        new tag name. Per default only elements whose first child text
        matches the regex set with compile_regex are renamed.
        """
        for element in self._select_elements(element_to_replace, check_child):
            element.tag = tag_name

    def replace_tag_attr(self, element_to_replace, tag_name, attr_key,
                         attr_value, check_child=True):
        """
        Replaces the tag name of the elements found for the given path with a
        new name and adds an attribute. Per default only elements whose first
        child text matches the regex set with compile_regex are changed, like
        in the expand_element function.
        """
        for element in self._select_elements(element_to_replace, check_child):
            element.tag = tag_name
            element.set(attr_key, attr_value)

    def replace_elements(self, elements_to_replace, replacment_elements,
                         keep_parent_text=False):
        """
        Replaces the elements found for the given path with new elements, in
        order. Can either keep the text of the parent element or not. If the
        number of found elements differs from the number of replacement
        elements, nothing is replaced and a warning is logged.
        """
        elements = self.xml_tree.findall(elements_to_replace)
        if len(elements) != len(replacment_elements):
            self.logger.warning(
                "Elements missmatch. There are %s that should be repalced."
                " There are %s present. No elements have been replaced.",
                len(elements), len(replacment_elements))
            return
        elements_text = []
        if keep_parent_text:
            # Has to be collected before anything is replaced.
            parents_text_xpath = (elements_to_replace + "/" + "parent::node()"
                                  + "/" + "text()")
            elements_text = self.xml_tree.xpath(parents_text_xpath)
        for element, replacement_element in zip(elements, replacment_elements):
            element.getparent().replace(element, replacement_element)
        if keep_parent_text:
            # NOTE(review): the elements are looked up again with the same
            # path after the replacement and get the collected texts as tail,
            # exactly as before. This presumably relies on the replacement
            # elements still matching elements_to_replace - confirm.
            elements = self.xml_tree.findall(elements_to_replace)
            for element, text in zip(elements, elements_text):
                element.tail = text

    def compile_regex(self, regex):
        """
        Takes the input regex string and compiles it (with re.MULTILINE) for
        better performance and readability. The string is stored in
        self.regex_string, the compiled pattern in self.regex_compiled.
        """
        self.regex_string = regex
        self.regex_compiled = re.compile(self.regex_string, re.MULTILINE)

    def clean_text(self, regex, xpath, replacement_string="",):
        """
        Replaces regex matches with nothing by default or replacement string
        in the text of every element matched by the xpath in the xml_tree.
        Works with matchgroups. Elements without text are left untouched.
        """
        for element in self.xml_tree.xpath(xpath):
            if element.text is None:
                continue
            element.text = re.sub(regex, replacement_string, element.text)

View File

View File

@ -0,0 +1,15 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import shutil
def delete_folder(folder_path):
    """
    Removes the folder identified by the input folder path string together
    with everything it contains.
    """
    shutil.rmtree(path=folder_path)
if __name__ == '__main__':
    # delete_folder() needs the folder path as argument, so it is taken from
    # the command line instead of calling the function without arguments.
    import sys
    if len(sys.argv) != 2:
        sys.exit("usage: {} FOLDER_PATH".format(sys.argv[0]))
    delete_folder(sys.argv[1])

View File

@ -0,0 +1,22 @@
import os
"""
Helper script to move n_gram csvs to separate folders. Just copy this into the
folder containing the n-grams and execute it. Change n to number of N in N-grams.
"""
import shutil

# Number of N in N-grams, i.e. how many "<k>_grams" folders are filled.
n = 5


def move_n_gram_csvs(directory, n_gram_count):
    """
    Moves the csv files found in directory into the sub folders "1_grams",
    "2_grams", ... up to "<n_gram_count>_grams".

    The csv file names are sorted; the file at sorted position i is moved to
    the folder with index i modulo n_gram_count. Missing folders are created.
    The name of every moved file is printed.
    """
    csv_names = sorted(name for name in os.listdir(directory)
                       if name.endswith(".csv"))
    folder_names = ["{}_grams".format(order)
                    for order in range(1, n_gram_count + 1)]
    for folder_name in folder_names:
        os.makedirs(os.path.join(directory, folder_name), exist_ok=True)
    for step, folder_name in enumerate(folder_names):
        for csv_name in csv_names[step::n_gram_count]:
            print(csv_name)
            # The full target path is given so that an already existing file
            # of the same name is replaced.
            shutil.move(os.path.join(directory, csv_name),
                        os.path.join(directory, folder_name, csv_name))


if __name__ == "__main__":
    move_n_gram_csvs(os.getcwd(), n)

View File

@ -0,0 +1,21 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import configparser
def update_config(file_name, section, key, value):
    """
    This function updates a config file identified by file_name. Updates the
    data of one key value pair in a specific section and writes the whole
    configuration back to the file.

    Raises configparser.NoSectionError if the section does not exist. In that
    case the file is left untouched.
    """
    config = configparser.ConfigParser()
    config.read(file_name)
    # Set the value before the file is opened for writing: opening with "w"
    # truncates the file, so a failing set() would otherwise leave an empty
    # config file behind.
    config.set(section, key, value)
    with open(file_name, "w") as config_file:
        config.write(config_file)
if __name__ == '__main__':
    # update_config() needs all four arguments, so they are taken from the
    # command line instead of calling the function without arguments.
    import sys
    if len(sys.argv) != 5:
        sys.exit("usage: {} FILE_NAME SECTION KEY VALUE".format(sys.argv[0]))
    update_config(*sys.argv[1:5])