Initial commit
This commit is contained in:
35
bundesdata_markup_nlp/utility/FileGetter.py
Executable file
35
bundesdata_markup_nlp/utility/FileGetter.py
Executable file
@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import fnmatch
|
||||
|
||||
"""
|
||||
This class is for getting filepaths of all files in a given directory. Also
|
||||
gets files in subdirectories.
|
||||
"""
|
||||
|
||||
|
||||
class FileGetter(object):
    """
    Collects the paths of files below a root directory whose names match a
    glob-style pattern, so they can be opened and/or processed later on.
    """

    def __init__(self, path, file_type):
        """
        Remembers the root directory (path) and the fnmatch pattern
        (file_type, e.g. "*.xml") used by get_files.
        """
        super(FileGetter, self).__init__()
        self.file_type = file_type
        self.path = path

    def get_files(self):
        """
        Walks self.path including all of its sub directories and returns a
        list with the joined path of every file whose name matches
        self.file_type. The list is also kept in self.list_of_files.
        """
        matches = [
            os.path.join(current_dir, file_name)
            for current_dir, _subdirs, file_names in os.walk(self.path)
            for file_name in fnmatch.filter(file_names, self.file_type)
        ]
        self.list_of_files = matches
        return matches
|
209
bundesdata_markup_nlp/utility/XMLProtocol.py
Executable file
209
bundesdata_markup_nlp/utility/XMLProtocol.py
Executable file
@ -0,0 +1,209 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from utility import delete_folder
|
||||
from utility import update_config
|
||||
from xml.etree import ElementTree
|
||||
from os import path
|
||||
from lxml import etree
|
||||
import os
|
||||
import logging
|
||||
import re
|
||||
|
||||
|
||||
class XMLProtocol(object):
    """Class for standard operations on/with the XML protocols. Has functions
    for reading, saving and manipulating an XML protocol. All other classes
    inherit from this one.

    Most methods work on self.xml_tree (set by read_xml) and, when
    check_child is used, on self.regex_compiled (set by compile_regex).
    """

    def __init__(self):
        """
        Sets up the logger used by the methods of this class.
        """
        super().__init__()
        self.logger = logging.getLogger(__name__)

    def read_protcol(self, file_path):
        """
        Takes a file path, parses the file as XML and returns the root
        element. Also stores file_path, its base name (self.filename) and the
        parsed tree (self.tree) on the instance.
        """
        self.file_path = file_path
        self.filename = os.path.basename(self.file_path)
        # Blank text nodes are removed for better xml indentation later on.
        parser = etree.XMLParser(remove_blank_text=True)
        self.tree = etree.parse(file_path, parser)
        root = self.tree.getroot()
        self.logger.info("File successfully parsed as XML.")
        return root

    def read_xml(self, file_path):
        """
        Takes a file path, parses the file as XML (utf-8) and stores the root
        element in self.xml_tree. Returns nothing.
        """
        # Blank text nodes are removed for better xml indentation later on.
        parser = etree.XMLParser(encoding='utf-8', remove_blank_text=True)
        tree = etree.parse(file_path, parser)
        self.xml_tree = tree.getroot()

    def save_to_file(self, output_path, file_path, subfolder, config_section,
                     config_key, filename_sufix=""):
        """
        Writes self.xml_tree to a new xml file in output_path/subfolder and
        creates that folder (including missing parents) if necessary. Also
        updates config_key in config_section of "config.ini" with the folder
        path.

        The file name is the base name of file_path. If filename_sufix is
        given, the extension of the base name is replaced by filename_sufix.
        """
        base_name = path.basename(file_path)
        if filename_sufix == "":
            self.filename = base_name
        else:
            # splitext instead of a fixed [:-4] slice, so extensions that are
            # not exactly four characters long are handled correctly too.
            self.filename = path.splitext(base_name)[0] + filename_sufix
        save_path = os.path.join(output_path, subfolder)
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(save_path, exist_ok=True)
        tree = etree.ElementTree(self.xml_tree)
        save_file_path = os.path.join(save_path, self.filename)
        tree.write(save_file_path,
                   pretty_print=True,
                   xml_declaration=True,
                   encoding="utf8",
                   doctype="<!DOCTYPE dbtplenarprotokoll SYSTEM 'dbtplenarprotokoll_minimal.dtd'>")
        self.logger.info("New XML saved to:%s", save_file_path)
        update_config.update_config("config.ini", config_section, config_key,
                                    save_path)

    def _run_html_beautify(self, target_path, alter_lines, line_width):
        """
        Runs the external html-beautify tool on target_path. With alter_lines
        the lines are additionally wrapped at line_width and existing
        newlines are not preserved.
        """
        # Local import so this class does not rely on a module level import.
        import subprocess

        command = ["html-beautify", "-r", "-q"]
        if alter_lines:
            command += ["-w", str(line_width), "--no-preserve-newlines"]
        command.append(str(target_path))
        try:
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact. The exit code is ignored on purpose,
            # like it was with os.system.
            subprocess.run(command, check=False)
        except OSError as error:
            # Stay best effort when the tool is not installed.
            self.logger.warning("Could not run html-beautify: %s", error)

    def beautify_xml_part(self, file_path, xpath, alter_lines=False,
                          line_width=80):
        """
        Beautifies part (element node) of an input XML.

        The element found by xpath in self.xml_tree is written to "tmp.xml"
        in a "tmp" folder next to file_path, run through html-beautify and
        parsed again. The resulting root element is stored in
        self.beautified_part. The "tmp" folder is deleted afterwards, also if
        one of the steps fails.
        """
        tmp_path = os.path.join(os.path.dirname(file_path), "tmp")
        tree = etree.ElementTree(self.xml_tree)
        part = tree.find(xpath)
        # Serialising and parsing again gives an element that is independent
        # of self.xml_tree.
        part = etree.fromstring(ElementTree.tostring(part))
        self.beautified_part = etree.ElementTree(part)
        os.makedirs(tmp_path, exist_ok=True)
        try:
            tmp_file_path = os.path.join(tmp_path, "tmp.xml")
            self.beautified_part.write(tmp_file_path,
                                       pretty_print=True,
                                       xml_declaration=True,
                                       encoding="utf8")
            self._run_html_beautify(tmp_file_path, alter_lines, line_width)
            self.beautified_part = etree.parse(tmp_file_path).getroot()
            update_config.update_config("config.ini", "File paths",
                                        "tmp_path", tmp_path)
        finally:
            delete_folder.delete_folder(tmp_path)

    def beautify_xml(self, file_path, alter_lines=False, line_width=80):
        """
        Runs html-beautify on the file at file_path. With alter_lines the
        lines are wrapped at line_width.
        """
        self._run_html_beautify(file_path, alter_lines, line_width)

    def _first_child_matches(self, element):
        """
        Returns True if the text of the first child of element is matched by
        self.regex_compiled. Elements without children or without text in
        the first child count as not matching.
        """
        if len(element) == 0:
            return False
        child_text = element[0].text
        if child_text is None:
            return False
        return self.regex_compiled.search(child_text) is not None

    def expand_element(self, element_to_expand, expand_attr_key,
                       expand_attr_value, check_child=True):
        """
        Takes a path expression (element_to_expand) and adds the attribute
        expand_attr_key with the value expand_attr_value to every element
        found in self.xml_tree.

        Per default (check_child=True) an element is only expanded if the
        text of its first child matches the regex set with compile_regex.
        Set check_child to False to expand every found element.
        """
        for element in self.xml_tree.findall(element_to_expand):
            if check_child is not True or self._first_child_matches(element):
                element.set(expand_attr_key, expand_attr_value)

    def replace_tag_name(self, element_to_replace, tag_name, check_child=True):
        """
        Replaces the tag name of the elements found by the given path
        expression with tag_name. With check_child=True only elements whose
        first child text matches the compiled regex are renamed.
        """
        for element in self.xml_tree.findall(element_to_replace):
            if check_child is not True or self._first_child_matches(element):
                element.tag = tag_name

    def replace_tag_attr(self, element_to_replace, tag_name, attr_key,
                         attr_value, check_child=True):
        """
        Replaces the tag name of the elements found by the given path
        expression with tag_name and sets the attribute attr_key to
        attr_value. With check_child=True only elements whose first child
        text matches the compiled regex are changed, like in expand_element.
        """
        for element in self.xml_tree.findall(element_to_replace):
            if check_child is not True or self._first_child_matches(element):
                element.tag = tag_name
                element.set(attr_key, attr_value)

    def replace_elements(self, elements_to_replace, replacment_elements,
                         keep_parent_text=False):
        """
        Replaces the elements identified by the path expression
        elements_to_replace with the elements in replacment_elements, pair by
        pair. Nothing is replaced and a warning is logged if both do not have
        the same length.

        With keep_parent_text the text nodes of the parents are collected
        before replacing and assigned as tail to the elements found by the
        same expression after replacing.
        """
        elements = self.xml_tree.findall(elements_to_replace)
        if len(elements) != len(replacment_elements):
            self.logger.warning(
                "Elements missmatch. There are %s that should be repalced."
                " There are %s present. No elements have been replaced.",
                len(elements), len(replacment_elements))
            return
        keep_text = keep_parent_text is not False
        if keep_text:
            # Has to be read before replacing. Only evaluated when needed.
            parents_text_xpath = elements_to_replace + "/parent::node()/text()"
            elements_text = self.xml_tree.xpath(parents_text_xpath)
        for element, replacement_element in zip(elements, replacment_elements):
            element.getparent().replace(element, replacement_element)
        if keep_text:
            elements = self.xml_tree.findall(elements_to_replace)
            for element, text in zip(elements, elements_text):
                element.tail = text

    def compile_regex(self, regex):
        """
        Takes the input regex string and compiles it (with re.MULTILINE) for
        better performance and readability. The string is kept in
        self.regex_string, the compiled pattern in self.regex_compiled.
        """
        self.regex_string = regex
        self.regex_compiled = re.compile(self.regex_string, re.MULTILINE)

    def clean_text(self, regex, xpath, replacement_string=""):
        """
        Replaces regex matches with nothing by default or with
        replacement_string in the text of every element matched by the xpath
        in self.xml_tree. Works with matchgroups. Elements without text are
        skipped.
        """
        for element in self.xml_tree.xpath(xpath):
            if element.text is None:
                continue
            element.text = re.sub(regex, replacement_string, element.text)
|
0
bundesdata_markup_nlp/utility/__init__.py
Executable file
0
bundesdata_markup_nlp/utility/__init__.py
Executable file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
15
bundesdata_markup_nlp/utility/delete_folder.py
Executable file
15
bundesdata_markup_nlp/utility/delete_folder.py
Executable file
@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import shutil
|
||||
|
||||
|
||||
def delete_folder(folder_path):
    """
    Removes the folder at folder_path together with everything inside it.
    """
    shutil.rmtree(path=folder_path)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: delete the folder named on the command line.
    # delete_folder needs a path, so calling it without one always failed.
    import sys

    if len(sys.argv) != 2:
        sys.exit("Usage: delete_folder.py FOLDER_PATH")
    delete_folder(sys.argv[1])
|
22
bundesdata_markup_nlp/utility/move_ngrams.py
Executable file
22
bundesdata_markup_nlp/utility/move_ngrams.py
Executable file
@ -0,0 +1,22 @@
|
||||
import os
|
||||
|
||||
"""
|
||||
Helper script to move n_gram csvs to separate folders. Just copy this into the
|
||||
folder containing the n-grams and execute it. Change n to number of N in N-grams.
|
||||
"""
|
||||
# Folder the script is executed in; it is expected to contain the n-gram csvs.
current_path = os.getcwd()
# Number of N in N-grams. Change this to match the data.
n = 5
# Sorted names of all csv files directly inside current_path.
files = sorted(
    name for name in os.listdir(current_path) if name.endswith(".csv")
)

# One target folder per n-gram size, limited to the first n sizes.
dir_list = ["1_grams", "2_grams", "3_grams", "4_grams", "5_grams"][:n]
for target_dir in dir_list:
    # Created directly instead of via a shell command; exist_ok allows
    # running the script again in the same folder.
    os.makedirs(target_dir, exist_ok=True)

# Every n-th file of the sorted list, starting at offset step, is moved into
# the folder at position step.
for step, target_dir in enumerate(dir_list):
    for file_name in files[step::n]:
        print(file_name)
        # Moving without a shell keeps file names with spaces or special
        # characters intact. An existing target is overwritten.
        os.replace(file_name, os.path.join(target_dir, file_name))
|
21
bundesdata_markup_nlp/utility/update_config.py
Executable file
21
bundesdata_markup_nlp/utility/update_config.py
Executable file
@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import configparser
|
||||
|
||||
|
||||
def update_config(file_name, section, key, value):
    """
    Updates a config file identified by file_name. Sets the value of one key
    in a specific section and writes the whole config back to the file.

    Raises configparser.NoSectionError if the section does not exist. In
    that case the file is left untouched.
    """
    config = configparser.ConfigParser()
    config.read(file_name)
    # Apply the change in memory before opening the file for writing:
    # opening with "w" truncates it, so a failing set() would otherwise
    # leave an empty config file behind.
    config.set(section, key, value)
    with open(file_name, "w") as config_file:
        config.write(config_file)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: take the four arguments from the command line.
    # update_config needs all of them, so calling it without any always
    # failed.
    import sys

    if len(sys.argv) != 5:
        sys.exit("Usage: update_config.py FILE_NAME SECTION KEY VALUE")
    update_config(*sys.argv[1:])
|
Reference in New Issue
Block a user