#!/usr/bin/env python
# -*- coding: utf-8 -*-

import configparser
import re
import sys

import de_core_news_sm
from lxml import etree
from tqdm import tqdm

from utility.XMLProtocol import XMLProtocol


def tokenize(files, no_stop_words=False):
    """
    Tokenizes the speeches of the input XML protocols. Stop words can be
    included or excluded via the no_stop_words flag. Tokenized speeches
    will be written into a new element <rede_tokenisiert>. Punctuation is
    always removed. Hyphenated strings are joined before tokenization.

    files: iterable of paths to the input XML protocols.
    no_stop_words: if True, stop words are removed from the tokenized text.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Tokenization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if part.text is not None:
                    tmp_list.append(re.sub(r"_", " ", part.text + "\n"))
                    # Replaces "_" with " ". This is needed because a string
                    # like "Treffsicherheit einer Schrotflinte;_Sie haben
                    # nämlich kaum den Punkt getroffen" will not be
                    # lemmatized correctly by spaCy: "Schrotflinte;_Sie"
                    # will be recognized as one token, which also messes up
                    # the sorted ngram calculation. A "\n" is appended to
                    # every line to help identify hyphenated words.
                part.getparent().remove(part)
            new_text = "".join(tmp_list)
            new_text = re.sub(
                r"(?P<wordend>[a-zßüöä])-\n(?P<wordstart>[a-zßäüö])",
                r"\g<wordend>\g<wordstart>", new_text)
            # Joins hyphenated words back together:
            # "Länderfinanz- ausgleich" --> "Länderfinanzausgleich".
            # Better to do this here because most of the comments and
            # metadata have already been marked. Ignores enumerations like
            # "Finanz-, Handels- und Sicherheitspolitik", but not when they
            # happen at a line break; this is a rare occasion though.
            new_text = re.sub(
                r"(?P<wordend>[a-zßüöä])-\n(?P<wordstart>[A-ZÄÜÖ])",
                r"\g<wordend>-\g<wordstart>", new_text)
            # Removes the remaining line breaks after hyphens, so that
            # compound names with a line break in between, like
            # "Sütterlin-\nWaack", will be recognized as one string by
            # spaCy --> "Sütterlin-Waack".
            tokenized_speech = etree.Element("rede_tokenisiert")
            doc = nlp(new_text)
            if not no_stop_words:
                tokenized = " ".join([token.text for token in doc
                                      if token.pos_ != "PUNCT"])
                filename_suffix = "_tokenized_with_stopwords.xml"
            else:
                tokenized = " ".join([token.text for token in doc
                                      if not token.is_stop
                                      and token.pos_ != "PUNCT"])
                filename_suffix = "_tokenized_without_stopwords.xml"
            tokenized_speech.text = tokenized
            speech.append(tokenized_speech)
        xml.save_to_file(output_path, file_path, "tokenized", "File paths",
                         "nlp_lemmatized_tokenized", filename_suffix)


if __name__ == '__main__':
    # The original call tokenize() lacked the required files argument.
    # As a minimal fix, the protocol paths are assumed to be passed on
    # the command line.
    tokenize(sys.argv[1:])