bundesdata_markup_nlp_software/bundesdata_markup_nlp/nlp/lemmatization.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import configparser
import re

import de_core_news_sm
from lxml import etree
from tqdm import tqdm

from utility.XMLProtocol import XMLProtocol


def lemmatization(files, no_stop_words=False):
"""
Lemmatizes the speeches of the input XML protocols with the built in spacy
lookup-table function. Can include or exclude stop words.
Lemmatized text will be written into an new Element named
<rede_lemmatisiert>. Always removes punctuation. Joines hyphenated strings
before they will be lemmatised.
"""
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Lemmatization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if part.text is not None:
                    # Replaces "_" with " ". This is needed because a string
                    # like "Treffsicherheit einer Schrotflinte;_Sie haben
                    # nämlich kaum den Punkt getroffen" is not lemmatized
                    # correctly by spaCy: "Schrotflinte;_Sie" is recognized
                    # as one token, which also messes up the sorted n-gram
                    # calculation. A "\n" is appended to every line to help
                    # identify hyphenated words later on.
                    tmp_list.append(re.sub(r"_", " ", part.text + "\n"))
                part.getparent().remove(part)
new_text = "".join(tmp_list)
new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[a-ßzäüö])", "\g<wordend>\g<wordstart>", new_text)
"""
joins hyphenated words together:
'Länderfinanz- ausgleich' --> Länderfinanzausgleich.
Better to do it here because most of the comments and metadata has
already been marked.
Ignores strings like: 'Finanz-, Handels- und Sicherheitspolitik'.
Does not ignore them when they happen at a linebreak. This is a rare
occasion though.
"""
            # Removes the remaining line breaks inside hyphenated compound
            # names, so that a name like "Sütterlin-\nWaack" is recognized
            # by spaCy as the single string "Sütterlin-Waack".
            new_text = re.sub(r"(?P<wordend>[a-zßäüö])(?P<replace>-\n)"
                              r"(?P<wordstart>[A-ZÄÜÖ])",
                              r"\g<wordend>-\g<wordstart>", new_text)
            lemmatized_speech = etree.Element("rede_lemmatisiert")
            doc = nlp(new_text)
            if no_stop_words is False:
                # "_" also has to be filtered out of the lemmas because
                # spaCy treats it as a kind of special character.
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.pos_ != "PUNCT"
                                       and token.text != "_"])
                filename_suffix = "_lemmatized_with_stopwords.xml"
            else:
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.is_stop is False
                                       and token.pos_ != "PUNCT"
                                       and token.text != "_"])
                filename_suffix = "_lemmatized_without_stopwords.xml"
            lemmatized_speech.text = lemmatized
            speech.append(lemmatized_speech)
        xml.save_to_file(output_path, file_path, "lemmatized", "File paths",
                         "nlp_lemmatized_tokenized", filename_suffix)


if __name__ == '__main__':
    # Assumes the protocol file paths are passed on the command line.
    import sys
    lemmatization(sys.argv[1:])
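
# Illustrative stand-alone invocation (hypothetical file names; within the
# pipeline, the file list and config.ini are provided by the calling
# scripts):
#
#     python lemmatization.py nlp_output/01001.xml nlp_output/01002.xml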