Initial commit

This commit is contained in:
Stephan Porada
2019-02-21 19:29:44 +01:00
commit 4263e5f41e
52 changed files with 3024 additions and 0 deletions

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import de_core_news_sm
import configparser
from utility.XMLProtocol import XMLProtocol
from lxml import etree
from tqdm import tqdm
import re


def lemmatization(files, no_stop_words=False):
    """
    Lemmatizes the speeches of the input XML protocols with the built-in
    spaCy lookup-table function. Can include or exclude stop words.
    The lemmatized text is written into a new element named
    <rede_lemmatisiert>. Always removes punctuation. Joins hyphenated
    strings before they are lemmatized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Lemmatization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if part.text is not None:
                    # Replaces "_" with " ". Needed because a string like
                    # "Treffsicherheit einer Schrotflinte;_Sie haben nämlich
                    # kaum den Punkt getroffen" is not lemmatized correctly
                    # by spaCy: "Schrotflinte;_Sie" is recognized as one
                    # token, which also messes up the sorted ngram
                    # calculation. Appends "\n" to every line to help
                    # identify hyphenated words.
                    tmp_list.append(re.sub(r"_", " ", part.text + "\n"))
                part.getparent().remove(part)
new_text = "".join(tmp_list)
new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[a-ßzäüö])", "\g<wordend>\g<wordstart>", new_text)
"""
joins hyphenated words together:
'Länderfinanz- ausgleich' --> Länderfinanzausgleich.
Better to do it here because most of the comments and metadata has
already been marked.
Ignores strings like: 'Finanz-, Handels- und Sicherheitspolitik'.
Does not ignore them when they happen at a linebreak. This is a rare
occasion though.
"""
new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[A-ZÄÜÖ])", "\g<wordend>-\g<wordstart>", new_text)
"""
Removes all line breaks again. This way compound names with a line
break inbetween like "Sütterlin-\nWaack" will be recognized as one
string by spacy. --> Sütterlin-Waack
"""
            lemmatized_speech = etree.Element("rede_lemmatisiert")
            doc = nlp(new_text)
            if no_stop_words is False:
                # Also drops "_" tokens; spaCy treats "_" as a special
                # character.
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.pos_ != "PUNCT"
                                       and token.text != "_"])
                filename_suffix = "_lemmatized_with_stopwords.xml"
            elif no_stop_words is True:
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.is_stop is False
                                       and token.pos_ != "PUNCT"
                                       and token.text != "_"])
                filename_suffix = "_lemmatized_without_stopwords.xml"
            lemmatized_speech.text = lemmatized
            speech.append(lemmatized_speech)
        xml.save_to_file(output_path, file_path, "lemmatized", "File paths",
                         "nlp_lemmatized_tokenized", filename_suffix)


if __name__ == '__main__':
    lemmatization()
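
Note that the bare lemmatization() call in the entry point fails because files is required. A minimal usage sketch, assuming a hypothetical protocols/ directory and import path (the real file list comes from the surrounding pipeline and config.ini):

    from glob import glob

    from lemmatization import lemmatization  # hypothetical module name

    files = sorted(glob("protocols/*.xml"))  # hypothetical location
    lemmatization(files, no_stop_words=True)
    # -> writes *_lemmatized_without_stopwords.xml below the configured
    #    nlp_output path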

View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import configparser
import csv
import os
import gc
from utility.XMLProtocol import XMLProtocol
from collections import Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from itertools import groupby, chain
from operator import itemgetter
import locale
locale.setlocale(locale.LC_COLLATE, "C")  # Sets the portable "C" locale.


def n_grams(files, group_by_feature="year",
            input_type_name="lemmatized_without_stopwords"):
    """
    Calculates 1- to 5-grams for the given input protocols. Can handle
    lemmatized as well as non-lemmatized files. Writes the ngrams to
    tab-separated CSV files; one row contains the ngram, the grouping key
    (year, month-year, rede_id or redner_id) and the match count.
    One file is created per unigram, bigram, trigram etc. and per group
    key (there will be one file for unigrams starting with the letter
    "A", one for unigrams starting with "B", and so on).
    The third parameter is a string set by the user that is added to the
    file names to help distinguish, e.g., lemmatized from non-lemmatized
    ngrams. The more protocols are used as input, the more RAM the script
    needs: for all 4106 protocols, 32 GB of RAM and a 32 GB swap file
    were used!
    """
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    output_path = os.path.join(output_path, "n-grams")
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    for step in tqdm(range(1, 6), desc="Current ngram calculation"):
        N_GRAMS = []
        file_name_prefix = str(step) + "_grams"
        counter_vectorizer = CountVectorizer(ngram_range=(step, step),
                                             lowercase=False)
        for file_path in tqdm(sorted(files), desc="File status"):
            xml = XMLProtocol()
            xml.read_xml(file_path)
            feature_year = xml.xml_tree.xpath("@sitzung-datum")[0][-4:]
            feature_month_year = xml.xml_tree.xpath("@sitzung-datum")[0][-7:]
            speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
            for speech in speeches:
                # Gets the id of the current speech.
                feature_rede_id = speech.xpath("@id")
                if len(feature_rede_id) == 0:
                    feature_rede_id = "sitzungsbeginn"
                else:
                    feature_rede_id = feature_rede_id[0]
                # Gets the id of the current speaker.
                feature_redner_id = speech.xpath(".//redner/@id")[0]
                # Gets the speech text (the second child of the speech)
                # from the tokenized or lemmatized protocol.
                speech_text = speech.xpath("node()[2]")[0]
                if speech_text.text is not None:
                    tmp_str = speech_text.text
                    ngrams = counter_vectorizer.build_analyzer()
                    ngrams_list = ngrams(tmp_str)
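                    # Illustration: for step=2 the analyzer built above
                    # yields overlapping bigrams, e.g.
                    # ngrams("Der Bundestag tagt heute") returns
                    # ["Der Bundestag", "Bundestag tagt", "tagt heute"].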
                    if group_by_feature == "year":
                        pairs = [(pair, feature_year)
                                 for pair in ngrams_list]
                    elif group_by_feature == "month_year":
                        pairs = [(pair, feature_month_year)
                                 for pair in ngrams_list]
                    elif group_by_feature == "speaker":
                        pairs = [(pair, feature_redner_id)
                                 for pair in ngrams_list]
                    elif group_by_feature == "speech":
                        pairs = [(pair, feature_rede_id)
                                 for pair in ngrams_list]
                    N_GRAMS.extend(pairs)
            speeches = None
        # The uppercased first character of each ngram is put at the first
        # position of each line to sort and group by it; it is deleted
        # again before writing.
        print("Start counting ngrams.")
        N_GRAMS = Counter(N_GRAMS)
        print("Finished counting ngrams.")
        print("Start sorting ngrams.")
        N_GRAMS = [item[0][0][0].upper()
                   + "||"
                   + item[0][0]
                   + "||"
                   + str(item[0][1])
                   + "||"
                   + str(item[1])
                   for item in N_GRAMS.items()]
        N_GRAMS = sorted(N_GRAMS, key=locale.strxfrm)
        print("Finished sorting ngrams.")
        # Sorts all ngrams into groups: one group for each German
        # uppercase letter except ß and one group for each digit from
        # 0 to 9. Other non-ASCII, non-digit ngrams end up in groups of
        # their own; these are joined into one non-ASCII group later on.
        alphabetically = []
        tmp_list = []
        for letter, entries in tqdm(groupby(N_GRAMS, key=itemgetter(0)),
                                    desc="Grouping ngrams alphabetically"):
            if letter:
                print(letter)
            for entry in entries:
                tmp_list.append(entry)
            alphabetically.append(tmp_list)
            tmp_list = []
        N_GRAMS = None
        gc.collect()  # Frees RAM.
        key_list = ([i for i in range(10)]
                    + "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z"
                    .split()
                    + ["_Non_ASCII"])
        # Joins all non-ASCII ngram groups into one list to save them into
        # a single CSV file.
        if len(alphabetically) > 37:
            joined_tail = alphabetically[36:]
            joined_tail = chain.from_iterable(list(joined_tail))
            del alphabetically[36:]
            alphabetically.append(joined_tail)
        # Saves each group to an individual file.
        for group, key in tqdm(zip(alphabetically, key_list),
                               desc="Writing ngrams to files"):
            group_ngrams = [entry.split("||")[1:] for entry in group]
            file_name = (str(key)
                         + "_"
                         + file_name_prefix
                         + "_per_"
                         + group_by_feature
                         + "_"
                         + input_type_name
                         + ".csv")
            file_output_path = os.path.join(output_path, file_name)
            with open(file_output_path, "w", newline="",
                      encoding="utf8") as file:
                writer = csv.writer(file, delimiter="\t")
                writer.writerows(group_ngrams)
        alphabetically = None


if __name__ == '__main__':
    n_grams()
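
The "||" sort key and the C-locale grouping are the least obvious part of the script. A standalone sketch of just that step (the counts are made up for illustration, not real data) shows the mechanism and why non-ASCII ngrams end up behind "Z":

    import locale
    from collections import Counter
    from itertools import groupby
    from operator import itemgetter

    locale.setlocale(locale.LC_COLLATE, "C")
    # Made-up counts in the shape the script builds: {(ngram, key): n}.
    counts = Counter({("Innere Sicherheit", "1998"): 2,
                      ("Äußere Sicherheit", "1998"): 1})
    lines = [ngram[0].upper() + "||" + ngram + "||" + year + "||" + str(n)
             for (ngram, year), n in counts.items()]
    lines = sorted(lines, key=locale.strxfrm)
    for letter, entries in groupby(lines, key=itemgetter(0)):
        print(letter, [e.split("||")[1:] for e in entries])
    # I [['Innere Sicherheit', '1998', '2']]
    # Ä [['Äußere Sicherheit', '1998', '1']]  <- "Ä" sorts after "Z" in the
    # C locale, which is why such groups are merged into the non-ASCII file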

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import de_core_news_sm
import configparser
from utility.XMLProtocol import XMLProtocol
from lxml import etree
from tqdm import tqdm
import re


def tokenize(files, no_stop_words=False):
    """
    Tokenizes the speeches of the input XML protocols. Can include or
    exclude stop words. Tokenized speeches are written into a new element
    <rede_tokenisiert>. Always removes punctuation. Joins hyphenated
    strings before they are tokenized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Tokenization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if part.text is not None:
                    # Replaces "_" with " ". Needed because a string like
                    # "Treffsicherheit einer Schrotflinte;_Sie haben nämlich
                    # kaum den Punkt getroffen" is not tokenized correctly
                    # by spaCy: "Schrotflinte;_Sie" is recognized as one
                    # token, which also messes up the sorted ngram
                    # calculation. Appends "\n" to every line to help
                    # identify hyphenated words.
                    tmp_list.append(re.sub(r"_", " ", part.text + "\n"))
                part.getparent().remove(part)
new_text = "".join(tmp_list)
new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[a-ßzäüö])", "\g<wordend>\g<wordstart>", new_text)
"""
joins hyphenated words together:
'Länderfinanz- ausgleich' --> Länderfinanzausgleich.
Better to do it here because most of the comments and metadata has
already been marked.
Ignores strings like: 'Finanz-, Handels- und Sicherheitspolitik'.
Does not ignore them when they happen at a linebreak. This is a rare
occasion though.
"""
new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[A-ZÄÜÖ])", "\g<wordend>-\g<wordstart>", new_text)
"""
Removes all line breaks again. This way compound names with a line
break inbetween like "Sütterlin-\nWaack" will be recognized as one
string by spacy. --> Sütterlin-Waack
"""
            tokenized_speech = etree.Element("rede_tokenisiert")
            doc = nlp(new_text)
            if no_stop_words is False:
                tokenized = " ".join([token.text for token in doc
                                      if token.pos_ != "PUNCT"])
                filename_suffix = "_tokenized_with_stopwords.xml"
            elif no_stop_words is True:
                tokenized = " ".join([token.text for token in doc
                                      if token.is_stop is False
                                      and token.pos_ != "PUNCT"])
                filename_suffix = "_tokenized_without_stopwords.xml"
            tokenized_speech.text = tokenized
            speech.append(tokenized_speech)
        xml.save_to_file(output_path, file_path, "tokenized", "File paths",
                         "nlp_lemmatized_tokenized", filename_suffix)


if __name__ == '__main__':
    tokenize()
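
The two hyphenation regexes above are shared verbatim with the lemmatization script. A short standalone check (sample strings invented for illustration) of what they join and what they leave alone:

    import re

    text = "Länderfinanz-\nausgleich und Sütterlin-\nWaack"
    # 1) Lowercase after the hyphen: drop hyphen and line break entirely.
    text = re.sub(r"(?P<wordend>[a-zßäüö])-\n(?P<wordstart>[a-zßäüö])",
                  r"\g<wordend>\g<wordstart>", text)
    # 2) Uppercase after the hyphen: keep the hyphen, drop the line break.
    text = re.sub(r"(?P<wordend>[a-zßäüö])-\n(?P<wordstart>[A-ZÄÜÖ])",
                  r"\g<wordend>-\g<wordstart>", text)
    print(text)  # Länderfinanzausgleich und Sütterlin-Waack
    # "Finanz-, Handels- und Sicherheitspolitik" stays untouched because
    # no line break follows those hyphens.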