Initial commit
BIN  bundesdata_markup_nlp/nlp/__pycache__/n_grams.cpython-37.pyc  (new file, binary not shown)
BIN  bundesdata_markup_nlp/nlp/__pycache__/tokenize.cpython-37.pyc  (new file, binary not shown)
84  bundesdata_markup_nlp/nlp/lemmatization.py  (new executable file)
@@ -0,0 +1,84 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import de_core_news_sm
import configparser
from utility.XMLProtocol import XMLProtocol
from lxml import etree
from tqdm import tqdm
import re


def lemmatization(files, no_stop_words=False):
    """
    Lemmatizes the speeches of the input XML protocols with spacy's built-in
    lookup-table function. Can include or exclude stop words.
    The lemmatized text is written into a new element <rede_lemmatisiert>.
    Punctuation is always removed. Hyphenated strings are joined before they
    are lemmatized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Lemmatization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if part.text is not None:
                    # Replaces "_" with " ". This is needed because a string
                    # like "Treffsicherheit einer Schrotflinte;_Sie haben
                    # nämlich kaum den Punkt getroffen" is not lemmatized
                    # correctly by spacy: "Schrotflinte;_Sie" is recognized as
                    # one token, which also messes up the sorted ngram
                    # calculation. A "\n" is appended to every line to help
                    # identify hyphenated words.
                    tmp_list.append(re.sub(r"_", " ", str(part.text + "\n")))
                part.getparent().remove(part)
            new_text = "".join(tmp_list)
            # Joins hyphenated words: 'Länderfinanz-\nausgleich' -->
            # 'Länderfinanzausgleich'. Better to do this here because most of
            # the comments and metadata have already been marked. Ignores
            # strings like 'Finanz-, Handels- und Sicherheitspolitik', except
            # when they break across a line, which is a rare occasion.
            new_text = re.sub(
                r"(?P<wordend>[a-zßäüö])(?P<replace>-\n)(?P<wordstart>[a-zßäüö])",
                r"\g<wordend>\g<wordstart>", new_text)
            # Removes the remaining hyphen line breaks so that compound names
            # split across lines, like "Sütterlin-\nWaack", are recognized by
            # spacy as one string: --> "Sütterlin-Waack".
            new_text = re.sub(
                r"(?P<wordend>[a-zßäüö])(?P<replace>-\n)(?P<wordstart>[A-ZÄÜÖ])",
                r"\g<wordend>-\g<wordstart>", new_text)
            lemmatized_speech = etree.Element("rede_lemmatisiert")
            doc = nlp(new_text)
            if no_stop_words is False:
                # "_" has to be filtered out because spacy treats it as a
                # special character.
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.pos_ != "PUNCT"
                                       and token.text != "_"])
                filename_suffix = "_lemmatized_with_stopwords.xml"
            elif no_stop_words is True:
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.is_stop is False
                                       and token.pos_ != "PUNCT"
                                       and token.text != "_"])
                filename_suffix = "_lemmatized_without_stopwords.xml"
            lemmatized_speech.text = lemmatized
            speech.append(lemmatized_speech)
        xml.save_to_file(output_path, file_path, "lemmatized", "File paths",
                         "nlp_lemmatized_tokenized", filename_suffix)


if __name__ == '__main__':
    lemmatization()
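A minimal usage sketch for this module (not part of the commit): it assumes the package root bundesdata_markup_nlp is on the Python path, that config.ini provides the [File paths] section read above, and that the marked-up protocols live under an illustrative glob path.

# Hypothetical usage; module path and glob pattern are assumptions.
from glob import glob
from nlp.lemmatization import lemmatization

files = glob("output/markup/*.xml")        # marked-up protocol XMLs (illustrative)
lemmatization(files)                       # keeps stop words
lemmatization(files, no_stop_words=True)   # drops stop words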
142  bundesdata_markup_nlp/nlp/n_grams.py  (new executable file)
@@ -0,0 +1,142 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import configparser
import csv
import os
import gc
from utility.XMLProtocol import XMLProtocol
from collections import Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from itertools import groupby, chain
from operator import itemgetter
import locale
locale.setlocale(locale.LC_COLLATE, "C")  # Sets the locale to the portable "C" locale.


def n_grams(files, group_by_feature="year",
            input_type_name="lemmatized_without_stopwords"):
    """
    Calculates 1- to 5-grams for the given input protocols. Can handle either
    lemmatized or non-lemmatized files. Writes the ngrams to tab-separated CSV
    files. One row contains the ngram, its match count, and the year, date,
    rede_id or redner_id, depending on the group key. One file per unigram,
    bigram, trigram etc. and per group key is created (there will be one file
    for unigrams starting with the letter 'A', one for unigrams starting with
    'B', and so on).
    The third parameter is a string set by the user which is added to the file
    names to help distinguish lemmatized from non-lemmatized ngrams etc.
    The more protocols are used as input, the more RAM the script needs.
    For all 4106 protocols, 32 GB of RAM plus a 32 GB swap file were used!
    """
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    output_path = os.path.join(output_path, "n-grams")
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    for step in tqdm(range(1, 6), desc="Current ngram calculating"):
        N_GRAMS = []
        file_name_prefix = str(step) + "_grams"
        counter_vectorizer = CountVectorizer(ngram_range=(step, step),
                                             lowercase=False)
        for file_path in tqdm(sorted(files), desc="File status"):
            xml = XMLProtocol()
            xml.read_xml(file_path)
            feature_year = xml.xml_tree.xpath("@sitzung-datum")[0][-4:]
            feature_month_year = xml.xml_tree.xpath("@sitzung-datum")[0][-7:]
            speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
            for speech in speeches:
                # Gets the id of the current speech.
                feature_rede_id = speech.xpath("@id")
                if len(feature_rede_id) == 0:
                    feature_rede_id = "sitzungsbeginn"
                else:
                    feature_rede_id = feature_rede_id[0]
                # Gets the id of the current speaker.
                feature_redner_id = speech.xpath(".//redner/@id")[0]
                # Gets the speech text (the second child of the speech) from
                # the tokenized or lemmatized protocol.
                speech_text = speech.xpath("node()[2]")[0]
                if speech_text.text is not None:
                    tmp_str = speech_text.text

                ngrams = counter_vectorizer.build_analyzer()
                ngrams_list = ngrams(tmp_str)

                if group_by_feature == "year":
                    pairs = [(pair,) + (feature_year,)
                             for pair in ngrams_list]
                elif group_by_feature == "month_year":
                    pairs = [(pair,) + (feature_month_year,)
                             for pair in ngrams_list]
                elif group_by_feature == "speaker":
                    pairs = [(pair,) + (feature_redner_id,)
                             for pair in ngrams_list]
                elif group_by_feature == "speech":
                    pairs = [(pair,) + (feature_rede_id,)
                             for pair in ngrams_list]
                N_GRAMS.extend(pairs)
            speeches = None
        # Puts the uppercased first letter of each ngram at the first position
        # of the line to sort by it; it is deleted again later on.
        print("Start counting ngrams.")
        N_GRAMS = Counter(N_GRAMS)
        print("Finished counting ngrams.")
        print("Start sorting ngrams.")
        N_GRAMS = [item[0][0][0].upper()
                   + "||"
                   + item[0][0]
                   + "||"
                   + str(item[0][1])
                   + "||"
                   + str(item[1])
                   for item in N_GRAMS.items()]
        N_GRAMS = sorted(N_GRAMS, key=locale.strxfrm)
        print("Finished sorting ngrams.")
        # Sorts all ngrams into groups: one group for each German uppercase
        # letter except ß and one group for every digit from 0 to 9. Other
        # non-ASCII or non-digit ngrams are sorted into groups of their own;
        # these groups are joined later on into one non-ASCII group.
        alphabetically = []
        tmp_list = []
        for letter, entries in tqdm(groupby(N_GRAMS, key=itemgetter(0)),
                                    desc="Grouping ngrams alphabetically"):
            if letter:
                print(letter)
                for entry in entries:
                    tmp_list.append(entry)
                alphabetically.append(tmp_list)
                tmp_list = []
        N_GRAMS = None
        gc.collect()  # Frees RAM.
        key_list = ([i for i in range(10)]
                    + "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split()
                    + ["_Non_ASCII"])
        # Groups all non-ASCII ngrams into one list to save them into one CSV.
        if len(alphabetically) > 37:
            joined_tail = alphabetically[36:]
            joined_tail = chain.from_iterable(list(joined_tail))
            del alphabetically[36:]
            alphabetically.append(joined_tail)
        # Saves the groups to individual files.
        for group, key in tqdm(zip(alphabetically, key_list),
                               desc="Writing ngrams to files"):
            group_ngrams = [entry.split("||")[1:] for entry in group]
            file_name = (str(key)
                         + "_"
                         + file_name_prefix
                         + "_per_"
                         + group_by_feature
                         + "_"
                         + input_type_name
                         + ".csv")
            file_output_path = os.path.join(output_path, file_name)
            with open(file_output_path, "w", newline="", encoding="utf8") as file:
                writer = csv.writer(file, delimiter="\t")
                writer.writerows(group_ngrams)
        alphabetically = None


if __name__ == '__main__':
    n_grams()
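For reference, the per-speech ngram extraction above relies on the analyzer returned by scikit-learn's CountVectorizer.build_analyzer(); a small illustration of what it yields for a bigram setting (the example sentence and the commented output are indicative only, assuming the default token pattern):

# Illustration of the analyzer used above; not part of the commit.
from sklearn.feature_extraction.text import CountVectorizer

analyzer = CountVectorizer(ngram_range=(2, 2), lowercase=False).build_analyzer()
print(analyzer("Die Würde des Menschen ist unantastbar"))
# Expected output with the default token pattern:
# ['Die Würde', 'Würde des', 'des Menschen', 'Menschen ist', 'ist unantastbar']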
78  bundesdata_markup_nlp/nlp/tokenize.py  (new executable file)
@@ -0,0 +1,78 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import de_core_news_sm
import configparser
from utility.XMLProtocol import XMLProtocol
from lxml import etree
from tqdm import tqdm
import re


def tokenize(files, no_stop_words=False):
    """
    Tokenizes the speeches of the input XML protocols. Can include or exclude
    stop words. Tokenized speeches are written into a new element
    <rede_tokenisiert>. Punctuation is always removed. Hyphenated strings are
    joined before they are tokenized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Tokenization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if part.text is not None:
                    # Replaces "_" with " ". This is needed because a string
                    # like "Treffsicherheit einer Schrotflinte;_Sie haben
                    # nämlich kaum den Punkt getroffen" is not tokenized
                    # correctly by spacy: "Schrotflinte;_Sie" is recognized as
                    # one token, which also messes up the sorted ngram
                    # calculation. A "\n" is appended to every line to help
                    # identify hyphenated words.
                    tmp_list.append(re.sub(r"_", " ", str(part.text + "\n")))
                part.getparent().remove(part)
            new_text = "".join(tmp_list)
            # Joins hyphenated words: 'Länderfinanz-\nausgleich' -->
            # 'Länderfinanzausgleich'. Better to do this here because most of
            # the comments and metadata have already been marked. Ignores
            # strings like 'Finanz-, Handels- und Sicherheitspolitik', except
            # when they break across a line, which is a rare occasion.
            new_text = re.sub(
                r"(?P<wordend>[a-zßäüö])(?P<replace>-\n)(?P<wordstart>[a-zßäüö])",
                r"\g<wordend>\g<wordstart>", new_text)
            # Removes the remaining hyphen line breaks so that compound names
            # split across lines, like "Sütterlin-\nWaack", are recognized by
            # spacy as one string: --> "Sütterlin-Waack".
            new_text = re.sub(
                r"(?P<wordend>[a-zßäüö])(?P<replace>-\n)(?P<wordstart>[A-ZÄÜÖ])",
                r"\g<wordend>-\g<wordstart>", new_text)
            tokenized_speech = etree.Element("rede_tokenisiert")
            doc = nlp(new_text)
            if no_stop_words is False:
                tokenized = " ".join([token.text for token in doc
                                      if token.pos_ != "PUNCT"])
                filename_suffix = "_tokenized_with_stopwords.xml"
            elif no_stop_words is True:
                tokenized = " ".join([token.text for token in doc
                                      if token.is_stop is False
                                      and token.pos_ != "PUNCT"])
                filename_suffix = "_tokenized_without_stopwords.xml"
            tokenized_speech.text = tokenized
            speech.append(tokenized_speech)
        xml.save_to_file(output_path, file_path, "tokenized", "File paths",
                         "nlp_lemmatized_tokenized", filename_suffix)


if __name__ == '__main__':
    tokenize()
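A sketch of how the three modules might be chained into one pipeline (module paths, glob patterns and the output layout are assumptions, not part of this commit; the actual output directories come from config.ini):

# Hypothetical pipeline; module paths and file locations are assumptions.
from glob import glob
from nlp.tokenize import tokenize
from nlp.lemmatization import lemmatization
from nlp.n_grams import n_grams

marked_up = glob("output/markup/*.xml")
tokenize(marked_up, no_stop_words=False)      # -> *_tokenized_with_stopwords.xml
lemmatization(marked_up, no_stop_words=True)  # -> *_lemmatized_without_stopwords.xml

lemmatized = glob("output/nlp/lemmatized_tokenized/*_lemmatized_without_stopwords.xml")
n_grams(lemmatized, group_by_feature="year",
        input_type_name="lemmatized_without_stopwords")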