Initial commit

This commit is contained in:
Stephan Porada
2019-02-21 19:29:44 +01:00
commit 4263e5f41e
52 changed files with 3024 additions and 0 deletions

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import de_core_news_sm
import configparser
from utility.XMLProtocol import XMLProtocol
from lxml import etree
from tqdm import tqdm
import re


def lemmatization(files, no_stop_words=False):
    """
    Lemmatizes the speeches of the input XML protocols with the built-in
    spaCy lookup-table function. Can include or exclude stop words.
    The lemmatized text is written into a new element named
    <rede_lemmatisiert>. Always removes punctuation. Joins hyphenated
    strings before they are lemmatized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Lemmatization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if part.text is not None:
                    # Replaces "_" with " ". Needed because a string like
                    # "Treffsicherheit einer Schrotflinte;_Sie haben nämlich
                    # kaum den Punkt getroffen" is not lemmatized correctly
                    # by spaCy: "Schrotflinte;_Sie" is recognized as one
                    # token, which also messes up the sorted ngram
                    # calculation. Appends "\n" to every line to help
                    # identify hyphenated words.
                    tmp_list.append(re.sub(r"_", " ", part.text + "\n"))
                part.getparent().remove(part)
new_text = "".join(tmp_list)
new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[a-ßzäüö])", "\g<wordend>\g<wordstart>", new_text)
"""
joins hyphenated words together:
'Länderfinanz- ausgleich' --> Länderfinanzausgleich.
Better to do it here because most of the comments and metadata has
already been marked.
Ignores strings like: 'Finanz-, Handels- und Sicherheitspolitik'.
Does not ignore them when they happen at a linebreak. This is a rare
occasion though.
"""
new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[A-ZÄÜÖ])", "\g<wordend>-\g<wordstart>", new_text)
"""
Removes all line breaks again. This way compound names with a line
break inbetween like "Sütterlin-\nWaack" will be recognized as one
string by spacy. --> Sütterlin-Waack
"""
            lemmatized_speech = etree.Element("rede_lemmatisiert")
            doc = nlp(new_text)
            if no_stop_words is False:
                # Also drops "_" tokens; spaCy treats "_" as a special
                # character.
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.pos_ != "PUNCT"
                                       and token.text != "_"])
                filename_suffix = "_lemmatized_with_stopwords.xml"
            elif no_stop_words is True:
                lemmatized = " ".join([token.lemma_ for token in doc
                                       if token.is_stop is False
                                       and token.pos_ != "PUNCT"
                                       and token.text != "_"])
                filename_suffix = "_lemmatized_without_stopwords.xml"
            lemmatized_speech.text = lemmatized
            speech.append(lemmatized_speech)
        xml.save_to_file(output_path, file_path, "lemmatized", "File paths",
                         "nlp_lemmatized_tokenized", filename_suffix)


if __name__ == '__main__':
    lemmatization()
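
Note that the bare lemmatization() call in the entry point fails because files is required. A minimal usage sketch, assuming a hypothetical protocols/ directory and import path (the real file list comes from the surrounding pipeline and config.ini):

    from glob import glob

    from lemmatization import lemmatization  # hypothetical module name

    files = sorted(glob("protocols/*.xml"))  # hypothetical location
    lemmatization(files, no_stop_words=True)
    # -> writes *_lemmatized_without_stopwords.xml below the configured
    #    nlp_output path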

View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import configparser
import csv
import os
import gc
from utility.XMLProtocol import XMLProtocol
from collections import Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from itertools import groupby, chain
from operator import itemgetter
import locale
locale.setlocale(locale.LC_COLLATE, "C")  # Sets the portable "C" locale.


def n_grams(files, group_by_feature="year",
            input_type_name="lemmatized_without_stopwords"):
    """
    Calculates 1- to 5-grams for the given input protocols. Can handle
    lemmatized as well as non-lemmatized files. Writes the ngrams to
    tab-separated CSV files; one row contains the ngram, the grouping key
    (year, month-year, rede_id or redner_id) and the match count.
    One file is created per unigram, bigram, trigram etc. and per group
    key (there will be one file for unigrams starting with the letter
    "A", one for unigrams starting with "B", and so on).
    The third parameter is a string set by the user that is added to the
    file names to help distinguish, e.g., lemmatized from non-lemmatized
    ngrams. The more protocols are used as input, the more RAM the script
    needs: for all 4106 protocols, 32 GB of RAM and a 32 GB swap file
    were used!
    """
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    output_path = os.path.join(output_path, "n-grams")
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    for step in tqdm(range(1, 6), desc="Current ngram calculation"):
        N_GRAMS = []
        file_name_prefix = str(step) + "_grams"
        counter_vectorizer = CountVectorizer(ngram_range=(step, step),
                                             lowercase=False)
        for file_path in tqdm(sorted(files), desc="File status"):
            xml = XMLProtocol()
            xml.read_xml(file_path)
            feature_year = xml.xml_tree.xpath("@sitzung-datum")[0][-4:]
            feature_month_year = xml.xml_tree.xpath("@sitzung-datum")[0][-7:]
            speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
            for speech in speeches:
                # Gets the id of the current speech.
                feature_rede_id = speech.xpath("@id")
                if len(feature_rede_id) == 0:
                    feature_rede_id = "sitzungsbeginn"
                else:
                    feature_rede_id = feature_rede_id[0]
                # Gets the id of the current speaker.
                feature_redner_id = speech.xpath(".//redner/@id")[0]
                # Gets the speech text (the second child of the speech)
                # from the tokenized or lemmatized protocol.
                speech_text = speech.xpath("node()[2]")[0]
                if speech_text.text is not None:
                    tmp_str = speech_text.text
                    ngrams = counter_vectorizer.build_analyzer()
                    ngrams_list = ngrams(tmp_str)
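                    # Illustration: for step=2 the analyzer built above
                    # yields overlapping bigrams, e.g.
                    # ngrams("Der Bundestag tagt heute") returns
                    # ["Der Bundestag", "Bundestag tagt", "tagt heute"].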
                    if group_by_feature == "year":
                        pairs = [(pair, feature_year)
                                 for pair in ngrams_list]
                    elif group_by_feature == "month_year":
                        pairs = [(pair, feature_month_year)
                                 for pair in ngrams_list]
                    elif group_by_feature == "speaker":
                        pairs = [(pair, feature_redner_id)
                                 for pair in ngrams_list]
                    elif group_by_feature == "speech":
                        pairs = [(pair, feature_rede_id)
                                 for pair in ngrams_list]
                    N_GRAMS.extend(pairs)
            speeches = None
        # The uppercased first character of each ngram is put at the first
        # position of each line to sort and group by it; it is deleted
        # again before writing.
        print("Start counting ngrams.")
        N_GRAMS = Counter(N_GRAMS)
        print("Finished counting ngrams.")
        print("Start sorting ngrams.")
        N_GRAMS = [item[0][0][0].upper()
                   + "||"
                   + item[0][0]
                   + "||"
                   + str(item[0][1])
                   + "||"
                   + str(item[1])
                   for item in N_GRAMS.items()]
        N_GRAMS = sorted(N_GRAMS, key=locale.strxfrm)
        print("Finished sorting ngrams.")
        # Sorts all ngrams into groups: one group for each German
        # uppercase letter except ß and one group for each digit from
        # 0 to 9. Other non-ASCII, non-digit ngrams end up in groups of
        # their own; these are joined into one non-ASCII group later on.
        alphabetically = []
        tmp_list = []
        for letter, entries in tqdm(groupby(N_GRAMS, key=itemgetter(0)),
                                    desc="Grouping ngrams alphabetically"):
            if letter:
                print(letter)
            for entry in entries:
                tmp_list.append(entry)
            alphabetically.append(tmp_list)
            tmp_list = []
        N_GRAMS = None
        gc.collect()  # Frees RAM.
        key_list = ([i for i in range(10)]
                    + "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z"
                    .split()
                    + ["_Non_ASCII"])
        # Joins all non-ASCII ngram groups into one list to save them into
        # a single CSV file.
        if len(alphabetically) > 37:
            joined_tail = alphabetically[36:]
            joined_tail = chain.from_iterable(list(joined_tail))
            del alphabetically[36:]
            alphabetically.append(joined_tail)
        # Saves each group to an individual file.
        for group, key in tqdm(zip(alphabetically, key_list),
                               desc="Writing ngrams to files"):
            group_ngrams = [entry.split("||")[1:] for entry in group]
            file_name = (str(key)
                         + "_"
                         + file_name_prefix
                         + "_per_"
                         + group_by_feature
                         + "_"
                         + input_type_name
                         + ".csv")
            file_output_path = os.path.join(output_path, file_name)
            with open(file_output_path, "w", newline="",
                      encoding="utf8") as file:
                writer = csv.writer(file, delimiter="\t")
                writer.writerows(group_ngrams)
        alphabetically = None


if __name__ == '__main__':
    n_grams()
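
The "||" sort key and the C-locale grouping are the least obvious part of the script. A standalone sketch of just that step (the counts are made up for illustration, not real data) shows the mechanism and why non-ASCII ngrams end up behind "Z":

    import locale
    from collections import Counter
    from itertools import groupby
    from operator import itemgetter

    locale.setlocale(locale.LC_COLLATE, "C")
    # Made-up counts in the shape the script builds: {(ngram, key): n}.
    counts = Counter({("Innere Sicherheit", "1998"): 2,
                      ("Äußere Sicherheit", "1998"): 1})
    lines = [ngram[0].upper() + "||" + ngram + "||" + year + "||" + str(n)
             for (ngram, year), n in counts.items()]
    lines = sorted(lines, key=locale.strxfrm)
    for letter, entries in groupby(lines, key=itemgetter(0)):
        print(letter, [e.split("||")[1:] for e in entries])
    # I [['Innere Sicherheit', '1998', '2']]
    # Ä [['Äußere Sicherheit', '1998', '1']]  <- "Ä" sorts after "Z" in the
    # C locale, which is why such groups are merged into the non-ASCII file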

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import de_core_news_sm
import configparser
from utility.XMLProtocol import XMLProtocol
from lxml import etree
from tqdm import tqdm
import re


def tokenize(files, no_stop_words=False):
    """
    Tokenizes the speeches of the input XML protocols. Can include or
    exclude stop words. Tokenized speeches are written into a new element
    <rede_tokenisiert>. Always removes punctuation. Joins hyphenated
    strings before they are tokenized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Tokenization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if part.text is not None:
                    # Replaces "_" with " ". Needed because a string like
                    # "Treffsicherheit einer Schrotflinte;_Sie haben nämlich
                    # kaum den Punkt getroffen" is not tokenized correctly
                    # by spaCy: "Schrotflinte;_Sie" is recognized as one
                    # token, which also messes up the sorted ngram
                    # calculation. Appends "\n" to every line to help
                    # identify hyphenated words.
                    tmp_list.append(re.sub(r"_", " ", part.text + "\n"))
                part.getparent().remove(part)
new_text = "".join(tmp_list)
new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[a-ßzäüö])", "\g<wordend>\g<wordstart>", new_text)
"""
joins hyphenated words together:
'Länderfinanz- ausgleich' --> Länderfinanzausgleich.
Better to do it here because most of the comments and metadata has
already been marked.
Ignores strings like: 'Finanz-, Handels- und Sicherheitspolitik'.
Does not ignore them when they happen at a linebreak. This is a rare
occasion though.
"""
new_text = re.sub(r"(?P<wordend>[a-zßüöä])(?P<replace>\-\n)(?P<wordstart>[A-ZÄÜÖ])", "\g<wordend>-\g<wordstart>", new_text)
"""
Removes all line breaks again. This way compound names with a line
break inbetween like "Sütterlin-\nWaack" will be recognized as one
string by spacy. --> Sütterlin-Waack
"""
            tokenized_speech = etree.Element("rede_tokenisiert")
            doc = nlp(new_text)
            if no_stop_words is False:
                tokenized = " ".join([token.text for token in doc
                                      if token.pos_ != "PUNCT"])
                filename_suffix = "_tokenized_with_stopwords.xml"
            elif no_stop_words is True:
                tokenized = " ".join([token.text for token in doc
                                      if token.is_stop is False
                                      and token.pos_ != "PUNCT"])
                filename_suffix = "_tokenized_without_stopwords.xml"
            tokenized_speech.text = tokenized
            speech.append(tokenized_speech)
        xml.save_to_file(output_path, file_path, "tokenized", "File paths",
                         "nlp_lemmatized_tokenized", filename_suffix)


if __name__ == '__main__':
    tokenize()
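
The two hyphenation regexes above are shared verbatim with the lemmatization script. A short standalone check (sample strings invented for illustration) of what they join and what they leave alone:

    import re

    text = "Länderfinanz-\nausgleich und Sütterlin-\nWaack"
    # 1) Lowercase after the hyphen: drop hyphen and line break entirely.
    text = re.sub(r"(?P<wordend>[a-zßäüö])-\n(?P<wordstart>[a-zßäüö])",
                  r"\g<wordend>\g<wordstart>", text)
    # 2) Uppercase after the hyphen: keep the hyphen, drop the line break.
    text = re.sub(r"(?P<wordend>[a-zßäüö])-\n(?P<wordstart>[A-ZÄÜÖ])",
                  r"\g<wordend>-\g<wordstart>", text)
    print(text)  # Länderfinanzausgleich und Sütterlin-Waack
    # "Finanz-, Handels- und Sicherheitspolitik" stays untouched because
    # no line break follows those hyphens.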