#!/usr/bin/env python
# -*- coding: utf-8 -*-

import configparser
import re
import sys

import de_core_news_sm
from lxml import etree
from tqdm import tqdm

from utility.XMLProtocol import XMLProtocol


def tokenize(files, no_stop_words=False):
    """
    Tokenizes the speeches of the input XML protocols. Can include or
    exclude stop words. Tokenized speeches will be written into a new
    element <rede_tokenisiert>. Always removes punctuation. Joins
    hyphenated strings before they are tokenized.
    """
    nlp = de_core_news_sm.load()
    config = configparser.ConfigParser()
    config.read("config.ini")
    output_path = config["File paths"]["nlp_output"]
    for file_path in tqdm(sorted(files), desc="Tokenization file status"):
        xml = XMLProtocol()
        xml.read_xml(file_path)
        speeches = xml.xml_tree.xpath(".//rede | .//sitzungsbeginn")
        for speech in speeches:
            parts = speech.xpath(".//p")
            tmp_list = []
            for part in parts:
                if part.text is not None:
                    # Replace "_" with " ". Needed because a string like
                    # "Treffsicherheit einer Schrotflinte;_Sie haben nämlich
                    # kaum den Punkt getroffen" is not lemmatized correctly
                    # by spaCy: "Schrotflinte;_Sie" is recognized as one
                    # token, which also messes up the sorted n-gram
                    # calculation. A "\n" is appended to every line to help
                    # identify hyphenated words later on.
                    tmp_list.append(re.sub(r"_", " ", part.text + "\n"))
                part.getparent().remove(part)
            new_text = "".join(tmp_list)
            # Join hyphenated words: "Länderfinanz-\nausgleich" becomes
            # "Länderfinanzausgleich". Better to do this here because most
            # of the comments and metadata have already been marked. Ignores
            # strings like "Finanz-, Handels- und Sicherheitspolitik",
            # except when they break across a line, which is rare.
            new_text = re.sub(
                r"([a-zßüöä])-\n([a-zßäüö])", r"\1\2", new_text)
            # Remove the line break inside hyphenated compound names, so
            # that a name like "Sütterlin-\nWaack" is recognized by spaCy
            # as one string: "Sütterlin-Waack".
            new_text = re.sub(
                r"([a-zßüöä])-\n([A-ZÄÜÖ])", r"\1-\2", new_text)
            tokenized_speech = etree.Element("rede_tokenisiert")
            doc = nlp(new_text)
            if not no_stop_words:
                tokenized = " ".join(
                    token.text for token in doc if token.pos_ != "PUNCT")
                filename_suffix = "_tokenized_with_stopwords.xml"
            else:
                tokenized = " ".join(
                    token.text for token in doc
                    if token.is_stop is False and token.pos_ != "PUNCT")
                filename_suffix = "_tokenized_without_stopwords.xml"
            tokenized_speech.text = tokenized
            speech.append(tokenized_speech)
        xml.save_to_file(output_path, file_path, "tokenized", "File paths",
                         "nlp_lemmatized_tokenized", filename_suffix)


if __name__ == '__main__':
    # Assumption: the XML protocol files to process are passed as
    # command-line arguments (the original call passed no arguments,
    # which would raise a TypeError).
    tokenize(sys.argv[1:])
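

# Illustrative sketch (not part of the original script): demonstrates how the
# two hyphen-joining substitutions in tokenize() behave on a hypothetical
# sample string. It is never called by the pipeline and is safe to delete.
def _demo_hyphen_joining():
    sample = "Länderfinanz-\nausgleich und Sütterlin-\nWaack"
    # Lowercase letter after the hyphenated line break: an ordinary
    # hyphenated word, so both hyphen and line break are dropped.
    sample = re.sub(r"([a-zßüöä])-\n([a-zßäüö])", r"\1\2", sample)
    # Uppercase letter after the hyphenated line break: a compound name,
    # so the hyphen is kept and only the line break is dropped.
    sample = re.sub(r"([a-zßüöä])-\n([A-ZÄÜÖ])", r"\1-\2", sample)
    assert sample == "Länderfinanzausgleich und Sütterlin-Waack"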