Custom Stopword List Settings

This commit is contained in:
Inga Kirschnick 2023-07-11 09:33:11 +02:00
parent 413b6111df
commit deec9e8a76
3 changed files with 271 additions and 168 deletions

View File

@ -61,12 +61,6 @@ def build_corpus(corpus_id):
@bp.route('/stopwords') @bp.route('/stopwords')
@content_negotiation(produces='application/json') @content_negotiation(produces='application/json')
def get_stopwords(): def get_stopwords():
# data = request.json
# if not isinstance(data, dict):
# abort(400)
# language = data.get('language')
# if not isinstance(language, str):
# abort(400)
nltk.download('stopwords') nltk.download('stopwords')
languages = ["german", "english", "catalan", "greek", "spanish", "french", "italian", "russian", "chinese"] languages = ["german", "english", "catalan", "greek", "spanish", "french", "italian", "russian", "chinese"]
stopwords = {} stopwords = {}
@ -74,10 +68,7 @@ def get_stopwords():
stopwords[language] = nltk.corpus.stopwords.words(language) stopwords[language] = nltk.corpus.stopwords.words(language)
stopwords['punctuation'] = list(punctuation) + ['', '|'] stopwords['punctuation'] = list(punctuation) + ['', '|']
stopwords['user_stopwords'] = [] stopwords['user_stopwords'] = []
print(stopwords) response_data = stopwords
response_data = {
'stopwords': stopwords
}
return response_data, 202 return response_data, 202
# @bp.route('/<hashid:corpus_id>/generate-share-link', methods=['POST']) # @bp.route('/<hashid:corpus_id>/generate-share-link', methods=['POST'])

View File

@ -1,7 +1,10 @@
class CorpusAnalysisApp { class CorpusAnalysisApp {
constructor(corpusId) { constructor(corpusId) {
this.data = { this.data = {
promises: {getStopwords: []} stopwords: undefined,
originalStopwords: {},
stopwordCache: {},
promises: {getStopwords: undefined}
}; };
// HTML elements // HTML elements
@ -24,39 +27,17 @@ class CorpusAnalysisApp {
}; };
} }
// getStopwords(language) {
// if (language in this.data.promises.getStopwords) {
// console.log('Stopwords already loaded');
// return this.data.promises.getStopwords[language];
// }
// this.data.promises.getStopwords[language] = new Promise((resolve, reject) => {
// Requests.corpora.entity.getStopwords(language)
// .then((response) => {
// response.json()
// .then((json) => {
// let stopwords = json.stopwords;
// resolve(stopwords);
// })
// .catch((error) => {
// reject(error);
// });
// });
// });
// return this.data.promises.getStopwords[language];
// }
getStopwords() { getStopwords() {
if (this.data.promises.getStopwords.length !== 0) {
console.log('Stopwords already loaded');
return this.data.promises.getStopwords;
}
this.data.promises.getStopwords = new Promise((resolve, reject) => { this.data.promises.getStopwords = new Promise((resolve, reject) => {
Requests.corpora.entity.getStopwords() Requests.corpora.entity.getStopwords()
.then((response) => { .then((response) => {
response.json() response.json()
.then((json) => { .then((json) => {
let stopwords = json.stopwords; for (let [key, value] of Object.entries(json)) {
resolve(stopwords); this.data.originalStopwords[key] = value;
}
this.data.stopwords = json;
resolve(this.data.stopwords);
}) })
.catch((error) => { .catch((error) => {
reject(error); reject(error);
@ -66,7 +47,6 @@ class CorpusAnalysisApp {
return this.data.promises.getStopwords; return this.data.promises.getStopwords;
} }
init() { init() {
this.disableActionElements(); this.disableActionElements();
this.elements.m.initModal.open(); this.elements.m.initModal.open();
@ -79,20 +59,11 @@ class CorpusAnalysisApp {
.then((cqiCorpora) => { .then((cqiCorpora) => {
this.data.corpus = {o: cqiCorpora[0]}; this.data.corpus = {o: cqiCorpora[0]};
console.log(this.data.corpus.o.staticData); console.log(this.data.corpus.o.staticData);
this.renderGeneralCorpusInfo(this.data.corpus.o.staticData); this.renderGeneralCorpusInfo();
this.renderTextInfoList(this.data.corpus.o.staticData); this.renderTextInfoList();
this.renderTextProportionsGraphic(this.data.corpus.o.staticData); this.renderTextProportionsGraphic()
this.renderFrequenciesGraphic(this.data.corpus.o.staticData); this.renderFrequenciesGraphic();
this.renderBoundsGraphic(this.data.corpus.o.staticData); this.renderBoundsGraphic();
// this.data.corpus.o.getCorpusData()
// .then(corpusData => {
// console.log(corpusData);
// this.renderGeneralCorpusInfo(corpusData);
// this.renderTextInfoList(corpusData);
// this.renderTextProportionsGraphic(corpusData);
// this.renderFrequenciesGraphic(corpusData);
// this.renderBoundsGraphic(corpusData);
// });
// TODO: Don't do this here // TODO: Don't do this here
this.data.corpus.o.updateDb(); this.data.corpus.o.updateDb();
this.enableActionElements(); this.enableActionElements();
@ -117,6 +88,30 @@ class CorpusAnalysisApp {
this.elements.m.extensionTabs.select(extensionSelectorElement.dataset.target); this.elements.m.extensionTabs.select(extensionSelectorElement.dataset.target);
}); });
} }
let frequenciesStopwordSettingModal = document.querySelector('#frequencies-stopwords-setting-modal');
let frequenciesStopwordSettingModalButton = document.querySelector('#frequencies-stopwords-setting-modal-button');
frequenciesStopwordSettingModalButton.addEventListener('click', () => {
this.data.stopwordCache = {};
const stopwordsCopy = Object.assign({}, this.data.stopwords);
for (let [key, value] of Object.entries(stopwordsCopy)) {
this.data.stopwordCache[key] = value;
}
this.renderStopwordSettingsModal(this.data.stopwords);
M.Modal.init(frequenciesStopwordSettingModal, {dismissible: false});
});
for (let actionButton of document.querySelectorAll('.frequencies-stopword-setting-modal-action-buttons')) {
actionButton.addEventListener('click', (event) => {
let action = event.target.closest('.frequencies-stopword-setting-modal-action-buttons').dataset.action;
if (action === 'submit') {
this.renderFrequenciesGraphic();
} else if (action === 'cancel') {
this.data.stopwords = this.data.stopwordCache;
this.renderFrequenciesGraphic();
}
});
}
} }
registerExtension(extension) { registerExtension(extension) {
@ -154,7 +149,8 @@ class CorpusAnalysisApp {
} }
} }
renderGeneralCorpusInfo(corpusData) { renderGeneralCorpusInfo() {
let corpusData = this.data.corpus.o.staticData;
document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token; document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token;
document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s; document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s;
document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length; document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length;
@ -163,7 +159,8 @@ class CorpusAnalysisApp {
document.querySelector('.corpus-num-unique-simple-pos').innerHTML = Object.entries(corpusData.corpus.freqs.simple_pos).length; document.querySelector('.corpus-num-unique-simple-pos').innerHTML = Object.entries(corpusData.corpus.freqs.simple_pos).length;
} }
renderTextInfoList(corpusData) { renderTextInfoList() {
let corpusData = this.data.corpus.o.staticData;
let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list'); let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list');
let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement); let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement);
let texts = corpusData.s_attrs.text.lexicon; let texts = corpusData.s_attrs.text.lexicon;
@ -189,7 +186,8 @@ class CorpusAnalysisApp {
textCountChipElement.innerHTML = `Text count: ${corpusData.corpus.counts.text}`; textCountChipElement.innerHTML = `Text count: ${corpusData.corpus.counts.text}`;
} }
renderTextProportionsGraphic(corpusData) { renderTextProportionsGraphic() {
let corpusData = this.data.corpus.o.staticData;
let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic'); let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic');
let texts = Object.entries(corpusData.s_attrs.text.lexicon); let texts = Object.entries(corpusData.s_attrs.text.lexicon);
let graphData = [ let graphData = [
@ -223,7 +221,8 @@ class CorpusAnalysisApp {
Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout, config); Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout, config);
} }
renderFrequenciesGraphic(corpusData) { async renderFrequenciesGraphic() {
let corpusData = this.data.corpus.o.staticData;
let frequenciesTokenCategoryDropdownElement = document.querySelector('[data-target="frequencies-token-category-dropdown"]'); let frequenciesTokenCategoryDropdownElement = document.querySelector('[data-target="frequencies-token-category-dropdown"]');
let frequenciesTokenCategoryDropdownListElement = document.querySelector("#frequencies-token-category-dropdown"); let frequenciesTokenCategoryDropdownListElement = document.querySelector("#frequencies-token-category-dropdown");
let frequenciesGraphicElement = document.querySelector('#frequencies-graphic'); let frequenciesGraphicElement = document.querySelector('#frequencies-graphic');
@ -248,8 +247,7 @@ class CorpusAnalysisApp {
let tokenCategory = frequenciesTokenCategoryDropdownElement.firstChild.textContent.toLowerCase(); let tokenCategory = frequenciesTokenCategoryDropdownElement.firstChild.textContent.toLowerCase();
this.createFrequenciesGraphData(tokenCategory, texts, corpusData, graphtype) let graphData = await this.createFrequenciesGraphData(tokenCategory, texts, corpusData, graphtype);
.then(graphData => {
let graphLayout = { let graphLayout = {
barmode: graphtype === 'bar' ? 'stack' : '', barmode: graphtype === 'bar' ? 'stack' : '',
margin: { margin: {
@ -266,25 +264,26 @@ class CorpusAnalysisApp {
displaylogo: false displaylogo: false
}; };
Plotly.newPlot(frequenciesGraphicElement, graphData, graphLayout, config); Plotly.newPlot(frequenciesGraphicElement, graphData, graphLayout, config);
});
} }
createFrequenciesGraphData(category, texts, corpusData, graphtype) { async createFrequenciesGraphData(category, texts, corpusData, graphtype) {
return new Promise((resolve, reject) => { let stopwords = this.data.stopwords;
this.getStopwords() if (this.data.stopwords === undefined) {
.then(stopwords => { stopwords = await this.getStopwords();
this.renderStopwordSettingsModal(stopwords); }
let stopwordList = []; let stopwordList = [];
Object.values(stopwords).forEach(stopwordItems => { Object.values(stopwords).forEach(stopwordItems => {
stopwordItems.forEach(stopword => { stopwordItems.forEach(stopword => {
stopwordList.push(stopword); stopwordList.push(stopword);
}); });
}); });
let graphData = []; let graphData = [];
let filteredData = Object.entries(corpusData.corpus.freqs[category]) let filteredData = Object.entries(corpusData.corpus.freqs[category])
.sort((a, b) => b[1] - a[1]) .sort((a, b) => b[1] - a[1])
.filter(item => !stopwordList.includes(corpusData.values.p_attrs[category][item[0]].toLowerCase())) .filter(item => !stopwordList.includes(corpusData.values.p_attrs[category][item[0]].toLowerCase()))
.slice(0, 5); .slice(0, 5);
if (graphtype !== 'markers') { if (graphtype !== 'markers') {
for (let item of filteredData) { for (let item of filteredData) {
let data = { let data = {
@ -306,29 +305,126 @@ class CorpusAnalysisApp {
mode: 'markers', mode: 'markers',
marker: { marker: {
size: size, size: size,
// sizeref: 2.0 * Math.max(...size) / (80**2), sizeref: 0.4
// sizemode: 'area',
sizeref: 0.2
} }
}; };
graphData.push(data); graphData.push(data);
} }
} }
resolve(graphData); return graphData;
})
.catch(error => {
reject(error);
});
});
} }
renderStopwordSettingsModal(stopwords) { renderStopwordSettingsModal(stopwords) {
let stopwordInputField = document.querySelector('.stopword-input-field'); let stopwordInputField = document.querySelector('#stopword-input-field');
let userStopwordListContainer = document.querySelector('#user-stopword-list-container');
let stopwordLanguageSelection = document.querySelector('#stopword-language-selection');
let stopwordLanguageChipList = document.querySelector('#stopword-language-chip-list');
let deleteLanguageStopwordListEntriesButton = document.querySelector('#delete-language-stopword-list-entries-button');
let resetLanguageStopwordListEntriesButton = document.querySelector('#reset-language-stopword-list-entries-button');
stopwordLanguageChipList.innerHTML = '';
userStopwordListContainer.innerHTML = '';
// Render stopword language selection. Set english as default language. Filter out user_stopwords.
for (let language of Object.keys(stopwords)) {
if (language !== 'user_stopwords') {
if (language === 'english') {
let optionElement = Utils.HTMLToElement(`<option value="${language}" selected>${language}</option>`);
stopwordLanguageSelection.appendChild(optionElement);
} else {
let optionElement = Utils.HTMLToElement(`<option value="${language}">${language}</option>`);
stopwordLanguageSelection.appendChild(optionElement);
}
}
} }
// Render user stopwords over input field.
if (this.data.stopwords['user_stopwords'].length > 0) {
for (let word of this.data.stopwords['user_stopwords']) {
let chipElement = Utils.HTMLToElement(`<div class="chip">${word}<i class="close material-icons">close</i></div>`);
chipElement.addEventListener('click', (event) => {
let removedListItem = event.target.closest('.chip').firstChild.textContent;
this.data.stopwords['user_stopwords'] = this.data.stopwords['user_stopwords'].filter(item => item !== removedListItem);
});
userStopwordListContainer.appendChild(chipElement);
}
}
// Render english stopwords as default ...
this.renderStopwordLanguageChipList('english', stopwords['english']);
renderBoundsGraphic(corpusData) { // ... or render selected language stopwords.
stopwordLanguageSelection.addEventListener('change', (event) => {
this.renderStopwordLanguageChipList(event.target.value, stopwords[event.target.value]);
});
// Event listener for deleting all stopwords of a language.
deleteLanguageStopwordListEntriesButton.addEventListener('click', (event) => {
let selectedLanguage = stopwordLanguageSelection.value;
this.data.stopwords[selectedLanguage] = [];
stopwordLanguageChipList.innerHTML = '';
this.buttonRendering();
});
// Event listener for resetting all stopwords of a language to the original stopwords.
resetLanguageStopwordListEntriesButton.addEventListener('click', () => {
let selectedLanguage = stopwordLanguageSelection.value;
this.data.stopwords[selectedLanguage] = this.data.originalStopwords[selectedLanguage];
this.renderStopwordLanguageChipList(selectedLanguage, this.data.stopwords[selectedLanguage]);
});
// Initialize Materialize components.
M.Chips.init(
stopwordInputField,
{
placeholder: 'Add stopwords',
onChipAdd: (event) => {
let userStopwords = [];
for (let word of event[0].M_Chips.chipsData) {
if (!this.data.stopwords['user_stopwords'].includes(word.tag.toLowerCase())) {
userStopwords.push(word.tag.toLowerCase());
}
}
this.data.stopwords['user_stopwords'] = this.data.stopwords['user_stopwords'].concat(userStopwords);
}
}
);
M.FormSelect.init(stopwordLanguageSelection);
}
buttonRendering() {
let stopwordLanguageSelection = document.querySelector('#stopword-language-selection');
let deleteLanguageStopwordListEntriesButton = document.querySelector('#delete-language-stopword-list-entries-button');
let resetLanguageStopwordListEntriesButton = document.querySelector('#reset-language-stopword-list-entries-button');
let selectedLanguage = stopwordLanguageSelection.value;
let stopwordLength = this.data.stopwords[selectedLanguage].length;
let originalStopwordListLength = this.data.originalStopwords[selectedLanguage].length;
resetLanguageStopwordListEntriesButton.classList.toggle('blue', stopwordLength !== originalStopwordListLength);
deleteLanguageStopwordListEntriesButton.classList.toggle('red', stopwordLength > 0);
resetLanguageStopwordListEntriesButton.style.cursor = stopwordLength !== originalStopwordListLength ? 'pointer' : 'default';
deleteLanguageStopwordListEntriesButton.style.cursor = stopwordLength > 0 ? 'pointer' : 'default';
}
renderStopwordLanguageChipList(language, stopwords) {
let stopwordLanguageChipList = document.querySelector('#stopword-language-chip-list');
stopwordLanguageChipList.innerHTML = '';
for (let word of stopwords) {
let chipElement = Utils.HTMLToElement(`<div class="chip">${word}<i class="close material-icons">close</i></div>`);
chipElement.addEventListener('click', (event) => {
let removedListItem = event.target.closest('.chip').firstChild.textContent;
this.data.stopwords[language] = this.data.stopwords[language].filter(item => item !== removedListItem);
this.buttonRendering();
});
stopwordLanguageChipList.appendChild(chipElement);
}
this.buttonRendering();
}
renderBoundsGraphic() {
let corpusData = this.data.corpus.o.staticData;
let boundsGraphicElement = document.querySelector('#bounds-graphic'); let boundsGraphicElement = document.querySelector('#bounds-graphic');
let graphData = []; let graphData = [];

View File

@ -123,7 +123,7 @@
<a class="btn disabled frequencies-graph-mode-button" data-graph-type="bar"><i class="material-icons">equalizer</i></a> <a class="btn disabled frequencies-graph-mode-button" data-graph-type="bar"><i class="material-icons">equalizer</i></a>
<a class="btn frequencies-graph-mode-button" data-graph-type="scatter"><i class="material-icons">show_chart</i></a> <a class="btn frequencies-graph-mode-button" data-graph-type="scatter"><i class="material-icons">show_chart</i></a>
<a class="btn frequencies-graph-mode-button" data-graph-type="markers"><i class="material-icons">bubble_chart</i></a> <a class="btn frequencies-graph-mode-button" data-graph-type="markers"><i class="material-icons">bubble_chart</i></a>
<a class="btn-flat modal-trigger" href="#frequencies-stopwords-setting-modal"><i class="material-icons grey-text text-darken-2">settings</i></a> <a class="btn-flat modal-trigger no-autoinit" id="frequencies-stopwords-setting-modal-button" href="#frequencies-stopwords-setting-modal"><i class="material-icons grey-text text-darken-2">settings</i></a>
</div> </div>
</div> </div>
</div> </div>
@ -140,7 +140,6 @@
</div> </div>
</div> </div>
</div> </div>
</div> </div>
@ -166,18 +165,35 @@
</div> </div>
</div> </div>
<div class="modal" id="frequencies-stopwords-setting-modal"> <div class="modal modal-fixed-footer" id="frequencies-stopwords-setting-modal">
<div class="modal-content"> <div class="modal-content">
<h4>Settings</h4> <h4>Settings</h4>
<p>Here you can change the stopword-lists. Add your own stopwords or change the already existing below.</p>
<div class="chips chips-placeholder stopword-input-field"></div>
<div class="row"> <div class="row">
<p>Here you can change the stopword-lists. Stopwords are common words in a language,
like "the" or "and," that carry little meaning and are often removed in text analysis
to improve efficiency and accuracy.</p>
<div id="user-stopword-list-container"></div>
<div class="chips col s8 no-autoinit input-field" id="stopword-input-field">
</div>
</div>
<div class="row">
<p>Below you can find a list of all stopwords that are always filtered out.
The lists are sorted by language, you can remove single words or remove
whole languages via the settings on the right.</p>
<div class="input-field col s3"> <div class="input-field col s3">
<select class="stopword-language-selection"></select> <select id="stopword-language-selection"></select>
<label>Stopword language select</label> <label>Stopword language select</label>
</div> </div>
</div> </div>
<div class="row">
<div class="chip white-text" id="delete-language-stopword-list-entries-button" style="cursor:pointer">Delete all below<i class="material-icons right" style="margin-top: 4px; margin-left: -1px;">delete</i></div>
<div class="chip white-text" id="reset-language-stopword-list-entries-button" style="cursor:pointer">Reset stopword list<i class="material-icons right disable-on-click" style="margin-top: 4px; margin-left: -1px;">refresh</i></div>
</div> </div>
<div id="stopword-language-chip-list"></div>
</div>
<div class="modal-footer">
<a class="modal-close waves-effect waves-green btn frequencies-stopword-setting-modal-action-buttons" data-action="cancel">Cancel</a>
<a class="modal-close waves-effect waves-green btn frequencies-stopword-setting-modal-action-buttons" data-action="submit">Submit</a>
</div> </div>