From deec9e8a761c76702fcd683d505eafd0a166ae8a Mon Sep 17 00:00:00 2001 From: Inga Kirschnick Date: Tue, 11 Jul 2023 09:33:11 +0200 Subject: [PATCH] Custom Stopword List Settings --- app/corpora/json_routes.py | 11 +- .../js/CorpusAnalysis/CorpusAnalysisApp.js | 400 +++++++++++------- app/templates/corpora/analysis.html.j2 | 28 +- 3 files changed, 271 insertions(+), 168 deletions(-) diff --git a/app/corpora/json_routes.py b/app/corpora/json_routes.py index b6ef0110..6784bb9e 100644 --- a/app/corpora/json_routes.py +++ b/app/corpora/json_routes.py @@ -61,12 +61,6 @@ def build_corpus(corpus_id): @bp.route('/stopwords') @content_negotiation(produces='application/json') def get_stopwords(): - # data = request.json - # if not isinstance(data, dict): - # abort(400) - # language = data.get('language') - # if not isinstance(language, str): - # abort(400) nltk.download('stopwords') languages = ["german", "english", "catalan", "greek", "spanish", "french", "italian", "russian", "chinese"] stopwords = {} @@ -74,10 +68,7 @@ def get_stopwords(): stopwords[language] = nltk.corpus.stopwords.words(language) stopwords['punctuation'] = list(punctuation) + ['—', '|'] stopwords['user_stopwords'] = [] - print(stopwords) - response_data = { - 'stopwords': stopwords - } + response_data = stopwords return response_data, 202 # @bp.route('//generate-share-link', methods=['POST']) diff --git a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js index 8af82e8d..a30af989 100644 --- a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js +++ b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js @@ -1,7 +1,10 @@ class CorpusAnalysisApp { constructor(corpusId) { this.data = { - promises: {getStopwords: []} + stopwords: undefined, + originalStopwords: {}, + stopwordCache: {}, + promises: {getStopwords: undefined} }; // HTML elements @@ -24,39 +27,17 @@ class CorpusAnalysisApp { }; } - // getStopwords(language) { - // if (language in this.data.promises.getStopwords) { - // console.log('Stopwords already loaded'); - // return this.data.promises.getStopwords[language]; - // } - // this.data.promises.getStopwords[language] = new Promise((resolve, reject) => { - // Requests.corpora.entity.getStopwords(language) - // .then((response) => { - // response.json() - // .then((json) => { - // let stopwords = json.stopwords; - // resolve(stopwords); - // }) - // .catch((error) => { - // reject(error); - // }); - // }); - // }); - // return this.data.promises.getStopwords[language]; - // } - getStopwords() { - if (this.data.promises.getStopwords.length !== 0) { - console.log('Stopwords already loaded'); - return this.data.promises.getStopwords; - } this.data.promises.getStopwords = new Promise((resolve, reject) => { Requests.corpora.entity.getStopwords() .then((response) => { response.json() .then((json) => { - let stopwords = json.stopwords; - resolve(stopwords); + for (let [key, value] of Object.entries(json)) { + this.data.originalStopwords[key] = value; + } + this.data.stopwords = json; + resolve(this.data.stopwords); }) .catch((error) => { reject(error); @@ -66,7 +47,6 @@ class CorpusAnalysisApp { return this.data.promises.getStopwords; } - init() { this.disableActionElements(); this.elements.m.initModal.open(); @@ -79,20 +59,11 @@ class CorpusAnalysisApp { .then((cqiCorpora) => { this.data.corpus = {o: cqiCorpora[0]}; console.log(this.data.corpus.o.staticData); - this.renderGeneralCorpusInfo(this.data.corpus.o.staticData); - this.renderTextInfoList(this.data.corpus.o.staticData); - this.renderTextProportionsGraphic(this.data.corpus.o.staticData); - this.renderFrequenciesGraphic(this.data.corpus.o.staticData); - this.renderBoundsGraphic(this.data.corpus.o.staticData); - // this.data.corpus.o.getCorpusData() - // .then(corpusData => { - // console.log(corpusData); - // this.renderGeneralCorpusInfo(corpusData); - // this.renderTextInfoList(corpusData); - // this.renderTextProportionsGraphic(corpusData); - // this.renderFrequenciesGraphic(corpusData); - // this.renderBoundsGraphic(corpusData); - // }); + this.renderGeneralCorpusInfo(); + this.renderTextInfoList(); + this.renderTextProportionsGraphic() + this.renderFrequenciesGraphic(); + this.renderBoundsGraphic(); // TODO: Don't do this hgere this.data.corpus.o.updateDb(); this.enableActionElements(); @@ -117,6 +88,30 @@ class CorpusAnalysisApp { this.elements.m.extensionTabs.select(extensionSelectorElement.dataset.target); }); } + + let frequenciesStopwordSettingModal = document.querySelector('#frequencies-stopwords-setting-modal'); + let frequenciesStopwordSettingModalButton = document.querySelector('#frequencies-stopwords-setting-modal-button'); + frequenciesStopwordSettingModalButton.addEventListener('click', () => { + this.data.stopwordCache = {}; + const stopwordsCopy = Object.assign({}, this.data.stopwords); + for (let [key, value] of Object.entries(stopwordsCopy)) { + this.data.stopwordCache[key] = value; + } + this.renderStopwordSettingsModal(this.data.stopwords); + M.Modal.init(frequenciesStopwordSettingModal, {dismissible: false}); + }); + + for (let actionButton of document.querySelectorAll('.frequencies-stopword-setting-modal-action-buttons')) { + actionButton.addEventListener('click', (event) => { + let action = event.target.closest('.frequencies-stopword-setting-modal-action-buttons').dataset.action; + if (action === 'submit') { + this.renderFrequenciesGraphic(); + } else if (action === 'cancel') { + this.data.stopwords = this.data.stopwordCache; + this.renderFrequenciesGraphic(); + } + }); + } } registerExtension(extension) { @@ -154,7 +149,8 @@ class CorpusAnalysisApp { } } - renderGeneralCorpusInfo(corpusData) { + renderGeneralCorpusInfo() { + let corpusData = this.data.corpus.o.staticData; document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token; document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s; document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length; @@ -163,7 +159,8 @@ class CorpusAnalysisApp { document.querySelector('.corpus-num-unique-simple-pos').innerHTML = Object.entries(corpusData.corpus.freqs.simple_pos).length; } - renderTextInfoList(corpusData) { + renderTextInfoList() { + let corpusData = this.data.corpus.o.staticData; let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list'); let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement); let texts = corpusData.s_attrs.text.lexicon; @@ -189,7 +186,8 @@ class CorpusAnalysisApp { textCountChipElement.innerHTML = `Text count: ${corpusData.corpus.counts.text}`; } - renderTextProportionsGraphic(corpusData) { + renderTextProportionsGraphic() { + let corpusData = this.data.corpus.o.staticData; let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic'); let texts = Object.entries(corpusData.s_attrs.text.lexicon); let graphData = [ @@ -223,7 +221,8 @@ class CorpusAnalysisApp { Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout, config); } - renderFrequenciesGraphic(corpusData) { + async renderFrequenciesGraphic() { + let corpusData = this.data.corpus.o.staticData; let frequenciesTokenCategoryDropdownElement = document.querySelector('[data-target="frequencies-token-category-dropdown"]'); let frequenciesTokenCategoryDropdownListElement = document.querySelector("#frequencies-token-category-dropdown"); let frequenciesGraphicElement = document.querySelector('#frequencies-graphic'); @@ -248,123 +247,220 @@ class CorpusAnalysisApp { let tokenCategory = frequenciesTokenCategoryDropdownElement.firstChild.textContent.toLowerCase(); - this.createFrequenciesGraphData(tokenCategory, texts, corpusData, graphtype) - .then(graphData => { - let graphLayout = { - barmode: graphtype === 'bar' ? 'stack' : '', - margin: { - t: 20, - l: 50 - }, - yaxis: { - showticklabels: graphtype === 'markers' ? false : true - }, - }; - let config = { - responsive: true, - modeBarButtonsToRemove: ['zoom2d', 'select2d', 'lasso2d', 'zoomIn2d', 'zoomOut2d', 'autoScale2d', 'resetScale2d'], - displaylogo: false - }; - Plotly.newPlot(frequenciesGraphicElement, graphData, graphLayout, config); - }); + let graphData = await this.createFrequenciesGraphData(tokenCategory, texts, corpusData, graphtype); + let graphLayout = { + barmode: graphtype === 'bar' ? 'stack' : '', + margin: { + t: 20, + l: 50 + }, + yaxis: { + showticklabels: graphtype === 'markers' ? false : true + }, + }; + let config = { + responsive: true, + modeBarButtonsToRemove: ['zoom2d', 'select2d', 'lasso2d', 'zoomIn2d', 'zoomOut2d', 'autoScale2d', 'resetScale2d'], + displaylogo: false + }; + Plotly.newPlot(frequenciesGraphicElement, graphData, graphLayout, config); } - createFrequenciesGraphData(category, texts, corpusData, graphtype) { - return new Promise((resolve, reject) => { - this.getStopwords() - .then(stopwords => { - this.renderStopwordSettingsModal(stopwords); - let stopwordList = []; - Object.values(stopwords).forEach(stopwordItems => { - stopwordItems.forEach(stopword => { - stopwordList.push(stopword); - }); - }); - let graphData = []; - let filteredData = Object.entries(corpusData.corpus.freqs[category]) - .sort((a, b) => b[1] - a[1]) - .filter(item => !stopwordList.includes(corpusData.values.p_attrs[category][item[0]].toLowerCase())) - .slice(0, 5); - if (graphtype !== 'markers') { - for (let item of filteredData) { - let data = { - x: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), - y: texts.map(text => text[1].freqs[category][item[0]] || 0), - name: corpusData.values.p_attrs[category][item[0]], - type: graphtype - }; - graphData.push(data); - } - } else { - for (let item of filteredData) { - let size = texts.map(text => text[1].freqs[category][item[0]] || 0); - let data = { - x: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), - y: texts.map(text => corpusData.values.p_attrs[category][item[0]]), - name: corpusData.values.p_attrs[category][item[0]], - text: texts.map(text => `${corpusData.values.p_attrs[category][item[0]]}
${text[1].freqs[category][item[0]] || 0}`), - mode: 'markers', - marker: { - size: size, - // sizeref: 2.0 * Math.max(...size) / (80**2), - // sizemode: 'area', - sizeref: 0.2 - } - }; - graphData.push(data); - } - } - resolve(graphData); - }) - .catch(error => { - reject(error); - }); + async createFrequenciesGraphData(category, texts, corpusData, graphtype) { + let stopwords = this.data.stopwords; + if (this.data.stopwords === undefined) { + stopwords = await this.getStopwords(); + } + let stopwordList = []; + Object.values(stopwords).forEach(stopwordItems => { + stopwordItems.forEach(stopword => { + stopwordList.push(stopword); + }); }); + + let graphData = []; + let filteredData = Object.entries(corpusData.corpus.freqs[category]) + .sort((a, b) => b[1] - a[1]) + .filter(item => !stopwordList.includes(corpusData.values.p_attrs[category][item[0]].toLowerCase())) + .slice(0, 5); + + if (graphtype !== 'markers') { + for (let item of filteredData) { + let data = { + x: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), + y: texts.map(text => text[1].freqs[category][item[0]] || 0), + name: corpusData.values.p_attrs[category][item[0]], + type: graphtype + }; + graphData.push(data); + } + } else { + for (let item of filteredData) { + let size = texts.map(text => text[1].freqs[category][item[0]] || 0); + let data = { + x: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), + y: texts.map(text => corpusData.values.p_attrs[category][item[0]]), + name: corpusData.values.p_attrs[category][item[0]], + text: texts.map(text => `${corpusData.values.p_attrs[category][item[0]]}
${text[1].freqs[category][item[0]] || 0}`), + mode: 'markers', + marker: { + size: size, + sizeref: 0.4 + } + }; + graphData.push(data); + } + } + return graphData; } renderStopwordSettingsModal(stopwords) { - let stopwordInputField = document.querySelector('.stopword-input-field'); + let stopwordInputField = document.querySelector('#stopword-input-field'); + let userStopwordListContainer = document.querySelector('#user-stopword-list-container'); + let stopwordLanguageSelection = document.querySelector('#stopword-language-selection'); + let stopwordLanguageChipList = document.querySelector('#stopword-language-chip-list'); + let deleteLanguageStopwordListEntriesButton = document.querySelector('#delete-language-stopword-list-entries-button'); + let resetLanguageStopwordListEntriesButton = document.querySelector('#reset-language-stopword-list-entries-button'); + + stopwordLanguageChipList.innerHTML = ''; + userStopwordListContainer.innerHTML = ''; + + // Render stopword language selection. Set english as default language. Filter out user_stopwords. + for (let language of Object.keys(stopwords)) { + if (language !== 'user_stopwords') { + if (language === 'english') { + let optionElement = Utils.HTMLToElement(``); + stopwordLanguageSelection.appendChild(optionElement); + } else { + let optionElement = Utils.HTMLToElement(``); + stopwordLanguageSelection.appendChild(optionElement); + } + } + } + + // Render user stopwords over input field. + if (this.data.stopwords['user_stopwords'].length > 0) { + for (let word of this.data.stopwords['user_stopwords']) { + let chipElement = Utils.HTMLToElement(`
${word}close
`); + chipElement.addEventListener('click', (event) => { + let removedListItem = event.target.closest('.chip').firstChild.textContent; + this.data.stopwords['user_stopwords'] = this.data.stopwords['user_stopwords'].filter(item => item !== removedListItem); + }); + userStopwordListContainer.appendChild(chipElement); + } + } + + // Render english stopwords as default ... + this.renderStopwordLanguageChipList('english', stopwords['english']); + + // ... or render selected language stopwords. + stopwordLanguageSelection.addEventListener('change', (event) => { + this.renderStopwordLanguageChipList(event.target.value, stopwords[event.target.value]); + }); + + // Eventlistener for deleting all stopwords of a language. + deleteLanguageStopwordListEntriesButton.addEventListener('click', (event) => { + let selectedLanguage = stopwordLanguageSelection.value; + this.data.stopwords[selectedLanguage] = []; + stopwordLanguageChipList.innerHTML = ''; + this.buttonRendering(); + }); + + // Eventlistener for resetting all stopwords of a language to the original stopwords. + resetLanguageStopwordListEntriesButton.addEventListener('click', () => { + let selectedLanguage = stopwordLanguageSelection.value; + this.data.stopwords[selectedLanguage] = this.data.originalStopwords[selectedLanguage]; + this.renderStopwordLanguageChipList(selectedLanguage, this.data.stopwords[selectedLanguage]); + }); + + // Initialize Materialize components. + M.Chips.init( + stopwordInputField, + { + placeholder: 'Add stopwords', + onChipAdd: (event) => { + let userStopwords = []; + for (let word of event[0].M_Chips.chipsData) { + if (!this.data.stopwords['user_stopwords'].includes(word.tag.toLowerCase())) { + userStopwords.push(word.tag.toLowerCase()); + } + } + this.data.stopwords['user_stopwords'] = this.data.stopwords['user_stopwords'].concat(userStopwords); + } + } + ); + M.FormSelect.init(stopwordLanguageSelection); + + } + + buttonRendering() { + let stopwordLanguageSelection = document.querySelector('#stopword-language-selection'); + let deleteLanguageStopwordListEntriesButton = document.querySelector('#delete-language-stopword-list-entries-button'); + let resetLanguageStopwordListEntriesButton = document.querySelector('#reset-language-stopword-list-entries-button'); + + let selectedLanguage = stopwordLanguageSelection.value; + let stopwordLength = this.data.stopwords[selectedLanguage].length; + let originalStopwordListLength = this.data.originalStopwords[selectedLanguage].length; + + resetLanguageStopwordListEntriesButton.classList.toggle('blue', stopwordLength !== originalStopwordListLength); + deleteLanguageStopwordListEntriesButton.classList.toggle('red', stopwordLength > 0); + resetLanguageStopwordListEntriesButton.style.cursor = stopwordLength !== originalStopwordListLength ? 'pointer' : 'default'; + deleteLanguageStopwordListEntriesButton.style.cursor = stopwordLength > 0 ? 'pointer' : 'default'; } + renderStopwordLanguageChipList(language, stopwords) { + let stopwordLanguageChipList = document.querySelector('#stopword-language-chip-list'); + stopwordLanguageChipList.innerHTML = ''; + for (let word of stopwords) { + let chipElement = Utils.HTMLToElement(`
${word}close
`); + chipElement.addEventListener('click', (event) => { + let removedListItem = event.target.closest('.chip').firstChild.textContent; + this.data.stopwords[language] = this.data.stopwords[language].filter(item => item !== removedListItem); + this.buttonRendering(); + }); + stopwordLanguageChipList.appendChild(chipElement); + } + this.buttonRendering(); + } + renderBoundsGraphic() { + let corpusData = this.data.corpus.o.staticData; + let boundsGraphicElement = document.querySelector('#bounds-graphic'); - renderBoundsGraphic(corpusData) { - let boundsGraphicElement = document.querySelector('#bounds-graphic'); + let graphData = []; + let texts = Object.entries(corpusData.s_attrs.text.lexicon); - let graphData = []; - let texts = Object.entries(corpusData.s_attrs.text.lexicon); + graphData = [{ + type: 'bar', + x: texts.map(text => text[1].bounds[1] - text[1].bounds[0]), + y: texts.map(text => corpusData.values.s_attrs.text[text[0]].title), + base: texts.map(text => text[1].bounds[0]), + text: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), + orientation: 'h', + hovertemplate: '%{base} - %{x}
%{y}', + showlegend: false + }]; - graphData = [{ - type: 'bar', - x: texts.map(text => text[1].bounds[1] - text[1].bounds[0]), - y: texts.map(text => corpusData.values.s_attrs.text[text[0]].title), - base: texts.map(text => text[1].bounds[0]), - text: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), - orientation: 'h', - hovertemplate: '%{base} - %{x}
%{y}', - showlegend: false - }]; + let graphLayout = { + barmode: 'stack', + type: 'bar', + showgrid: false, + xaxis: { + rangemode: 'nonnegative', + autorange: true + }, + yaxis: { + autorange: true, + showticklabels: false + } + }; - let graphLayout = { - barmode: 'stack', - type: 'bar', - showgrid: false, - xaxis: { - rangemode: 'nonnegative', - autorange: true - }, - yaxis: { - autorange: true, - showticklabels: false - } - }; - - let config = { - responsive: true, - modeBarButtonsToRemove: ['zoom2d', 'select2d', 'lasso2d', 'zoomIn2d', 'zoomOut2d', 'autoScale2d', 'resetScale2d'], - displaylogo: false - }; - - Plotly.newPlot(boundsGraphicElement, graphData, graphLayout, config); + let config = { + responsive: true, + modeBarButtonsToRemove: ['zoom2d', 'select2d', 'lasso2d', 'zoomIn2d', 'zoomOut2d', 'autoScale2d', 'resetScale2d'], + displaylogo: false + }; + + Plotly.newPlot(boundsGraphicElement, graphData, graphLayout, config); } } diff --git a/app/templates/corpora/analysis.html.j2 b/app/templates/corpora/analysis.html.j2 index b9a80c97..2bdb35f2 100644 --- a/app/templates/corpora/analysis.html.j2 +++ b/app/templates/corpora/analysis.html.j2 @@ -123,7 +123,7 @@ equalizer show_chart bubble_chart - settings + settings @@ -140,7 +140,6 @@ - @@ -166,18 +165,35 @@ -