diff --git a/app/static/js/CorpusAnalysis/CQiClient.js b/app/static/js/CorpusAnalysis/CQiClient.js index fcc0c87d..92cc6422 100644 --- a/app/static/js/CorpusAnalysis/CQiClient.js +++ b/app/static/js/CorpusAnalysis/CQiClient.js @@ -101,164 +101,188 @@ class CQiCorpus { getCorpusData() { return new Promise((resolve, reject) => { const dummyData = { - "num_tokens": 2000, // number of tokens in the corpus - "num_unique_words": 500, // number of unique words in the corpus - "num_unique_lemmas": 200, // number of unique lemmas in the corpus - "num_sentences": 90, // number of sentences in the corpus - "average_sentence_length": 11, // average number of tokens per sentence in the corpus - "num_ent_types": 30, // number of entities in the corpus - "num_unique_ent_types":10, - "ent_type_freqs": { - "str": 10, // number of ent_types with ent_type "str" - // ... - }, - "texts": [ - { - "num_tokens": 11, // number of tokens in the text - "num_unique_words": 12, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "str": "int", // number of tokens with word "str" - // ... + "corpus": { + "bounds": [1, 689], + "counts": { + "token": 743, + "ent": 321, + "s": 234 + }, + "freqs": { + "word": { + "1": 876, + "2": 234, + "3": 657 }, - "num_unique_lemmas": 15, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "str": "int", // number of tokens with lemma "str" - // ... + "lemma": { + "1": 543, + "2": 876, + "3": 321 }, - "num_sentences": 4, // number of sentences in the text - "average_sentence_length": 3, // average number of tokens per sentence in the text - "num_ent_types": 12, // number of ent_types in the text - "num_unique_ent_types": 28, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name", - "title": "Titel", - "publishing_year": 1950 - }, - { - "num_tokens": 15, // number of tokens in the text - "num_unique_words": 4, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "str": "int", // number of tokens with word "str" - // ... - }, - "num_unique_lemmas": 90, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "str": "int", // number of tokens with lemma "str" - // ... - }, - "num_sentences": 11, // number of sentences in the text - "average_sentence_length": 3, // average number of tokens per sentence in the text - "num_ent_types": 4, // number of ent_types in the text - "num_unique_ent_types": 300, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name", - "title": "Titel 1", - "publishing_year": 1962 - }, - { - "num_tokens": 11, // number of tokens in the text - "num_unique_words": 12, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "str": "int", // number of tokens with word "str" - // ... - }, - "num_unique_lemmas": 64, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "str": "int", // number of tokens with lemma "str" - // ... - }, - "num_sentences": 52, // number of sentences in the text - "average_sentence_length": 3, // average number of tokens per sentence in the text - "num_ent_types": 45, // number of ent_types in the text - "num_unique_ent_types": 68, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name", - "title": "Titel 2", - "publishing_year": 1850 - }, - { - "num_tokens": 56, // number of tokens in the text - "num_unique_words": 13, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "str": "int", // number of tokens with word "str" - // ... - }, - "num_unique_lemmas": 43, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "str": "int", // number of tokens with lemma "str" - // ... - }, - "num_sentences": 45, // number of sentences in the text - "average_sentence_length": 56, // average number of tokens per sentence in the text - "num_ent_types": 8792, // number of ent_types in the text - "num_unique_ent_types": 56758, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name", - "title": "Titel 3", - "publishing_year": 1504 - }, - { - "num_tokens": 54345, // number of tokens in the text - "num_unique_words": 561, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "str": "int", // number of tokens with word "str" - // ... - }, - "num_unique_lemmas": 546, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "str": "int", // number of tokens with lemma "str" - // ... - }, - "num_sentences": 5427, // number of sentences in the text - "average_sentence_length": 657, // average number of tokens per sentence in the text - "num_ent_types": 3465, // number of ent_types in the text - "num_unique_ent_types": 45, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name", - "title": "Titel 4", - "publishing_year": 1712 - }, - { - "num_tokens": 4354, // number of tokens in the text - "num_unique_words": 45234, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "testwort": 50, // number of tokens with word "str" - "testwort2": 1 - }, - "num_unique_lemmas": 15, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "testlemma": 11, // number of tokens with lemma "str" - "testlemma2": 1 - }, - "num_sentences": 90, // number of sentences in the text - "average_sentence_length": 7, // average number of tokens per sentence in the text - "num_ent_types": 19, - "num_unique_ent_types": 5, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name 2", - "title": "Titel 5", - "publishing_year": 1951 + "pos": { + "1": 456, + "2": 789, + "3": 234 + }, + "simple_pos": { + "1": 987, + "2": 876, + "3": 543 + }, + "ent": { + "1": 654, + "2": 321, + "3": 987 + } } - ] - }; + }, + "text": { + "1": { + "bounds": [0, 435], + "counts": { + "token": 345, + "ent_type": 123, + "s": 89 + }, + "freqs": { + "word": { + "1": 25, + "2": 90, + "3": 200 + }, + "lemma": { + "1": 654, + "2": 321, + "3": 987 + }, + "pos": { + "1": 543, + "2": 876, + "3": 234 + }, + "simple_pos": { + "1": 987, + "2": 654, + "3": 321 + }, + "ent_type": { + "1": 234, + "2": 789, + "3": 543 + } + }, + "values": { + "author": 1, + "publishing_year":1950, + "title": 1 + } + }, + "2": { + "bounds": [435, 689], + "counts": { + "token": 389, + "ent_type": 198, + "s": 145 + }, + "freqs": { + "word": { + "1": 60, + "2": 70, + "3": 100 + }, + "lemma": { + "1": 654, + "2": 321, + "3": 987 + }, + "pos": { + "1": 543, + "2": 876, + "3": 234 + }, + "simple_pos": { + "1": 987, + "2": 654, + "3": 321 + }, + "ent_type": { + "1": 234, + "2": 789, + "3": 543 + } + }, + "values": { + "author": 2, + "publishing_year":1951, + "title": 2 + } + } + }, + "s": { + "1": { + "bounds": [345, 678] + } + }, + "ent": { + "1": { + "bounds": [567, 890], + "values": { + "type": 789 + } + } + }, + "token": { + "310": { + "values": { + "word": 1, + "lemma": 2, + "pos": 1, + "simple_pos": 1 + } + } + }, + "value_lookups": { + "text": { + "author": { + "1": "John Doe", + "2": "Jane Smith" + }, + "title": { + "1": "Test Title 1", + "2": "Test Title 2" + } + }, + "ent": { + "type": { + "1": "Person", + "2": "Organization" + } + }, + "token": { + "word": { + "1": "apple", + "2": "banana", + "3": "orange" + }, + "lemma": { + "1": "run", + "2": "walk", + "3": "jump" + }, + "pos": { + "1": "noun", + "2": "verb", + "3": "adjective" + }, + "simple_pos": { + "1": "subject", + "2": "object", + "3": "predicate" + } + } + } + } + resolve(dummyData); /* diff --git a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js index cb012730..fbb91b4c 100644 --- a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js +++ b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js @@ -39,6 +39,8 @@ class CorpusAnalysisApp { this.renderGeneralCorpusInfo(corpusData); this.renderTextInfoList(corpusData); this.renderTextProportionsGraphic(corpusData); + this.renderWordFrequenciesGraphic(corpusData); + this.renderWordDistributionsGraphic(corpusData); }); // TODO: Don't do this hgere cQiCorpus.updateDb(); @@ -103,38 +105,85 @@ class CorpusAnalysisApp { } renderGeneralCorpusInfo(corpusData) { - let corpusGeneralInfoListElement = document.querySelector('.corpus-general-info-list'); - corpusGeneralInfoListElement.querySelector('.corpus-num-tokens').innerHTML = `Number of tokens: ${this.data.corpus.o.size}`; - corpusGeneralInfoListElement.querySelector('.corpus-text-count').innerHTML = `Corpus text count: ${corpusData.texts.length}`; - corpusGeneralInfoListElement.querySelector('.corpus-num-unique-words').innerHTML = `Corpus unique word count: ${corpusData.num_unique_words}`; - corpusGeneralInfoListElement.querySelector('.corpus-num-unique-lemmas').innerHTML = `Corpus unique lemma count: ${corpusData.num_unique_lemmas}`; - // corpusGeneralInfoListElement.querySelector('.corpus-most-frequent-words').innerHTML = `Corpus most frequent words: ${corpusData.most_frequent_words.join(', '); - corpusGeneralInfoListElement.querySelector('.corpus-num-sentences').innerHTML = `Corpus sentence count: ${corpusData.num_sentences}`; - corpusGeneralInfoListElement.querySelector('.corpus-average-sentence-length').innerHTML = `Corpus average sentence length: ${corpusData.average_sentence_length}`; - corpusGeneralInfoListElement.querySelector('.corpus-num-ent-types').innerHTML = `Corpus entity count: ${corpusData.num_ent_types}`; - corpusGeneralInfoListElement.querySelector('.corpus-num-unique-ent-types').innerHTML = `Corpus unique entity count: ${corpusData.num_unique_ent_types}`; + document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token; + document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s; + // corpusGeneralInfoListElement.querySelector('.corpus-text-count').innerHTML = Corpus text count: ${Object.entries(corpusData.text).length; + document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length; + document.querySelector('.corpus-num-unique-lemmas').innerHTML = Object.entries(corpusData.corpus.freqs.lemma).length; + document.querySelector('.corpus-num-unique-pos').innerHTML = Object.entries(corpusData.corpus.freqs.pos).length; + document.querySelector('.corpus-num-unique-simple-pos').innerHTML = Object.entries(corpusData.corpus.freqs.simple_pos).length; } renderTextInfoList(corpusData) { - let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list'); - let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement); - corpusTextInfoList.add(corpusData.texts); - + // let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list'); + // let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement); + // for (let text of Object.values(corpusData.text)) { + // text.values.title = corpusData.value_lookups.text.title[text.values.title]; + // } + // corpusTextInfoList.add(Object.values(corpusData.text)); + + // let textCountChipElement = document.querySelector('.text-count-chip'); + // textCountChipElement.innerHTML = `Text count: ${Object.values(corpusData.text).length}`; } renderTextProportionsGraphic(corpusData) { - let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic'); - let graphData = [ - { - values: corpusData.texts.map(text => text.num_tokens), - labels: corpusData.texts.map(text => `${text.title} (${text.publishing_year})`), - type: 'pie' - } - ]; - let graphLayout = { - height: 400, - width: 500 - }; - Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout); + // let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic'); + // let texts = Object.values(corpusData.text); + // let graphData = [ + // { + // values: texts.map(text => text.counts.token), + // labels: texts.map(text => `${text.values.title} (${text.values.publishing_year})`), + // type: 'pie' + // } + // ]; + // let graphLayout = { + // height: 400, + // width: 500 + // }; + // Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout); + } + + renderWordFrequenciesGraphic(corpusData) { + // let wordFrequenciesGraphicElement = document.querySelector('#word-frequencies-graphic'); + // let words = Object.entries(corpusData.value_lookups.token.word); + // let texts = Object.values(corpusData.text); + // let graphData = []; + // for (let word of words) { + // let data = { + // x: texts.map(text => `${text.values.title} (${text.values.publishing_year})`), + // y: texts.map(text => text.freqs.word[word[0]]), + // name: word[1], + // type: 'bar' + // }; + // graphData.push(data); + // } + + // let graphLayout = { + // height: 400, + // width: 500, + // barmode: 'stack', + // type: 'bar' + // }; + // Plotly.newPlot(wordFrequenciesGraphicElement, graphData, graphLayout); + } + + renderWordDistributionsGraphic(corpusData) { + // let wordDistributionGraphicElement = document.querySelector('#word-distributions-graphic'); + // var trace1 = { + // x: [1, 2, 3, 4], + // y: [10, 11, 12, 13], + // mode: 'markers', + // marker: { + // size: [40, 60, 80, 100] + // } + // }; + // var data = [trace1]; + // var layout = { + // title: 'Marker Size', + // showlegend: false, + // height: 600, + // width: 600 + // }; + // Plotly.newPlot(wordDistributionGraphicElement, data, layout); } } diff --git a/app/static/js/ResourceLists/CorpusTextInfoList.js b/app/static/js/ResourceLists/CorpusTextInfoList.js index 3e697d2d..6e8e8310 100644 --- a/app/static/js/ResourceLists/CorpusTextInfoList.js +++ b/app/static/js/ResourceLists/CorpusTextInfoList.js @@ -29,11 +29,11 @@ class CorpusTextInfoList extends ResourceList {
Number of tokens
+Number of sentences
+ +Number of unique words
+ +Number of unique lemmas
+ +Number of unique pos
+ +Number of unique simple_pos
+ +