diff --git a/app/static/js/CorpusAnalysis/CQiClient.js b/app/static/js/CorpusAnalysis/CQiClient.js index fcc0c87d..92cc6422 100644 --- a/app/static/js/CorpusAnalysis/CQiClient.js +++ b/app/static/js/CorpusAnalysis/CQiClient.js @@ -101,164 +101,188 @@ class CQiCorpus { getCorpusData() { return new Promise((resolve, reject) => { const dummyData = { - "num_tokens": 2000, // number of tokens in the corpus - "num_unique_words": 500, // number of unique words in the corpus - "num_unique_lemmas": 200, // number of unique lemmas in the corpus - "num_sentences": 90, // number of sentences in the corpus - "average_sentence_length": 11, // average number of tokens per sentence in the corpus - "num_ent_types": 30, // number of entities in the corpus - "num_unique_ent_types":10, - "ent_type_freqs": { - "str": 10, // number of ent_types with ent_type "str" - // ... - }, - "texts": [ - { - "num_tokens": 11, // number of tokens in the text - "num_unique_words": 12, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "str": "int", // number of tokens with word "str" - // ... + "corpus": { + "bounds": [1, 689], + "counts": { + "token": 743, + "ent": 321, + "s": 234 + }, + "freqs": { + "word": { + "1": 876, + "2": 234, + "3": 657 }, - "num_unique_lemmas": 15, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "str": "int", // number of tokens with lemma "str" - // ... + "lemma": { + "1": 543, + "2": 876, + "3": 321 }, - "num_sentences": 4, // number of sentences in the text - "average_sentence_length": 3, // average number of tokens per sentence in the text - "num_ent_types": 12, // number of ent_types in the text - "num_unique_ent_types": 28, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name", - "title": "Titel", - "publishing_year": 1950 - }, - { - "num_tokens": 15, // number of tokens in the text - "num_unique_words": 4, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "str": "int", // number of tokens with word "str" - // ... - }, - "num_unique_lemmas": 90, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "str": "int", // number of tokens with lemma "str" - // ... - }, - "num_sentences": 11, // number of sentences in the text - "average_sentence_length": 3, // average number of tokens per sentence in the text - "num_ent_types": 4, // number of ent_types in the text - "num_unique_ent_types": 300, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name", - "title": "Titel 1", - "publishing_year": 1962 - }, - { - "num_tokens": 11, // number of tokens in the text - "num_unique_words": 12, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "str": "int", // number of tokens with word "str" - // ... - }, - "num_unique_lemmas": 64, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "str": "int", // number of tokens with lemma "str" - // ... - }, - "num_sentences": 52, // number of sentences in the text - "average_sentence_length": 3, // average number of tokens per sentence in the text - "num_ent_types": 45, // number of ent_types in the text - "num_unique_ent_types": 68, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name", - "title": "Titel 2", - "publishing_year": 1850 - }, - { - "num_tokens": 56, // number of tokens in the text - "num_unique_words": 13, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "str": "int", // number of tokens with word "str" - // ... - }, - "num_unique_lemmas": 43, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "str": "int", // number of tokens with lemma "str" - // ... - }, - "num_sentences": 45, // number of sentences in the text - "average_sentence_length": 56, // average number of tokens per sentence in the text - "num_ent_types": 8792, // number of ent_types in the text - "num_unique_ent_types": 56758, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name", - "title": "Titel 3", - "publishing_year": 1504 - }, - { - "num_tokens": 54345, // number of tokens in the text - "num_unique_words": 561, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "str": "int", // number of tokens with word "str" - // ... - }, - "num_unique_lemmas": 546, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "str": "int", // number of tokens with lemma "str" - // ... - }, - "num_sentences": 5427, // number of sentences in the text - "average_sentence_length": 657, // average number of tokens per sentence in the text - "num_ent_types": 3465, // number of ent_types in the text - "num_unique_ent_types": 45, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name", - "title": "Titel 4", - "publishing_year": 1712 - }, - { - "num_tokens": 4354, // number of tokens in the text - "num_unique_words": 45234, // number of unique words in the text - "word_freqs": { // frequency of unique words in the text (sorted by frequency) - "testwort": 50, // number of tokens with word "str" - "testwort2": 1 - }, - "num_unique_lemmas": 15, // number of unique lemmas in the text - "lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency) - "testlemma": 11, // number of tokens with lemma "str" - "testlemma2": 1 - }, - "num_sentences": 90, // number of sentences in the text - "average_sentence_length": 7, // average number of tokens per sentence in the text - "num_ent_types": 19, - "num_unique_ent_types": 5, // number of unique ent_types in the text - "num_entities_by_id": { - "1": "int", // number of entities with id 1 - // ... - }, - "author": "Author Name 2", - "title": "Titel 5", - "publishing_year": 1951 + "pos": { + "1": 456, + "2": 789, + "3": 234 + }, + "simple_pos": { + "1": 987, + "2": 876, + "3": 543 + }, + "ent": { + "1": 654, + "2": 321, + "3": 987 + } } - ] - }; + }, + "text": { + "1": { + "bounds": [0, 435], + "counts": { + "token": 345, + "ent_type": 123, + "s": 89 + }, + "freqs": { + "word": { + "1": 25, + "2": 90, + "3": 200 + }, + "lemma": { + "1": 654, + "2": 321, + "3": 987 + }, + "pos": { + "1": 543, + "2": 876, + "3": 234 + }, + "simple_pos": { + "1": 987, + "2": 654, + "3": 321 + }, + "ent_type": { + "1": 234, + "2": 789, + "3": 543 + } + }, + "values": { + "author": 1, + "publishing_year":1950, + "title": 1 + } + }, + "2": { + "bounds": [435, 689], + "counts": { + "token": 389, + "ent_type": 198, + "s": 145 + }, + "freqs": { + "word": { + "1": 60, + "2": 70, + "3": 100 + }, + "lemma": { + "1": 654, + "2": 321, + "3": 987 + }, + "pos": { + "1": 543, + "2": 876, + "3": 234 + }, + "simple_pos": { + "1": 987, + "2": 654, + "3": 321 + }, + "ent_type": { + "1": 234, + "2": 789, + "3": 543 + } + }, + "values": { + "author": 2, + "publishing_year":1951, + "title": 2 + } + } + }, + "s": { + "1": { + "bounds": [345, 678] + } + }, + "ent": { + "1": { + "bounds": [567, 890], + "values": { + "type": 789 + } + } + }, + "token": { + "310": { + "values": { + "word": 1, + "lemma": 2, + "pos": 1, + "simple_pos": 1 + } + } + }, + "value_lookups": { + "text": { + "author": { + "1": "John Doe", + "2": "Jane Smith" + }, + "title": { + "1": "Test Title 1", + "2": "Test Title 2" + } + }, + "ent": { + "type": { + "1": "Person", + "2": "Organization" + } + }, + "token": { + "word": { + "1": "apple", + "2": "banana", + "3": "orange" + }, + "lemma": { + "1": "run", + "2": "walk", + "3": "jump" + }, + "pos": { + "1": "noun", + "2": "verb", + "3": "adjective" + }, + "simple_pos": { + "1": "subject", + "2": "object", + "3": "predicate" + } + } + } + } + resolve(dummyData); /* diff --git a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js index cb012730..fbb91b4c 100644 --- a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js +++ b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js @@ -39,6 +39,8 @@ class CorpusAnalysisApp { this.renderGeneralCorpusInfo(corpusData); this.renderTextInfoList(corpusData); this.renderTextProportionsGraphic(corpusData); + this.renderWordFrequenciesGraphic(corpusData); + this.renderWordDistributionsGraphic(corpusData); }); // TODO: Don't do this hgere cQiCorpus.updateDb(); @@ -103,38 +105,85 @@ class CorpusAnalysisApp { } renderGeneralCorpusInfo(corpusData) { - let corpusGeneralInfoListElement = document.querySelector('.corpus-general-info-list'); - corpusGeneralInfoListElement.querySelector('.corpus-num-tokens').innerHTML = `Number of tokens: ${this.data.corpus.o.size}`; - corpusGeneralInfoListElement.querySelector('.corpus-text-count').innerHTML = `Corpus text count: ${corpusData.texts.length}`; - corpusGeneralInfoListElement.querySelector('.corpus-num-unique-words').innerHTML = `Corpus unique word count: ${corpusData.num_unique_words}`; - corpusGeneralInfoListElement.querySelector('.corpus-num-unique-lemmas').innerHTML = `Corpus unique lemma count: ${corpusData.num_unique_lemmas}`; - // corpusGeneralInfoListElement.querySelector('.corpus-most-frequent-words').innerHTML = `Corpus most frequent words: ${corpusData.most_frequent_words.join(', '); - corpusGeneralInfoListElement.querySelector('.corpus-num-sentences').innerHTML = `Corpus sentence count: ${corpusData.num_sentences}`; - corpusGeneralInfoListElement.querySelector('.corpus-average-sentence-length').innerHTML = `Corpus average sentence length: ${corpusData.average_sentence_length}`; - corpusGeneralInfoListElement.querySelector('.corpus-num-ent-types').innerHTML = `Corpus entity count: ${corpusData.num_ent_types}`; - corpusGeneralInfoListElement.querySelector('.corpus-num-unique-ent-types').innerHTML = `Corpus unique entity count: ${corpusData.num_unique_ent_types}`; + document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token; + document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s; + // corpusGeneralInfoListElement.querySelector('.corpus-text-count').innerHTML = Corpus text count: ${Object.entries(corpusData.text).length; + document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length; + document.querySelector('.corpus-num-unique-lemmas').innerHTML = Object.entries(corpusData.corpus.freqs.lemma).length; + document.querySelector('.corpus-num-unique-pos').innerHTML = Object.entries(corpusData.corpus.freqs.pos).length; + document.querySelector('.corpus-num-unique-simple-pos').innerHTML = Object.entries(corpusData.corpus.freqs.simple_pos).length; } renderTextInfoList(corpusData) { - let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list'); - let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement); - corpusTextInfoList.add(corpusData.texts); - + // let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list'); + // let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement); + // for (let text of Object.values(corpusData.text)) { + // text.values.title = corpusData.value_lookups.text.title[text.values.title]; + // } + // corpusTextInfoList.add(Object.values(corpusData.text)); + + // let textCountChipElement = document.querySelector('.text-count-chip'); + // textCountChipElement.innerHTML = `Text count: ${Object.values(corpusData.text).length}`; } renderTextProportionsGraphic(corpusData) { - let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic'); - let graphData = [ - { - values: corpusData.texts.map(text => text.num_tokens), - labels: corpusData.texts.map(text => `${text.title} (${text.publishing_year})`), - type: 'pie' - } - ]; - let graphLayout = { - height: 400, - width: 500 - }; - Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout); + // let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic'); + // let texts = Object.values(corpusData.text); + // let graphData = [ + // { + // values: texts.map(text => text.counts.token), + // labels: texts.map(text => `${text.values.title} (${text.values.publishing_year})`), + // type: 'pie' + // } + // ]; + // let graphLayout = { + // height: 400, + // width: 500 + // }; + // Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout); + } + + renderWordFrequenciesGraphic(corpusData) { + // let wordFrequenciesGraphicElement = document.querySelector('#word-frequencies-graphic'); + // let words = Object.entries(corpusData.value_lookups.token.word); + // let texts = Object.values(corpusData.text); + // let graphData = []; + // for (let word of words) { + // let data = { + // x: texts.map(text => `${text.values.title} (${text.values.publishing_year})`), + // y: texts.map(text => text.freqs.word[word[0]]), + // name: word[1], + // type: 'bar' + // }; + // graphData.push(data); + // } + + // let graphLayout = { + // height: 400, + // width: 500, + // barmode: 'stack', + // type: 'bar' + // }; + // Plotly.newPlot(wordFrequenciesGraphicElement, graphData, graphLayout); + } + + renderWordDistributionsGraphic(corpusData) { + // let wordDistributionGraphicElement = document.querySelector('#word-distributions-graphic'); + // var trace1 = { + // x: [1, 2, 3, 4], + // y: [10, 11, 12, 13], + // mode: 'markers', + // marker: { + // size: [40, 60, 80, 100] + // } + // }; + // var data = [trace1]; + // var layout = { + // title: 'Marker Size', + // showlegend: false, + // height: 600, + // width: 600 + // }; + // Plotly.newPlot(wordDistributionGraphicElement, data, layout); } } diff --git a/app/static/js/ResourceLists/CorpusTextInfoList.js b/app/static/js/ResourceLists/CorpusTextInfoList.js index 3e697d2d..6e8e8310 100644 --- a/app/static/js/ResourceLists/CorpusTextInfoList.js +++ b/app/static/js/ResourceLists/CorpusTextInfoList.js @@ -29,11 +29,11 @@ class CorpusTextInfoList extends ResourceList { () + - - - + + `.trim(); } @@ -44,11 +44,11 @@ class CorpusTextInfoList extends ResourceList { 'title', 'publishing_year', 'num_tokens', + 'num_sentences', 'num_unique_words', 'num_unique_lemmas', - 'num_sentences', - 'average_sentence_length', - 'num_unique_ent_types' + 'num_unique_pos', + 'num_unique_simple_pos' ]; } @@ -68,11 +68,11 @@ class CorpusTextInfoList extends ResourceList { Textarrow_drop_down Number of tokensarrow_drop_down + Number of sentencesarrow_drop_down Number of unique wordsarrow_drop_down Number of unique lemmasarrow_drop_down - Number of sentencesarrow_drop_down - Average sentence lengtharrow_drop_down - Number of unique entity typesarrow_drop_down + Number of unique posarrow_drop_down + Number of unique simple posarrow_drop_down @@ -83,14 +83,14 @@ class CorpusTextInfoList extends ResourceList { mapResourceToValue(corpusTextData) { return { - title: corpusTextData.title, - publishing_year: corpusTextData.publishing_year, - num_tokens: corpusTextData.num_tokens, - num_unique_words: corpusTextData.num_unique_words, - num_unique_lemmas: corpusTextData.num_unique_lemmas, - num_sentences: corpusTextData.num_sentences, - average_sentence_length: corpusTextData.average_sentence_length, - num_unique_ent_types: corpusTextData.num_unique_ent_types + title: corpusTextData.values.title, + publishing_year: corpusTextData.values.publishing_year, + num_tokens: corpusTextData.counts.token, + num_sentences: corpusTextData.counts.s, + num_unique_words: Object.entries(corpusTextData.freqs.word).length, + num_unique_lemmas: Object.entries(corpusTextData.freqs.lemma).length, + num_unique_pos: Object.entries(corpusTextData.freqs.pos).length, + num_unique_simple_pos: Object.entries(corpusTextData.freqs.simple_pos).length }; } diff --git a/app/templates/corpora/analysis.html.j2 b/app/templates/corpora/analysis.html.j2 index d10126cd..1452d2d0 100644 --- a/app/templates/corpora/analysis.html.j2 +++ b/app/templates/corpora/analysis.html.j2 @@ -35,44 +35,70 @@

query_statsVisualizations

-
-
-
- General information about the Corpus -

-
-
    -
  • -
    -
  • -
    -
  • -
    -
  • -
    -
  • -
    -
  • -
    -
  • -
    -
  • -
    -
+
+
+
+
+
+

Number of tokens

+
-
+
+
+
+

Number of sentences

+ +
+
+
+
+
+
+

Number of unique words

+ +
+
+
+
+
+
+

Number of unique lemmas

+ +
+
+
+
+
+
+

Number of unique pos

+ +
+
+
+
+
+
+

Number of unique simple_pos

+ +
+
+
+
+
+
Text information +
-
+
Text proportions within the corpus @@ -80,7 +106,7 @@
-
+
Word frequencies @@ -88,6 +114,14 @@
+
+
+
+ Word distributions +
+
+
+