visualization testing

This commit is contained in:
Inga Kirschnick 2023-06-19 13:41:56 +02:00
parent 91e68360ac
commit 11e1789d83
4 changed files with 333 additions and 226 deletions

View File

@ -101,164 +101,188 @@ class CQiCorpus {
getCorpusData() {
return new Promise((resolve, reject) => {
const dummyData = {
"num_tokens": 2000, // number of tokens in the corpus
"num_unique_words": 500, // number of unique words in the corpus
"num_unique_lemmas": 200, // number of unique lemmas in the corpus
"num_sentences": 90, // number of sentences in the corpus
"average_sentence_length": 11, // average number of tokens per sentence in the corpus
"num_ent_types": 30, // number of entities in the corpus
"num_unique_ent_types":10,
"ent_type_freqs": {
"str": 10, // number of ent_types with ent_type "str"
// ...
},
"texts": [
{
"num_tokens": 11, // number of tokens in the text
"num_unique_words": 12, // number of unique words in the text
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
"str": "int", // number of tokens with word "str"
// ...
"corpus": {
"bounds": [1, 689],
"counts": {
"token": 743,
"ent": 321,
"s": 234
},
"freqs": {
"word": {
"1": 876,
"2": 234,
"3": 657
},
"num_unique_lemmas": 15, // number of unique lemmas in the text
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
"str": "int", // number of tokens with lemma "str"
// ...
"lemma": {
"1": 543,
"2": 876,
"3": 321
},
"num_sentences": 4, // number of sentences in the text
"average_sentence_length": 3, // average number of tokens per sentence in the text
"num_ent_types": 12, // number of ent_types in the text
"num_unique_ent_types": 28, // number of unique ent_types in the text
"num_entities_by_id": {
"1": "int", // number of entities with id 1
// ...
},
"author": "Author Name",
"title": "Titel",
"publishing_year": 1950
},
{
"num_tokens": 15, // number of tokens in the text
"num_unique_words": 4, // number of unique words in the text
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
"str": "int", // number of tokens with word "str"
// ...
},
"num_unique_lemmas": 90, // number of unique lemmas in the text
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
"str": "int", // number of tokens with lemma "str"
// ...
},
"num_sentences": 11, // number of sentences in the text
"average_sentence_length": 3, // average number of tokens per sentence in the text
"num_ent_types": 4, // number of ent_types in the text
"num_unique_ent_types": 300, // number of unique ent_types in the text
"num_entities_by_id": {
"1": "int", // number of entities with id 1
// ...
},
"author": "Author Name",
"title": "Titel 1",
"publishing_year": 1962
},
{
"num_tokens": 11, // number of tokens in the text
"num_unique_words": 12, // number of unique words in the text
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
"str": "int", // number of tokens with word "str"
// ...
},
"num_unique_lemmas": 64, // number of unique lemmas in the text
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
"str": "int", // number of tokens with lemma "str"
// ...
},
"num_sentences": 52, // number of sentences in the text
"average_sentence_length": 3, // average number of tokens per sentence in the text
"num_ent_types": 45, // number of ent_types in the text
"num_unique_ent_types": 68, // number of unique ent_types in the text
"num_entities_by_id": {
"1": "int", // number of entities with id 1
// ...
},
"author": "Author Name",
"title": "Titel 2",
"publishing_year": 1850
},
{
"num_tokens": 56, // number of tokens in the text
"num_unique_words": 13, // number of unique words in the text
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
"str": "int", // number of tokens with word "str"
// ...
},
"num_unique_lemmas": 43, // number of unique lemmas in the text
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
"str": "int", // number of tokens with lemma "str"
// ...
},
"num_sentences": 45, // number of sentences in the text
"average_sentence_length": 56, // average number of tokens per sentence in the text
"num_ent_types": 8792, // number of ent_types in the text
"num_unique_ent_types": 56758, // number of unique ent_types in the text
"num_entities_by_id": {
"1": "int", // number of entities with id 1
// ...
},
"author": "Author Name",
"title": "Titel 3",
"publishing_year": 1504
},
{
"num_tokens": 54345, // number of tokens in the text
"num_unique_words": 561, // number of unique words in the text
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
"str": "int", // number of tokens with word "str"
// ...
},
"num_unique_lemmas": 546, // number of unique lemmas in the text
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
"str": "int", // number of tokens with lemma "str"
// ...
},
"num_sentences": 5427, // number of sentences in the text
"average_sentence_length": 657, // average number of tokens per sentence in the text
"num_ent_types": 3465, // number of ent_types in the text
"num_unique_ent_types": 45, // number of unique ent_types in the text
"num_entities_by_id": {
"1": "int", // number of entities with id 1
// ...
},
"author": "Author Name",
"title": "Titel 4",
"publishing_year": 1712
},
{
"num_tokens": 4354, // number of tokens in the text
"num_unique_words": 45234, // number of unique words in the text
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
"testwort": 50, // number of tokens with word "str"
"testwort2": 1
},
"num_unique_lemmas": 15, // number of unique lemmas in the text
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
"testlemma": 11, // number of tokens with lemma "str"
"testlemma2": 1
},
"num_sentences": 90, // number of sentences in the text
"average_sentence_length": 7, // average number of tokens per sentence in the text
"num_ent_types": 19,
"num_unique_ent_types": 5, // number of unique ent_types in the text
"num_entities_by_id": {
"1": "int", // number of entities with id 1
// ...
},
"author": "Author Name 2",
"title": "Titel 5",
"publishing_year": 1951
"pos": {
"1": 456,
"2": 789,
"3": 234
},
"simple_pos": {
"1": 987,
"2": 876,
"3": 543
},
"ent": {
"1": 654,
"2": 321,
"3": 987
}
}
]
};
},
"text": {
"1": {
"bounds": [0, 435],
"counts": {
"token": 345,
"ent_type": 123,
"s": 89
},
"freqs": {
"word": {
"1": 25,
"2": 90,
"3": 200
},
"lemma": {
"1": 654,
"2": 321,
"3": 987
},
"pos": {
"1": 543,
"2": 876,
"3": 234
},
"simple_pos": {
"1": 987,
"2": 654,
"3": 321
},
"ent_type": {
"1": 234,
"2": 789,
"3": 543
}
},
"values": {
"author": 1,
"publishing_year":1950,
"title": 1
}
},
"2": {
"bounds": [435, 689],
"counts": {
"token": 389,
"ent_type": 198,
"s": 145
},
"freqs": {
"word": {
"1": 60,
"2": 70,
"3": 100
},
"lemma": {
"1": 654,
"2": 321,
"3": 987
},
"pos": {
"1": 543,
"2": 876,
"3": 234
},
"simple_pos": {
"1": 987,
"2": 654,
"3": 321
},
"ent_type": {
"1": 234,
"2": 789,
"3": 543
}
},
"values": {
"author": 2,
"publishing_year":1951,
"title": 2
}
}
},
"s": {
"1": {
"bounds": [345, 678]
}
},
"ent": {
"1": {
"bounds": [567, 890],
"values": {
"type": 789
}
}
},
"token": {
"310": {
"values": {
"word": 1,
"lemma": 2,
"pos": 1,
"simple_pos": 1
}
}
},
"value_lookups": {
"text": {
"author": {
"1": "John Doe",
"2": "Jane Smith"
},
"title": {
"1": "Test Title 1",
"2": "Test Title 2"
}
},
"ent": {
"type": {
"1": "Person",
"2": "Organization"
}
},
"token": {
"word": {
"1": "apple",
"2": "banana",
"3": "orange"
},
"lemma": {
"1": "run",
"2": "walk",
"3": "jump"
},
"pos": {
"1": "noun",
"2": "verb",
"3": "adjective"
},
"simple_pos": {
"1": "subject",
"2": "object",
"3": "predicate"
}
}
}
}
resolve(dummyData);
/*

View File

@ -39,6 +39,8 @@ class CorpusAnalysisApp {
this.renderGeneralCorpusInfo(corpusData);
this.renderTextInfoList(corpusData);
this.renderTextProportionsGraphic(corpusData);
this.renderWordFrequenciesGraphic(corpusData);
this.renderWordDistributionsGraphic(corpusData);
});
// TODO: Don't do this here
cQiCorpus.updateDb();
@ -103,38 +105,85 @@ class CorpusAnalysisApp {
}
renderGeneralCorpusInfo(corpusData) {
let corpusGeneralInfoListElement = document.querySelector('.corpus-general-info-list');
corpusGeneralInfoListElement.querySelector('.corpus-num-tokens').innerHTML = `<b>Number of tokens:</b> ${this.data.corpus.o.size}`;
corpusGeneralInfoListElement.querySelector('.corpus-text-count').innerHTML = `<b>Corpus text count:</b> ${corpusData.texts.length}`;
corpusGeneralInfoListElement.querySelector('.corpus-num-unique-words').innerHTML = `<b>Corpus unique word count:</b> ${corpusData.num_unique_words}`;
corpusGeneralInfoListElement.querySelector('.corpus-num-unique-lemmas').innerHTML = `<b>Corpus unique lemma count:</b> ${corpusData.num_unique_lemmas}`;
// corpusGeneralInfoListElement.querySelector('.corpus-most-frequent-words').innerHTML = `<b>Corpus most frequent words:</b> ${corpusData.most_frequent_words.join(', ');
corpusGeneralInfoListElement.querySelector('.corpus-num-sentences').innerHTML = `<b>Corpus sentence count:</b> ${corpusData.num_sentences}`;
corpusGeneralInfoListElement.querySelector('.corpus-average-sentence-length').innerHTML = `<b>Corpus average sentence length:</b> ${corpusData.average_sentence_length}`;
corpusGeneralInfoListElement.querySelector('.corpus-num-ent-types').innerHTML = `<b>Corpus entity count:</b> ${corpusData.num_ent_types}`;
corpusGeneralInfoListElement.querySelector('.corpus-num-unique-ent-types').innerHTML = `<b>Corpus unique entity count:</b> ${corpusData.num_unique_ent_types}`;
document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token;
document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s;
// corpusGeneralInfoListElement.querySelector('.corpus-text-count').innerHTML = <b>Corpus text count:</b> ${Object.entries(corpusData.text).length;
document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length;
document.querySelector('.corpus-num-unique-lemmas').innerHTML = Object.entries(corpusData.corpus.freqs.lemma).length;
document.querySelector('.corpus-num-unique-pos').innerHTML = Object.entries(corpusData.corpus.freqs.pos).length;
document.querySelector('.corpus-num-unique-simple-pos').innerHTML = Object.entries(corpusData.corpus.freqs.simple_pos).length;
}
renderTextInfoList(corpusData) {
let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list');
let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement);
corpusTextInfoList.add(corpusData.texts);
// let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list');
// let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement);
// for (let text of Object.values(corpusData.text)) {
// text.values.title = corpusData.value_lookups.text.title[text.values.title];
// }
// corpusTextInfoList.add(Object.values(corpusData.text));
// let textCountChipElement = document.querySelector('.text-count-chip');
// textCountChipElement.innerHTML = `Text count: ${Object.values(corpusData.text).length}`;
}
renderTextProportionsGraphic(corpusData) {
let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic');
let graphData = [
{
values: corpusData.texts.map(text => text.num_tokens),
labels: corpusData.texts.map(text => `${text.title} (${text.publishing_year})`),
type: 'pie'
}
];
let graphLayout = {
height: 400,
width: 500
};
Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout);
// let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic');
// let texts = Object.values(corpusData.text);
// let graphData = [
// {
// values: texts.map(text => text.counts.token),
// labels: texts.map(text => `${text.values.title} (${text.values.publishing_year})`),
// type: 'pie'
// }
// ];
// let graphLayout = {
// height: 400,
// width: 500
// };
// Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout);
}
/**
 * Render the word-frequencies graphic.
 *
 * Currently a no-op: the whole implementation below is commented out, so
 * calling this method has no effect and `corpusData` is not read.
 *
 * The disabled draft builds one Plotly bar trace per entry of
 * `corpusData.value_lookups.token.word`, with one bar per text in
 * `corpusData.text`, and draws them stacked (`barmode: 'stack'`) into the
 * `#word-frequencies-graphic` element.
 *
 * @param {Object} corpusData - Corpus data object; unused while the body is disabled.
 */
renderWordFrequenciesGraphic(corpusData) {
// Disabled draft implementation (kept for reference):
// let wordFrequenciesGraphicElement = document.querySelector('#word-frequencies-graphic');
// let words = Object.entries(corpusData.value_lookups.token.word);
// let texts = Object.values(corpusData.text);
// let graphData = [];
// for (let word of words) {
// let data = {
// x: texts.map(text => `${text.values.title} (${text.values.publishing_year})`),
// y: texts.map(text => text.freqs.word[word[0]]),
// name: word[1],
// type: 'bar'
// };
// graphData.push(data);
// }
// let graphLayout = {
// height: 400,
// width: 500,
// barmode: 'stack',
// type: 'bar'
// };
// Plotly.newPlot(wordFrequenciesGraphicElement, graphData, graphLayout);
}
/**
 * Render the word-distributions graphic.
 *
 * Currently a no-op: the whole implementation below is commented out, so
 * calling this method has no effect and `corpusData` is not read.
 *
 * The disabled draft is a placeholder only: it plots a single hard-coded
 * marker trace (fixed x/y values and marker sizes) into the
 * `#word-distributions-graphic` element and does not use `corpusData` at all.
 *
 * @param {Object} corpusData - Corpus data object; unused while the body is disabled.
 */
renderWordDistributionsGraphic(corpusData) {
// Disabled placeholder implementation (kept for reference):
// let wordDistributionGraphicElement = document.querySelector('#word-distributions-graphic');
// var trace1 = {
// x: [1, 2, 3, 4],
// y: [10, 11, 12, 13],
// mode: 'markers',
// marker: {
// size: [40, 60, 80, 100]
// }
// };
// var data = [trace1];
// var layout = {
// title: 'Marker Size',
// showlegend: false,
// height: 600,
// width: 600
// };
// Plotly.newPlot(wordDistributionGraphicElement, data, layout);
}
}

View File

@ -29,11 +29,11 @@ class CorpusTextInfoList extends ResourceList {
<tr class="list-item clickable hoverable">
<td><span class="title"></span> (<span class="publishing_year"></span>)</td>
<td><span class="num_tokens"></span></td>
<td><span class="num_sentences"></span></td>
<td><span class="num_unique_words"></span></td>
<td><span class="num_unique_lemmas"></span></td>
<td><span class="num_sentences"></span></td>
<td><span class="average_sentence_length"></span></td>
<td><span class="num_unique_ent_types"></span></td>
<td><span class="num_unique_pos"></span></td>
<td><span class="num_unique_simple_pos"></span></td>
</tr>
`.trim();
}
@ -44,11 +44,11 @@ class CorpusTextInfoList extends ResourceList {
'title',
'publishing_year',
'num_tokens',
'num_sentences',
'num_unique_words',
'num_unique_lemmas',
'num_sentences',
'average_sentence_length',
'num_unique_ent_types'
'num_unique_pos',
'num_unique_simple_pos'
];
}
@ -68,11 +68,11 @@ class CorpusTextInfoList extends ResourceList {
<tr>
<th>Text<span class="sort right material-icons" data-sort="title" style="cursor:pointer; color:#aa9cc9">arrow_drop_down</span></th>
<th>Number of tokens<span class="sort right material-icons" data-sort="num_tokens" style="cursor:pointer">arrow_drop_down</span></th>
<th>Number of sentences<span class="sort right material-icons" data-sort="num_sentences" style="cursor:pointer">arrow_drop_down</span></th>
<th>Number of unique words<span class="sort right material-icons" data-sort="num_unique_words" style="cursor:pointer">arrow_drop_down</span></th>
<th>Number of unique lemmas<span class="sort right material-icons" data-sort="num_unique_lemmas" style="cursor:pointer">arrow_drop_down</span></th>
<th>Number of sentences<span class="sort right material-icons" data-sort="num_sentences" style="cursor:pointer">arrow_drop_down</span></th>
<th>Average sentence length<span class="sort right material-icons" data-sort="average_sentence_length" style="cursor:pointer">arrow_drop_down</span></th>
<th>Number of unique entity types<span class="sort right material-icons" data-sort="num_unique_ent_types" style="cursor:pointer">arrow_drop_down</span></th>
<th>Number of unique pos<span class="sort right material-icons" data-sort="num_unique_pos" style="cursor:pointer">arrow_drop_down</span></th>
<th>Number of unique simple pos<span class="sort right material-icons" data-sort="num_unique_simple_pos" style="cursor:pointer">arrow_drop_down</span></th>
</tr>
</thead>
<tbody class="list"></tbody>
@ -83,14 +83,14 @@ class CorpusTextInfoList extends ResourceList {
mapResourceToValue(corpusTextData) {
return {
title: corpusTextData.title,
publishing_year: corpusTextData.publishing_year,
num_tokens: corpusTextData.num_tokens,
num_unique_words: corpusTextData.num_unique_words,
num_unique_lemmas: corpusTextData.num_unique_lemmas,
num_sentences: corpusTextData.num_sentences,
average_sentence_length: corpusTextData.average_sentence_length,
num_unique_ent_types: corpusTextData.num_unique_ent_types
title: corpusTextData.values.title,
publishing_year: corpusTextData.values.publishing_year,
num_tokens: corpusTextData.counts.token,
num_sentences: corpusTextData.counts.s,
num_unique_words: Object.entries(corpusTextData.freqs.word).length,
num_unique_lemmas: Object.entries(corpusTextData.freqs.lemma).length,
num_unique_pos: Object.entries(corpusTextData.freqs.pos).length,
num_unique_simple_pos: Object.entries(corpusTextData.freqs.simple_pos).length
};
}

View File

@ -35,44 +35,70 @@
<div class="col s12">
<h4><i class="material-icons left">query_stats</i>Visualizations</h4>
</div>
<div class="col s4" >
<div class="card hoverable">
<div class="card-content">
<span class="card-title">General information about the Corpus</span>
<p></p>
<br>
<ul class="corpus-general-info-list">
<li class="corpus-num-tokens"></li>
<br>
<li class="corpus-text-count"></li>
<br>
<li class="corpus-num-unique-words"></li>
<br>
<li class="corpus-num-unique-lemmas"></li>
<br>
<li class="corpus-num-sentences"></li>
<br>
<li class="corpus-average-sentence-length"></li>
<br>
<li class="corpus-num-ent-types"></li>
<br>
<li class="corpus-num-unique-ent-types"></li>
<br>
</ul>
</div>
<div class="row">
<div class="col s2">
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
<div class="card-content" style="padding:10px !important; text-align:center;">
<p>Number of tokens</p>
<span class="card-title corpus-num-tokens"></span>
</div>
</div>
</div>
<div class="col s8">
<div class="col s2">
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
<div class="card-content" style="padding:10px !important; text-align:center">
<p>Number of sentences</p>
<span class="card-title corpus-num-s"></span>
</div>
</div>
</div>
<div class="col s2">
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
<div class="card-content" style="padding:10px !important; text-align:center">
<p>Number of unique words</p>
<span class="card-title corpus-num-unique-words"></span>
</div>
</div>
</div>
<div class="col s2">
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
<div class="card-content" style="padding:10px !important; text-align:center">
<p>Number of unique lemmas</p>
<span class="card-title corpus-num-unique-lemmas"></span>
</div>
</div>
</div>
<div class="col s2">
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
<div class="card-content" style="padding:10px !important; text-align:center">
<p>Number of unique pos</p>
<span class="card-title corpus-num-unique-pos"></span>
</div>
</div>
</div>
<div class="col s2">
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
<div class="card-content" style="padding:10px !important; text-align:center">
<p>Number of unique simple_pos</p>
<span class="card-title corpus-num-unique-simple-pos"></span>
</div>
</div>
</div>
</div>
<div class="row">
<div class="col s12">
<div class="card hoverable">
<div class="card-content">
<span class="card-title">Text information</span>
<div class="chip text-count-chip" style="background-color:#6b3f89; color:white"></div>
<div class="corpus-text-info-list no-autoinit"></div>
</div>
</div>
</div>
</div>
<div class="row">
<div class="col s6">
<div class="col s3">
<div class="card hoverable">
<div class="card-content">
<span class="card-title">Text proportions within the corpus</span>
@ -80,7 +106,7 @@
</div>
</div>
</div>
<div class="col s6">
<div class="col s3">
<div class="card hoverable">
<div class="card-content">
<span class="card-title">Word frequencies</span>
@ -88,6 +114,14 @@
</div>
</div>
</div>
<div class="col s6">
<div class="card hoverable">
<div class="card-content">
<span class="card-title">Word distributions</span>
<div id="word-distributions-graphic"></div>
</div>
</div>
</div>
</div>
</div>