mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-01-18 05:50:34 +00:00
Compare commits
4 Commits
972f514e6b
...
11b697145b
Author | SHA1 | Date | |
---|---|---|---|
|
11b697145b | ||
|
11e1789d83 | ||
|
f037c31b88 | ||
|
91e68360ac |
@ -50,9 +50,52 @@ def cqi_corpora_corpus_update_db(cqi_client: cqi.CQiClient, corpus_name: str):
|
|||||||
@cqi_over_socketio
|
@cqi_over_socketio
|
||||||
def cqi_corpora_corpus_get_visualization_data(cqi_client: cqi.CQiClient, corpus_name: str):
|
def cqi_corpora_corpus_get_visualization_data(cqi_client: cqi.CQiClient, corpus_name: str):
|
||||||
corpus = cqi_client.corpora.get(corpus_name)
|
corpus = cqi_client.corpora.get(corpus_name)
|
||||||
|
# s_attrs = [x for x in corpus.structural_attributes.list() if not x.has_values]
|
||||||
|
# p_attrs = corpus.positional_attributes.list()
|
||||||
|
# payload = {
|
||||||
|
# 's_attrs': {},
|
||||||
|
# 'p_attrs': {},
|
||||||
|
# 'values': {
|
||||||
|
# 's_attrs': {},
|
||||||
|
# 'p_attrs': {}
|
||||||
|
# }
|
||||||
|
# }
|
||||||
|
# for s_attr in s_attrs:
|
||||||
|
# s_attr_lbound, s_attr_rbound = s_attr.cpos_by_id(text_id)
|
||||||
|
# s_attr_cpos_range = range(s_attr_lbound, s_attr_rbound + 1)
|
||||||
|
# payload['text']['lexicon'][text_id] = {
|
||||||
|
# 's_attrs': [s_attr_lbound, s_attr_rbound],
|
||||||
|
# 'counts': {
|
||||||
|
# 'token': s_attr_rbound - s_attr_lbound + 1
|
||||||
|
# },
|
||||||
|
# 'freqs': {
|
||||||
|
# p_attr.name: dict(Counter(p_attr.ids_by_cpos(list(s_attr_cpos_range))))
|
||||||
|
# for p_attr in p_attrs
|
||||||
|
# }
|
||||||
|
# }
|
||||||
|
# for p_attr in p_attrs:
|
||||||
|
# payload['p_attrs'] = dict(
|
||||||
|
|
||||||
|
# )
|
||||||
|
# payload['values']['p_attrs'] = dict(
|
||||||
|
# zip(
|
||||||
|
# range(0, p_attr.lexicon_size),
|
||||||
|
# p_attr.values_by_ids(list(range(0, p_attr.lexicon_size)))
|
||||||
|
# )
|
||||||
|
# )
|
||||||
text = corpus.structural_attributes.get('text')
|
text = corpus.structural_attributes.get('text')
|
||||||
|
text_value_names = []
|
||||||
|
text_values = []
|
||||||
|
for text_sub_attr in corpus.structural_attributes.list(filters={'part_of': text}):
|
||||||
|
text_value_names.append(text_sub_attr.name[(len(text.name) + 1):])
|
||||||
|
text_values.append(text_sub_attr.values_by_ids(list(range(0, text.size))))
|
||||||
s = corpus.structural_attributes.get('s')
|
s = corpus.structural_attributes.get('s')
|
||||||
ent = corpus.structural_attributes.get('ent')
|
ent = corpus.structural_attributes.get('ent')
|
||||||
|
ent_value_names = []
|
||||||
|
ent_values = []
|
||||||
|
for ent_sub_attr in corpus.structural_attributes.list(filters={'part_of': ent}):
|
||||||
|
ent_value_names.append(ent_sub_attr.name[(len(ent.name) + 1):])
|
||||||
|
ent_values.append(ent_sub_attr.values_by_ids(list(range(0, ent.size))))
|
||||||
word = corpus.positional_attributes.get('word')
|
word = corpus.positional_attributes.get('word')
|
||||||
lemma = corpus.positional_attributes.get('lemma')
|
lemma = corpus.positional_attributes.get('lemma')
|
||||||
pos = corpus.positional_attributes.get('pos')
|
pos = corpus.positional_attributes.get('pos')
|
||||||
@ -122,10 +165,7 @@ def cqi_corpora_corpus_get_visualization_data(cqi_client: cqi.CQiClient, corpus_
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
payload['text']['values'] = [
|
payload['text']['values'] = text_value_names
|
||||||
sub_attr.name[(len(text.name) + 1):]
|
|
||||||
for sub_attr in corpus.structural_attributes.list(filters={'part_of': text})
|
|
||||||
]
|
|
||||||
payload['s'] = {'lexicon': {}, 'values': None}
|
payload['s'] = {'lexicon': {}, 'values': None}
|
||||||
for s_id in range(0, s.size):
|
for s_id in range(0, s.size):
|
||||||
payload['s']['lexicon'][s_id] = {
|
payload['s']['lexicon'][s_id] = {
|
||||||
@ -140,15 +180,22 @@ def cqi_corpora_corpus_get_visualization_data(cqi_client: cqi.CQiClient, corpus_
|
|||||||
payload['ent']['lexicon'][ent_id] = {
|
payload['ent']['lexicon'][ent_id] = {
|
||||||
# 'bounds': ent.cpos_by_id(ent_id)
|
# 'bounds': ent.cpos_by_id(ent_id)
|
||||||
}
|
}
|
||||||
payload['ent']['values'] = [
|
payload['ent']['values'] = ent_value_names
|
||||||
sub_attr.name[(len(ent.name) + 1):]
|
|
||||||
for sub_attr in corpus.structural_attributes.list(filters={'part_of': ent})
|
|
||||||
]
|
|
||||||
payload['lookups'] = {
|
payload['lookups'] = {
|
||||||
'corpus': {},
|
'corpus': {},
|
||||||
'text': {},
|
'text': {
|
||||||
|
text_id: {
|
||||||
|
text_value_name: text_values[text_value_name_idx][text_id_idx]
|
||||||
|
for text_value_name_idx, text_value_name in enumerate(text_value_names)
|
||||||
|
} for text_id_idx, text_id in enumerate(range(0, text.size))
|
||||||
|
},
|
||||||
's': {},
|
's': {},
|
||||||
'ent': {},
|
'ent': {
|
||||||
|
ent_id: {
|
||||||
|
ent_value_name: ent_values[ent_value_name_idx][ent_id_idx]
|
||||||
|
for ent_value_name_idx, ent_value_name in enumerate(ent_value_names)
|
||||||
|
} for ent_id_idx, ent_id in enumerate(range(0, ent.size))
|
||||||
|
},
|
||||||
'word': dict(
|
'word': dict(
|
||||||
zip(
|
zip(
|
||||||
range(0, word.lexicon_size),
|
range(0, word.lexicon_size),
|
||||||
|
@ -115,164 +115,188 @@ class CQiCorpus {
|
|||||||
getCorpusData() {
|
getCorpusData() {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
const dummyData = {
|
const dummyData = {
|
||||||
"num_tokens": 2000, // number of tokens in the corpus
|
"corpus": {
|
||||||
"num_unique_words": 500, // number of unique words in the corpus
|
"bounds": [1, 689],
|
||||||
"num_unique_lemmas": 200, // number of unique lemmas in the corpus
|
"counts": {
|
||||||
"num_sentences": 90, // number of sentences in the corpus
|
"token": 743,
|
||||||
"average_sentence_length": 11, // average number of tokens per sentence in the corpus
|
"ent": 321,
|
||||||
"num_ent_types": 30, // number of entities in the corpus
|
"s": 234
|
||||||
"num_unique_ent_types":10,
|
},
|
||||||
"ent_type_freqs": {
|
"freqs": {
|
||||||
"str": 10, // number of ent_types with ent_type "str"
|
"word": {
|
||||||
// ...
|
"1": 876,
|
||||||
},
|
"2": 234,
|
||||||
"texts": [
|
"3": 657
|
||||||
{
|
|
||||||
"num_tokens": 11, // number of tokens in the text
|
|
||||||
"num_unique_words": 12, // number of unique words in the text
|
|
||||||
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
|
|
||||||
"str": "int", // number of tokens with word "str"
|
|
||||||
// ...
|
|
||||||
},
|
},
|
||||||
"num_unique_lemmas": 15, // number of unique lemmas in the text
|
"lemma": {
|
||||||
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
|
"1": 543,
|
||||||
"str": "int", // number of tokens with lemma "str"
|
"2": 876,
|
||||||
// ...
|
"3": 321
|
||||||
},
|
},
|
||||||
"num_sentences": 4, // number of sentences in the text
|
"pos": {
|
||||||
"average_sentence_length": 3, // average number of tokens per sentence in the text
|
"1": 456,
|
||||||
"num_ent_types": 12, // number of ent_types in the text
|
"2": 789,
|
||||||
"num_unique_ent_types": 28, // number of unique ent_types in the text
|
"3": 234
|
||||||
"num_entities_by_id": {
|
},
|
||||||
"1": "int", // number of entities with id 1
|
"simple_pos": {
|
||||||
// ...
|
"1": 987,
|
||||||
},
|
"2": 876,
|
||||||
"author": "Author Name",
|
"3": 543
|
||||||
"title": "Titel",
|
},
|
||||||
"publishing_year": 1950
|
"ent": {
|
||||||
},
|
"1": 654,
|
||||||
{
|
"2": 321,
|
||||||
"num_tokens": 15, // number of tokens in the text
|
"3": 987
|
||||||
"num_unique_words": 4, // number of unique words in the text
|
}
|
||||||
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
|
|
||||||
"str": "int", // number of tokens with word "str"
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"num_unique_lemmas": 90, // number of unique lemmas in the text
|
|
||||||
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
|
|
||||||
"str": "int", // number of tokens with lemma "str"
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"num_sentences": 11, // number of sentences in the text
|
|
||||||
"average_sentence_length": 3, // average number of tokens per sentence in the text
|
|
||||||
"num_ent_types": 4, // number of ent_types in the text
|
|
||||||
"num_unique_ent_types": 300, // number of unique ent_types in the text
|
|
||||||
"num_entities_by_id": {
|
|
||||||
"1": "int", // number of entities with id 1
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"author": "Author Name",
|
|
||||||
"title": "Titel 1",
|
|
||||||
"publishing_year": 1962
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"num_tokens": 11, // number of tokens in the text
|
|
||||||
"num_unique_words": 12, // number of unique words in the text
|
|
||||||
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
|
|
||||||
"str": "int", // number of tokens with word "str"
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"num_unique_lemmas": 64, // number of unique lemmas in the text
|
|
||||||
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
|
|
||||||
"str": "int", // number of tokens with lemma "str"
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"num_sentences": 52, // number of sentences in the text
|
|
||||||
"average_sentence_length": 3, // average number of tokens per sentence in the text
|
|
||||||
"num_ent_types": 45, // number of ent_types in the text
|
|
||||||
"num_unique_ent_types": 68, // number of unique ent_types in the text
|
|
||||||
"num_entities_by_id": {
|
|
||||||
"1": "int", // number of entities with id 1
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"author": "Author Name",
|
|
||||||
"title": "Titel 2",
|
|
||||||
"publishing_year": 1850
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"num_tokens": 56, // number of tokens in the text
|
|
||||||
"num_unique_words": 13, // number of unique words in the text
|
|
||||||
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
|
|
||||||
"str": "int", // number of tokens with word "str"
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"num_unique_lemmas": 43, // number of unique lemmas in the text
|
|
||||||
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
|
|
||||||
"str": "int", // number of tokens with lemma "str"
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"num_sentences": 45, // number of sentences in the text
|
|
||||||
"average_sentence_length": 56, // average number of tokens per sentence in the text
|
|
||||||
"num_ent_types": 8792, // number of ent_types in the text
|
|
||||||
"num_unique_ent_types": 56758, // number of unique ent_types in the text
|
|
||||||
"num_entities_by_id": {
|
|
||||||
"1": "int", // number of entities with id 1
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"author": "Author Name",
|
|
||||||
"title": "Titel 3",
|
|
||||||
"publishing_year": 1504
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"num_tokens": 54345, // number of tokens in the text
|
|
||||||
"num_unique_words": 561, // number of unique words in the text
|
|
||||||
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
|
|
||||||
"str": "int", // number of tokens with word "str"
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"num_unique_lemmas": 546, // number of unique lemmas in the text
|
|
||||||
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
|
|
||||||
"str": "int", // number of tokens with lemma "str"
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"num_sentences": 5427, // number of sentences in the text
|
|
||||||
"average_sentence_length": 657, // average number of tokens per sentence in the text
|
|
||||||
"num_ent_types": 3465, // number of ent_types in the text
|
|
||||||
"num_unique_ent_types": 45, // number of unique ent_types in the text
|
|
||||||
"num_entities_by_id": {
|
|
||||||
"1": "int", // number of entities with id 1
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"author": "Author Name",
|
|
||||||
"title": "Titel 4",
|
|
||||||
"publishing_year": 1712
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"num_tokens": 4354, // number of tokens in the text
|
|
||||||
"num_unique_words": 45234, // number of unique words in the text
|
|
||||||
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
|
|
||||||
"testwort": 50, // number of tokens with word "str"
|
|
||||||
"testwort2": 1
|
|
||||||
},
|
|
||||||
"num_unique_lemmas": 15, // number of unique lemmas in the text
|
|
||||||
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
|
|
||||||
"testlemma": 11, // number of tokens with lemma "str"
|
|
||||||
"testlemma2": 1
|
|
||||||
},
|
|
||||||
"num_sentences": 90, // number of sentences in the text
|
|
||||||
"average_sentence_length": 7, // average number of tokens per sentence in the text
|
|
||||||
"num_ent_types": 19,
|
|
||||||
"num_unique_ent_types": 5, // number of unique ent_types in the text
|
|
||||||
"num_entities_by_id": {
|
|
||||||
"1": "int", // number of entities with id 1
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"author": "Author Name 2",
|
|
||||||
"title": "Titel 5",
|
|
||||||
"publishing_year": 1951
|
|
||||||
}
|
}
|
||||||
]
|
},
|
||||||
};
|
"text": {
|
||||||
|
"1": {
|
||||||
|
"bounds": [0, 435],
|
||||||
|
"counts": {
|
||||||
|
"token": 345,
|
||||||
|
"ent_type": 123,
|
||||||
|
"s": 89
|
||||||
|
},
|
||||||
|
"freqs": {
|
||||||
|
"word": {
|
||||||
|
"1": 25,
|
||||||
|
"2": 90,
|
||||||
|
"3": 200
|
||||||
|
},
|
||||||
|
"lemma": {
|
||||||
|
"1": 654,
|
||||||
|
"2": 321,
|
||||||
|
"3": 987
|
||||||
|
},
|
||||||
|
"pos": {
|
||||||
|
"1": 543,
|
||||||
|
"2": 876,
|
||||||
|
"3": 234
|
||||||
|
},
|
||||||
|
"simple_pos": {
|
||||||
|
"1": 987,
|
||||||
|
"2": 654,
|
||||||
|
"3": 321
|
||||||
|
},
|
||||||
|
"ent_type": {
|
||||||
|
"1": 234,
|
||||||
|
"2": 789,
|
||||||
|
"3": 543
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"values": {
|
||||||
|
"author": 1,
|
||||||
|
"publishing_year":1950,
|
||||||
|
"title": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"bounds": [435, 689],
|
||||||
|
"counts": {
|
||||||
|
"token": 389,
|
||||||
|
"ent_type": 198,
|
||||||
|
"s": 145
|
||||||
|
},
|
||||||
|
"freqs": {
|
||||||
|
"word": {
|
||||||
|
"1": 60,
|
||||||
|
"2": 70,
|
||||||
|
"3": 100
|
||||||
|
},
|
||||||
|
"lemma": {
|
||||||
|
"1": 654,
|
||||||
|
"2": 321,
|
||||||
|
"3": 987
|
||||||
|
},
|
||||||
|
"pos": {
|
||||||
|
"1": 543,
|
||||||
|
"2": 876,
|
||||||
|
"3": 234
|
||||||
|
},
|
||||||
|
"simple_pos": {
|
||||||
|
"1": 987,
|
||||||
|
"2": 654,
|
||||||
|
"3": 321
|
||||||
|
},
|
||||||
|
"ent_type": {
|
||||||
|
"1": 234,
|
||||||
|
"2": 789,
|
||||||
|
"3": 543
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"values": {
|
||||||
|
"author": 2,
|
||||||
|
"publishing_year":1951,
|
||||||
|
"title": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"s": {
|
||||||
|
"1": {
|
||||||
|
"bounds": [345, 678]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"ent": {
|
||||||
|
"1": {
|
||||||
|
"bounds": [567, 890],
|
||||||
|
"values": {
|
||||||
|
"type": 789
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"token": {
|
||||||
|
"310": {
|
||||||
|
"values": {
|
||||||
|
"word": 1,
|
||||||
|
"lemma": 2,
|
||||||
|
"pos": 1,
|
||||||
|
"simple_pos": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"value_lookups": {
|
||||||
|
"text": {
|
||||||
|
"author": {
|
||||||
|
"1": "John Doe",
|
||||||
|
"2": "Jane Smith"
|
||||||
|
},
|
||||||
|
"title": {
|
||||||
|
"1": "Test Title 1",
|
||||||
|
"2": "Test Title 2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"ent": {
|
||||||
|
"type": {
|
||||||
|
"1": "Person",
|
||||||
|
"2": "Organization"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"token": {
|
||||||
|
"word": {
|
||||||
|
"1": "apple",
|
||||||
|
"2": "banana",
|
||||||
|
"3": "orange"
|
||||||
|
},
|
||||||
|
"lemma": {
|
||||||
|
"1": "run",
|
||||||
|
"2": "walk",
|
||||||
|
"3": "jump"
|
||||||
|
},
|
||||||
|
"pos": {
|
||||||
|
"1": "noun",
|
||||||
|
"2": "verb",
|
||||||
|
"3": "adjective"
|
||||||
|
},
|
||||||
|
"simple_pos": {
|
||||||
|
"1": "subject",
|
||||||
|
"2": "object",
|
||||||
|
"3": "predicate"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
resolve(dummyData);
|
resolve(dummyData);
|
||||||
/*
|
/*
|
||||||
|
@ -34,6 +34,7 @@ class CorpusAnalysisApp {
|
|||||||
.then(
|
.then(
|
||||||
cQiCorpus => {
|
cQiCorpus => {
|
||||||
this.data.corpus = {o: cQiCorpus};
|
this.data.corpus = {o: cQiCorpus};
|
||||||
|
this.data.corpus.o.getVisualizationData().then(data => console.log(data));
|
||||||
// this.data.corpus.o.getVisualizationData()
|
// this.data.corpus.o.getVisualizationData()
|
||||||
// .then(
|
// .then(
|
||||||
// (visualizationData) => {
|
// (visualizationData) => {
|
||||||
@ -48,6 +49,8 @@ class CorpusAnalysisApp {
|
|||||||
this.renderGeneralCorpusInfo(corpusData);
|
this.renderGeneralCorpusInfo(corpusData);
|
||||||
this.renderTextInfoList(corpusData);
|
this.renderTextInfoList(corpusData);
|
||||||
this.renderTextProportionsGraphic(corpusData);
|
this.renderTextProportionsGraphic(corpusData);
|
||||||
|
this.renderWordFrequenciesGraphic(corpusData);
|
||||||
|
this.renderWordDistributionsGraphic(corpusData);
|
||||||
});
|
});
|
||||||
// TODO: Don't do this hgere
|
// TODO: Don't do this hgere
|
||||||
cQiCorpus.updateDb();
|
cQiCorpus.updateDb();
|
||||||
@ -112,38 +115,85 @@ class CorpusAnalysisApp {
|
|||||||
}
|
}
|
||||||
|
|
||||||
renderGeneralCorpusInfo(corpusData) {
|
renderGeneralCorpusInfo(corpusData) {
|
||||||
let corpusGeneralInfoListElement = document.querySelector('.corpus-general-info-list');
|
document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token;
|
||||||
corpusGeneralInfoListElement.querySelector('.corpus-num-tokens').innerHTML = `<b>Number of tokens:</b> ${this.data.corpus.o.size}`;
|
document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s;
|
||||||
corpusGeneralInfoListElement.querySelector('.corpus-text-count').innerHTML = `<b>Corpus text count:</b> ${corpusData.texts.length}`;
|
// corpusGeneralInfoListElement.querySelector('.corpus-text-count').innerHTML = <b>Corpus text count:</b> ${Object.entries(corpusData.text).length;
|
||||||
corpusGeneralInfoListElement.querySelector('.corpus-num-unique-words').innerHTML = `<b>Corpus unique word count:</b> ${corpusData.num_unique_words}`;
|
document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length;
|
||||||
corpusGeneralInfoListElement.querySelector('.corpus-num-unique-lemmas').innerHTML = `<b>Corpus unique lemma count:</b> ${corpusData.num_unique_lemmas}`;
|
document.querySelector('.corpus-num-unique-lemmas').innerHTML = Object.entries(corpusData.corpus.freqs.lemma).length;
|
||||||
// corpusGeneralInfoListElement.querySelector('.corpus-most-frequent-words').innerHTML = `<b>Corpus most frequent words:</b> ${corpusData.most_frequent_words.join(', ');
|
document.querySelector('.corpus-num-unique-pos').innerHTML = Object.entries(corpusData.corpus.freqs.pos).length;
|
||||||
corpusGeneralInfoListElement.querySelector('.corpus-num-sentences').innerHTML = `<b>Corpus sentence count:</b> ${corpusData.num_sentences}`;
|
document.querySelector('.corpus-num-unique-simple-pos').innerHTML = Object.entries(corpusData.corpus.freqs.simple_pos).length;
|
||||||
corpusGeneralInfoListElement.querySelector('.corpus-average-sentence-length').innerHTML = `<b>Corpus average sentence length:</b> ${corpusData.average_sentence_length}`;
|
|
||||||
corpusGeneralInfoListElement.querySelector('.corpus-num-ent-types').innerHTML = `<b>Corpus entity count:</b> ${corpusData.num_ent_types}`;
|
|
||||||
corpusGeneralInfoListElement.querySelector('.corpus-num-unique-ent-types').innerHTML = `<b>Corpus unique entity count:</b> ${corpusData.num_unique_ent_types}`;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
renderTextInfoList(corpusData) {
|
renderTextInfoList(corpusData) {
|
||||||
let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list');
|
// let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list');
|
||||||
let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement);
|
// let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement);
|
||||||
corpusTextInfoList.add(corpusData.texts);
|
// for (let text of Object.values(corpusData.text)) {
|
||||||
|
// text.values.title = corpusData.value_lookups.text.title[text.values.title];
|
||||||
|
// }
|
||||||
|
// corpusTextInfoList.add(Object.values(corpusData.text));
|
||||||
|
|
||||||
|
// let textCountChipElement = document.querySelector('.text-count-chip');
|
||||||
|
// textCountChipElement.innerHTML = `Text count: ${Object.values(corpusData.text).length}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
renderTextProportionsGraphic(corpusData) {
|
renderTextProportionsGraphic(corpusData) {
|
||||||
let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic');
|
// let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic');
|
||||||
let graphData = [
|
// let texts = Object.values(corpusData.text);
|
||||||
{
|
// let graphData = [
|
||||||
values: corpusData.texts.map(text => text.num_tokens),
|
// {
|
||||||
labels: corpusData.texts.map(text => `${text.title} (${text.publishing_year})`),
|
// values: texts.map(text => text.counts.token),
|
||||||
type: 'pie'
|
// labels: texts.map(text => `${text.values.title} (${text.values.publishing_year})`),
|
||||||
}
|
// type: 'pie'
|
||||||
];
|
// }
|
||||||
let graphLayout = {
|
// ];
|
||||||
height: 400,
|
// let graphLayout = {
|
||||||
width: 500
|
// height: 400,
|
||||||
};
|
// width: 500
|
||||||
Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout);
|
// };
|
||||||
|
// Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout);
|
||||||
|
}
|
||||||
|
|
||||||
|
renderWordFrequenciesGraphic(corpusData) {
|
||||||
|
// let wordFrequenciesGraphicElement = document.querySelector('#word-frequencies-graphic');
|
||||||
|
// let words = Object.entries(corpusData.value_lookups.token.word);
|
||||||
|
// let texts = Object.values(corpusData.text);
|
||||||
|
// let graphData = [];
|
||||||
|
// for (let word of words) {
|
||||||
|
// let data = {
|
||||||
|
// x: texts.map(text => `${text.values.title} (${text.values.publishing_year})`),
|
||||||
|
// y: texts.map(text => text.freqs.word[word[0]]),
|
||||||
|
// name: word[1],
|
||||||
|
// type: 'bar'
|
||||||
|
// };
|
||||||
|
// graphData.push(data);
|
||||||
|
// }
|
||||||
|
|
||||||
|
// let graphLayout = {
|
||||||
|
// height: 400,
|
||||||
|
// width: 500,
|
||||||
|
// barmode: 'stack',
|
||||||
|
// type: 'bar'
|
||||||
|
// };
|
||||||
|
// Plotly.newPlot(wordFrequenciesGraphicElement, graphData, graphLayout);
|
||||||
|
}
|
||||||
|
|
||||||
|
renderWordDistributionsGraphic(corpusData) {
|
||||||
|
// let wordDistributionGraphicElement = document.querySelector('#word-distributions-graphic');
|
||||||
|
// var trace1 = {
|
||||||
|
// x: [1, 2, 3, 4],
|
||||||
|
// y: [10, 11, 12, 13],
|
||||||
|
// mode: 'markers',
|
||||||
|
// marker: {
|
||||||
|
// size: [40, 60, 80, 100]
|
||||||
|
// }
|
||||||
|
// };
|
||||||
|
// var data = [trace1];
|
||||||
|
// var layout = {
|
||||||
|
// title: 'Marker Size',
|
||||||
|
// showlegend: false,
|
||||||
|
// height: 600,
|
||||||
|
// width: 600
|
||||||
|
// };
|
||||||
|
// Plotly.newPlot(wordDistributionGraphicElement, data, layout);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -29,11 +29,11 @@ class CorpusTextInfoList extends ResourceList {
|
|||||||
<tr class="list-item clickable hoverable">
|
<tr class="list-item clickable hoverable">
|
||||||
<td><span class="title"></span> (<span class="publishing_year"></span>)</td>
|
<td><span class="title"></span> (<span class="publishing_year"></span>)</td>
|
||||||
<td><span class="num_tokens"></span></td>
|
<td><span class="num_tokens"></span></td>
|
||||||
|
<td><span class="num_sentences"></span></td>
|
||||||
<td><span class="num_unique_words"></span></td>
|
<td><span class="num_unique_words"></span></td>
|
||||||
<td><span class="num_unique_lemmas"></span></td>
|
<td><span class="num_unique_lemmas"></span></td>
|
||||||
<td><span class="num_sentences"></span></td>
|
<td><span class="num_unique_pos"></span></td>
|
||||||
<td><span class="average_sentence_length"></span></td>
|
<td><span class="num_unique_simple_pos"></span></td>
|
||||||
<td><span class="num_unique_ent_types"></span></td>
|
|
||||||
</tr>
|
</tr>
|
||||||
`.trim();
|
`.trim();
|
||||||
}
|
}
|
||||||
@ -44,11 +44,11 @@ class CorpusTextInfoList extends ResourceList {
|
|||||||
'title',
|
'title',
|
||||||
'publishing_year',
|
'publishing_year',
|
||||||
'num_tokens',
|
'num_tokens',
|
||||||
|
'num_sentences',
|
||||||
'num_unique_words',
|
'num_unique_words',
|
||||||
'num_unique_lemmas',
|
'num_unique_lemmas',
|
||||||
'num_sentences',
|
'num_unique_pos',
|
||||||
'average_sentence_length',
|
'num_unique_simple_pos'
|
||||||
'num_unique_ent_types'
|
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -68,11 +68,11 @@ class CorpusTextInfoList extends ResourceList {
|
|||||||
<tr>
|
<tr>
|
||||||
<th>Text<span class="sort right material-icons" data-sort="title" style="cursor:pointer; color:#aa9cc9">arrow_drop_down</span></th>
|
<th>Text<span class="sort right material-icons" data-sort="title" style="cursor:pointer; color:#aa9cc9">arrow_drop_down</span></th>
|
||||||
<th>Number of tokens<span class="sort right material-icons" data-sort="num_tokens" style="cursor:pointer">arrow_drop_down</span></th>
|
<th>Number of tokens<span class="sort right material-icons" data-sort="num_tokens" style="cursor:pointer">arrow_drop_down</span></th>
|
||||||
|
<th>Number of sentences<span class="sort right material-icons" data-sort="num_sentences" style="cursor:pointer">arrow_drop_down</span></th>
|
||||||
<th>Number of unique words<span class="sort right material-icons" data-sort="num_unique_words" style="cursor:pointer">arrow_drop_down</span></th>
|
<th>Number of unique words<span class="sort right material-icons" data-sort="num_unique_words" style="cursor:pointer">arrow_drop_down</span></th>
|
||||||
<th>Number of unique lemmas<span class="sort right material-icons" data-sort="num_unique_lemmas" style="cursor:pointer">arrow_drop_down</span></th>
|
<th>Number of unique lemmas<span class="sort right material-icons" data-sort="num_unique_lemmas" style="cursor:pointer">arrow_drop_down</span></th>
|
||||||
<th>Number of sentences<span class="sort right material-icons" data-sort="num_sentences" style="cursor:pointer">arrow_drop_down</span></th>
|
<th>Number of unique pos<span class="sort right material-icons" data-sort="num_unique_pos" style="cursor:pointer">arrow_drop_down</span></th>
|
||||||
<th>Average sentence length<span class="sort right material-icons" data-sort="average_sentence_length" style="cursor:pointer">arrow_drop_down</span></th>
|
<th>Number of unique simple pos<span class="sort right material-icons" data-sort="num_unique_simple_pos" style="cursor:pointer">arrow_drop_down</span></th>
|
||||||
<th>Number of unique entity types<span class="sort right material-icons" data-sort="num_unique_ent_types" style="cursor:pointer">arrow_drop_down</span></th>
|
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody class="list"></tbody>
|
<tbody class="list"></tbody>
|
||||||
@ -83,14 +83,14 @@ class CorpusTextInfoList extends ResourceList {
|
|||||||
|
|
||||||
mapResourceToValue(corpusTextData) {
|
mapResourceToValue(corpusTextData) {
|
||||||
return {
|
return {
|
||||||
title: corpusTextData.title,
|
title: corpusTextData.values.title,
|
||||||
publishing_year: corpusTextData.publishing_year,
|
publishing_year: corpusTextData.values.publishing_year,
|
||||||
num_tokens: corpusTextData.num_tokens,
|
num_tokens: corpusTextData.counts.token,
|
||||||
num_unique_words: corpusTextData.num_unique_words,
|
num_sentences: corpusTextData.counts.s,
|
||||||
num_unique_lemmas: corpusTextData.num_unique_lemmas,
|
num_unique_words: Object.entries(corpusTextData.freqs.word).length,
|
||||||
num_sentences: corpusTextData.num_sentences,
|
num_unique_lemmas: Object.entries(corpusTextData.freqs.lemma).length,
|
||||||
average_sentence_length: corpusTextData.average_sentence_length,
|
num_unique_pos: Object.entries(corpusTextData.freqs.pos).length,
|
||||||
num_unique_ent_types: corpusTextData.num_unique_ent_types
|
num_unique_simple_pos: Object.entries(corpusTextData.freqs.simple_pos).length
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,69 +19,112 @@
|
|||||||
<div class="row" id="corpus-analysis-app-overview">
|
<div class="row" id="corpus-analysis-app-overview">
|
||||||
<div class="col s12">
|
<div class="col s12">
|
||||||
<h1>{{ title }}</h1>
|
<h1>{{ title }}</h1>
|
||||||
|
|
||||||
|
{% for extension in extensions %}
|
||||||
|
<div class="col s3">
|
||||||
|
<div class="card extension-selector hoverable" data-target="{{ extension.id_prefix }}-container">
|
||||||
|
<div class="card-content">
|
||||||
|
<span class="card-title">{{ extension.name }}</span>
|
||||||
|
<p>{{ extension.description }}</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
<div class="row">
|
||||||
|
<div class="col s12">
|
||||||
|
<h4><i class="material-icons left">query_stats</i>Visualizations</h4>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="row">
|
||||||
|
<div class="col s2">
|
||||||
|
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
|
||||||
|
<div class="card-content" style="padding:10px !important; text-align:center;">
|
||||||
|
<p>Number of tokens</p>
|
||||||
|
<span class="card-title corpus-num-tokens"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col s2">
|
||||||
|
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
|
||||||
|
<div class="card-content" style="padding:10px !important; text-align:center">
|
||||||
|
<p>Number of sentences</p>
|
||||||
|
<span class="card-title corpus-num-s"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col s2">
|
||||||
|
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
|
||||||
|
<div class="card-content" style="padding:10px !important; text-align:center">
|
||||||
|
<p>Number of unique words</p>
|
||||||
|
<span class="card-title corpus-num-unique-words"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col s2">
|
||||||
|
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
|
||||||
|
<div class="card-content" style="padding:10px !important; text-align:center">
|
||||||
|
<p>Number of unique lemmas</p>
|
||||||
|
<span class="card-title corpus-num-unique-lemmas"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col s2">
|
||||||
|
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
|
||||||
|
<div class="card-content" style="padding:10px !important; text-align:center">
|
||||||
|
<p>Number of unique pos</p>
|
||||||
|
<span class="card-title corpus-num-unique-pos"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col s2">
|
||||||
|
<div class="card hoverable" style="border-radius: 10px !important; background-color:#6b3f89; color:white">
|
||||||
|
<div class="card-content" style="padding:10px !important; text-align:center">
|
||||||
|
<p>Number of unique simple_pos</p>
|
||||||
|
<span class="card-title corpus-num-unique-simple-pos"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="row">
|
||||||
|
<div class="col s12">
|
||||||
|
<div class="card hoverable">
|
||||||
|
<div class="card-content">
|
||||||
|
<span class="card-title">Text information</span>
|
||||||
|
<div class="chip text-count-chip" style="background-color:#6b3f89; color:white""></div>
|
||||||
|
<div class="corpus-text-info-list no-autoinit"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="row">
|
||||||
|
<div class="col s3">
|
||||||
|
<div class="card hoverable">
|
||||||
|
<div class="card-content">
|
||||||
|
<span class="card-title">Text proportions within the corpus</span>
|
||||||
|
<div id="text-proportions-graphic"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col s3">
|
||||||
|
<div class="card hoverable">
|
||||||
|
<div class="card-content">
|
||||||
|
<span class="card-title">Word frequencies</span>
|
||||||
|
<div id="word-frequencies-graphic"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col s6">
|
||||||
|
<div class="card hoverable">
|
||||||
|
<div class="card-content">
|
||||||
|
<span class="card-title">Word distributions</span>
|
||||||
|
<div id="word-distributions-graphic"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{% for extension in extensions %}
|
|
||||||
<div class="col s3">
|
|
||||||
<div class="card extension-selector hoverable" data-target="{{ extension.id_prefix }}-container">
|
|
||||||
<div class="card-content">
|
|
||||||
<span class="card-title">{{ extension.name }}</span>
|
|
||||||
<p>{{ extension.description }}</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
{% endfor %}
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="row">
|
|
||||||
<div class="col s12">
|
|
||||||
<h4><i class="material-icons left">query_stats</i>Visualizations</h4>
|
|
||||||
</div>
|
|
||||||
<div class="col s4" >
|
|
||||||
<div class="card hoverable">
|
|
||||||
<div class="card-content">
|
|
||||||
<span class="card-title">General information about the Corpus</span>
|
|
||||||
<p></p>
|
|
||||||
<br>
|
|
||||||
<ul class="corpus-general-info-list">
|
|
||||||
<li class="corpus-num-tokens"></li>
|
|
||||||
<br>
|
|
||||||
<li class="corpus-text-count"></li>
|
|
||||||
<br>
|
|
||||||
<li class="corpus-num-unique-words"></li>
|
|
||||||
<br>
|
|
||||||
<li class="corpus-num-unique-lemmas"></li>
|
|
||||||
<br>
|
|
||||||
<li class="corpus-num-sentences"></li>
|
|
||||||
<br>
|
|
||||||
<li class="corpus-average-sentence-length"></li>
|
|
||||||
<br>
|
|
||||||
<li class="corpus-num-ent-types"></li>
|
|
||||||
<br>
|
|
||||||
<li class="corpus-num-unique-ent-types"></li>
|
|
||||||
<br>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
<div class="col s8">
|
|
||||||
<div class="card hoverable">
|
|
||||||
<div class="card-content">
|
|
||||||
<span class="card-title">Text information</span>
|
|
||||||
<div class="corpus-text-info-list no-autoinit"></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
<div class="row">
|
|
||||||
<div class="col 12">
|
|
||||||
<div class="card hoverable">
|
|
||||||
<div class="card-content">
|
|
||||||
<span class="card-title">Text proportions within the corpus</span>
|
|
||||||
<div id="text-proportions-graphic"></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user