mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-08-02 16:55:18 +00:00
Sort Mechanics Text Info List
This commit is contained in:
@@ -138,8 +138,104 @@ class CQiCorpus {
|
||||
"publishing_year": 1950
|
||||
},
|
||||
{
|
||||
"num_tokens": 800, // number of tokens in the text
|
||||
"num_unique_words": 60, // number of unique words in the text
|
||||
"num_tokens": 15, // number of tokens in the text
|
||||
"num_unique_words": 4, // number of unique words in the text
|
||||
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
|
||||
"str": "int", // number of tokens with word "str"
|
||||
// ...
|
||||
},
|
||||
"num_unique_lemmas": 90, // number of unique lemmas in the text
|
||||
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
|
||||
"str": "int", // number of tokens with lemma "str"
|
||||
// ...
|
||||
},
|
||||
"num_sentences": 11, // number of sentences in the text
|
||||
"average_sentence_length": 3, // average number of tokens per sentence in the text
|
||||
"num_ent_types": 4, // number of ent_types in the text
|
||||
"num_unique_ent_types": 300, // number of unique ent_types in the text
|
||||
"num_entities_by_id": {
|
||||
"1": "int", // number of entities with id 1
|
||||
// ...
|
||||
},
|
||||
"author": "Author Name",
|
||||
"title": "Titel 1",
|
||||
"publishing_year": 1962
|
||||
},
|
||||
{
|
||||
"num_tokens": 11, // number of tokens in the text
|
||||
"num_unique_words": 12, // number of unique words in the text
|
||||
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
|
||||
"str": "int", // number of tokens with word "str"
|
||||
// ...
|
||||
},
|
||||
"num_unique_lemmas": 64, // number of unique lemmas in the text
|
||||
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
|
||||
"str": "int", // number of tokens with lemma "str"
|
||||
// ...
|
||||
},
|
||||
"num_sentences": 52, // number of sentences in the text
|
||||
"average_sentence_length": 3, // average number of tokens per sentence in the text
|
||||
"num_ent_types": 45, // number of ent_types in the text
|
||||
"num_unique_ent_types": 68, // number of unique ent_types in the text
|
||||
"num_entities_by_id": {
|
||||
"1": "int", // number of entities with id 1
|
||||
// ...
|
||||
},
|
||||
"author": "Author Name",
|
||||
"title": "Titel 2",
|
||||
"publishing_year": 1850
|
||||
},
|
||||
{
|
||||
"num_tokens": 56, // number of tokens in the text
|
||||
"num_unique_words": 13, // number of unique words in the text
|
||||
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
|
||||
"str": "int", // number of tokens with word "str"
|
||||
// ...
|
||||
},
|
||||
"num_unique_lemmas": 43, // number of unique lemmas in the text
|
||||
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
|
||||
"str": "int", // number of tokens with lemma "str"
|
||||
// ...
|
||||
},
|
||||
"num_sentences": 45, // number of sentences in the text
|
||||
"average_sentence_length": 56, // average number of tokens per sentence in the text
|
||||
"num_ent_types": 8792, // number of ent_types in the text
|
||||
"num_unique_ent_types": 56758, // number of unique ent_types in the text
|
||||
"num_entities_by_id": {
|
||||
"1": "int", // number of entities with id 1
|
||||
// ...
|
||||
},
|
||||
"author": "Author Name",
|
||||
"title": "Titel 3",
|
||||
"publishing_year": 1504
|
||||
},
|
||||
{
|
||||
"num_tokens": 54345, // number of tokens in the text
|
||||
"num_unique_words": 561, // number of unique words in the text
|
||||
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
|
||||
"str": "int", // number of tokens with word "str"
|
||||
// ...
|
||||
},
|
||||
"num_unique_lemmas": 546, // number of unique lemmas in the text
|
||||
"lemma_freqs": { // frequency of unique lemmas in the text (sorted by frequency)
|
||||
"str": "int", // number of tokens with lemma "str"
|
||||
// ...
|
||||
},
|
||||
"num_sentences": 5427, // number of sentences in the text
|
||||
"average_sentence_length": 657, // average number of tokens per sentence in the text
|
||||
"num_ent_types": 3465, // number of ent_types in the text
|
||||
"num_unique_ent_types": 45, // number of unique ent_types in the text
|
||||
"num_entities_by_id": {
|
||||
"1": "int", // number of entities with id 1
|
||||
// ...
|
||||
},
|
||||
"author": "Author Name",
|
||||
"title": "Titel 4",
|
||||
"publishing_year": 1712
|
||||
},
|
||||
{
|
||||
"num_tokens": 4354, // number of tokens in the text
|
||||
"num_unique_words": 45234, // number of unique words in the text
|
||||
"word_freqs": { // frequency of unique words in the text (sorted by frequency)
|
||||
"testwort": 50, // number of tokens with word "str"
|
||||
"testwort2": 1
|
||||
@@ -158,7 +254,7 @@ class CQiCorpus {
|
||||
// ...
|
||||
},
|
||||
"author": "Author Name 2",
|
||||
"title": "Titel 2",
|
||||
"title": "Titel 5",
|
||||
"publishing_year": 1951
|
||||
}
|
||||
]
|
||||
|
Reference in New Issue
Block a user