Update README.md
This commit is contained in:
parent
000fc353d7
commit
626aa2b03f
60
README.md
60
README.md
@ -13,14 +13,14 @@ Structure:
|
|||||||
```
|
```
|
||||||
.
|
.
|
||||||
├── inputs
|
├── inputs
|
||||||
│ ├── backup_raw_xml # Zip files of all original protocols.
|
│ ├── backup_raw_xml ### Zip files of all original protocols.
|
||||||
│ ├── current_official_protocols_xml # Example file of the new official markup.
|
│ ├── current_official_protocols_xml ### Example file of the new official markup.
|
||||||
│ ├── development_data_xml # Set of original xml protocols used for development.
|
│ ├── development_data_xml ### Set of original xml protocols used for development.
|
||||||
│ ├── faulty_raw_xml # All original protocols with errors. The Bundesregierung should have fixed those by now. The software mentioned above used these faulty ones, though, because the new ones were not available back then.
|
│ ├── faulty_raw_xml ### All original protocols with errors. The Bundesregierung should have fixed those by now. The software mentioned above used these faulty ones, though, because the new ones were not available back then.
|
||||||
│ │ ├── 15_Wahlperiode_2002-2005
|
│ │ ├── 15_Wahlperiode_2002-2005
|
||||||
│ │ ├── 16_Wahlperiode_2005-2009
|
│ │ ├── 16_Wahlperiode_2005-2009
|
||||||
│ │ └── 17_Wahlperiode_2009-2013
|
│ │ └── 17_Wahlperiode_2009-2013
|
||||||
│ ├── protocols_raw_xml # Unzipped original protocols.
|
│ ├── protocols_raw_xml ### Unzipped original protocols.
|
||||||
│ │ ├── 01_Wahlperiode_1949-1953
|
│ │ ├── 01_Wahlperiode_1949-1953
|
||||||
│ │ ├── 02_Wahlperiode_1953-1957
|
│ │ ├── 02_Wahlperiode_1953-1957
|
||||||
│ │ ├── 03_Wahlperiode_1957-1961
|
│ │ ├── 03_Wahlperiode_1957-1961
|
||||||
@ -36,17 +36,17 @@ Structure:
|
|||||||
│ │ ├── 13_Wahlperiode_1994-1998
|
│ │ ├── 13_Wahlperiode_1994-1998
|
||||||
│ │ ├── 14_Wahlperiode_1998-2002
|
│ │ ├── 14_Wahlperiode_1998-2002
|
||||||
│ │ └── 18_Wahlperiode_2013-2017
|
│ │ └── 18_Wahlperiode_2013-2017
|
||||||
│ └── test_data_xml # Set of original protocols for testing purposes.
|
│ └── test_data_xml ### Set of original protocols for testing purposes.
|
||||||
├── MdB_data # The official Stammdaten of every MdB can be found here.
|
├── MdB_data ### The official Stammdaten of every MdB can be found here.
|
||||||
├── outputs # These are the files and data produced using the software from https://gitlab.ub.uni-bielefeld.de/sporada/bundesdata_markup_nlp_software
|
├── outputs ### These are the files and data produced using the software from https://gitlab.ub.uni-bielefeld.de/sporada/bundesdata_markup_nlp_software
|
||||||
│ ├── markup # Contains all automatically marked protocols.
|
│ ├── markup ### Contains all automatically marked protocols.
|
||||||
│ │ ├── dev_data # Automatically marked dev_data protocols.
|
│ │ ├── dev_data ### Automatically marked dev_data protocols.
|
||||||
│ │ │ ├── beautiful_xml # Final output: human-readable automatically marked protocols.
|
│ │ │ ├── beautiful_xml ### Final output: human-readable automatically marked protocols.
|
||||||
│ │ │ ├── clear_speech_markup # Tmp data
|
│ │ │ ├── clear_speech_markup ### Tmp data
|
||||||
│ │ │ ├── complex_markup # Tmp data
|
│ │ │ ├── complex_markup ### Tmp data
|
||||||
│ │ │ ├── new_metadata # Tmp data
|
│ │ │ ├── new_metadata ### Tmp data
|
||||||
│ │ │ └── simple_xml # Tmp data
|
│ │ │ └── simple_xml ### Tmp data
|
||||||
│ │ ├── full_periods # Automatically marked protocols from all periods.
|
│ │ ├── full_periods ### Automatically marked protocols from all periods.
|
||||||
│ │ │ ├── 01_Wahlperiode_1949-1953
|
│ │ │ ├── 01_Wahlperiode_1949-1953
|
||||||
│ │ │ ├── 02_Wahlperiode_1953-1957
|
│ │ │ ├── 02_Wahlperiode_1953-1957
|
||||||
│ │ │ ├── 03_Wahlperiode_1957-1961
|
│ │ │ ├── 03_Wahlperiode_1957-1961
|
||||||
@ -65,37 +65,37 @@ Structure:
|
|||||||
│ │ │ ├── 16_Wahlperiode_2005-2009_faulty
|
│ │ │ ├── 16_Wahlperiode_2005-2009_faulty
|
||||||
│ │ │ ├── 17_Wahlperiode_2009-2013_faulty
|
│ │ │ ├── 17_Wahlperiode_2009-2013_faulty
|
||||||
│ │ │ └── 18_Wahlperiode_2013-2017
|
│ │ │ └── 18_Wahlperiode_2013-2017
|
||||||
│ │ └── test_data # Automatically marked test_data protocols.
|
│ │ └── test_data ### Automatically marked test_data protocols.
|
||||||
│ │ ├── beautiful_xml # Final output: human-readable automatically marked protocols.
|
│ │ ├── beautiful_xml ### Final output: human-readable automatically marked protocols.
|
||||||
│ │ ├── clear_speech_markup # Tmp data
|
│ │ ├── clear_speech_markup ### Tmp data
|
||||||
│ │ ├── complex_markup # Tmp data
|
│ │ ├── complex_markup ### Tmp data
|
||||||
│ │ ├── new_metadata # Tmp data
|
│ │ ├── new_metadata ### Tmp data
|
||||||
│ │ └── simple_xml # Tmp data
|
│ │ └── simple_xml ### Tmp data
|
||||||
│ └── nlp # All data created from the automatically marked protocols.
|
│ └── nlp ### All data created from the automatically marked protocols.
|
||||||
│ └── full_periods # Contains data created from all protocols.
|
│ └── full_periods ### Contains data created from all protocols.
|
||||||
│ ├── n-grams # N-gram data based on protocols (sibling of this folder).
|
│ ├── n-grams ### N-gram data based on protocols (sibling of this folder).
|
||||||
│ │ ├── lm_ns_speaker # N-grams from lemmatized protocols without stop words counted by speaker.
|
│ │ ├── lm_ns_speaker ### N-grams from lemmatized protocols without stop words counted by speaker.
|
||||||
│ │ │ ├── 1_grams
|
│ │ │ ├── 1_grams
|
||||||
│ │ │ ├── 2_grams
|
│ │ │ ├── 2_grams
|
||||||
│ │ │ ├── 3_grams
|
│ │ │ ├── 3_grams
|
||||||
│ │ │ ├── 4_grams
|
│ │ │ ├── 4_grams
|
||||||
│ │ │ └── 5_grams
|
│ │ │ └── 5_grams
|
||||||
│ │ ├── lm_ns_year # N-grams from lemmatized protocols without stop words counted by year.
|
│ │ ├── lm_ns_year ### N-grams from lemmatized protocols without stop words counted by year.
|
||||||
│ │ │ ├── 1_grams
|
│ │ │ ├── 1_grams
|
||||||
│ │ │ ├── 2_grams
|
│ │ │ ├── 2_grams
|
||||||
│ │ │ ├── 3_grams
|
│ │ │ ├── 3_grams
|
||||||
│ │ │ ├── 4_grams
|
│ │ │ ├── 4_grams
|
||||||
│ │ │ └── 5_grams
|
│ │ │ └── 5_grams
|
||||||
│ │ ├── tk_ws_speaker_(1-3) # N-grams from tokenized protocols with stop words counted by speaker.
|
│ │ ├── tk_ws_speaker_(1-3) ### N-grams from tokenized protocols with stop words counted by speaker.
|
||||||
│ │ │ ├── 1_grams
|
│ │ │ ├── 1_grams
|
||||||
│ │ │ ├── 2_grams
|
│ │ │ ├── 2_grams
|
||||||
│ │ │ └── 3_grams
|
│ │ │ └── 3_grams
|
||||||
│ │ └── tk_ws_year_(1-4) # N-grams from tokenized protocols with stop words counted by year.
|
│ │ └── tk_ws_year_(1-4) ### N-grams from tokenized protocols with stop words counted by year.
|
||||||
│ │ ├── 1_grams
|
│ │ ├── 1_grams
|
||||||
│ │ ├── 2_grams
|
│ │ ├── 2_grams
|
||||||
│ │ ├── 3_grams
|
│ │ ├── 3_grams
|
||||||
│ │ └── 4_grams
|
│ │ └── 4_grams
|
||||||
│ └── protocols # Lemmatized and tokenized protocols used for n-gram calculation.
|
│ └── protocols ### Lemmatized and tokenized protocols used for n-gram calculation.
|
||||||
│ ├── protocols_lemmatized_without_stopwords
|
│ ├── protocols_lemmatized_without_stopwords
|
||||||
│ └── protocols_tokenized_with_stopwords
|
│ └── protocols_tokenized_with_stopwords
|
||||||
└── protocol_DTD
|
└── protocol_DTD
|
||||||
|
Loading…
Reference in New Issue
Block a user