bundesdata_markup_nlp_software/bundesdata_markup_nlp/config_readme.md

[Regular expressions time extraction]
# These regular expressions are used to extract the start and end time of one
# session. They are fairly complex because they have to catch a lot of human
# errors. To catch those errors, several expressions with only minor
# differences between them are "chained" with the alternation operator (|).
# This is the easiest way to catch as many time notations as possible.
# The expressions match the partial strings where the start or end time is
# mentioned. The hours and minutes are then extracted from the capture groups.

# START TIME: Matches the start time.
# Three alternatives are chained:
#   1. "Die Sitzung wird um <h> Uhr [<m>] ... eröffnet." / "... eingeleitet ..."
#   2. "Begi...: <h>[,. ]<m> Uhr"
#   3. "Die Sitzung wird um <h> Uhr eröffnet."
# Parentheses must stay balanced across the whole value, otherwise the
# expression cannot be compiled.
session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)

# END TIME: Matches the end time.
# Every alternative matches a parenthesised closing note that starts with
# "(Schluss" or "(Schluß"; all but the last alternative also allow an optional
# "der Sitzung" after it. Each alternative captures the hour and, where the
# notation contains them, the minutes in its own capture groups.
session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))


[Regular expressions splits]
# These expressions are used for splitting the protocols at the position where
# they match.
# All groups are non-capturing except the one group that captures the entire
# regex, so that the matched text can be inserted again later on. This is the
# main difference to the time extraction expressions.
# These splits are needed to automatically separate the actual session content
# from the table of contents and the attachments.

# Split at first president occurrence.
session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)

# Split at the end time of session.
attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))


[Regular expressions speakers]
# These are the regular expressions for matching the speakers in the protocols.
# Each value is a tuple of three elements separated by " ; " (similar to CSV
# syntax), which is converted into a list later on:
#   1. The regex.
#   2. The case (first, middle or last) that tells whether this regex is used
#      as a first, middle or last element/match during the markup process.
#   3. The type of speech the speaker is holding, in German, to use it as an
#      attribute later on.
# If needed, the user can add more key, value pairs following the same pattern
# to automatically identify even more speaker roles.
# NOTE(review): speaker_applicant and end_of_session are the only patterns in
# this section without a leading ^ anchor; confirm this is intended.

speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt

[Additional name features]
# In this section the user can add additional strings which are not part of the
# Stammdatenbank but are used inside the protocols.
# The entries of one key are listed on a single line, separated by " ; ".
academic_titles = Dr. Dr. h. c. ; Dr. h. c.
parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.

[Regular expressions speeches]
# These regular expressions are used to mark up some entities inside of the actual speeches.
# The value of any given key is a tuple of two values separated by " ; ", like in the section
# [Regular expressions speakers]. The first value is the regex and the second value is the tag name
# written as a string. This list of key, value pairs can also be extended by the user to identify
# even more entities inside of the speeches. Just add key, value pairs following the same pattern.
# These expressions are only used to identify entities which are present in one <p> without
# line breaks.
# The month alternation in both date expressions lists all twelve German month names.

comments = \B\([^\(\)]*\)\B ; kommentar
date_string_with_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) \d{4}[\d\t ]* ; metadata
date_string_without_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) \d{4}[\d\t ]* ; metadata

[Multiline entities]
# These regular expressions are used to identify entities in speeches which span over multiple <p>
# elements. The value of any given key is a tuple of three values separated by " ; ", like in the
# section [Regular expressions speakers]. The first value is a regex describing what the start of the
# entity string looks like. The second value is a regex describing what the end of the entity string
# looks like. The third value is the tag name written as a normal string.
multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar

[File paths]
# This is where the paths for the input and output folders are set. The input
# folder path should contain the XML protocols that will be processed.
# The output folder path specifies the place where all the intermediate files
# and the final new XML protocols with the new automatically created markup
# will be saved.
# Every key must appear only once in this section: depending on the parser,
# repeated keys are either rejected or silently override each other.

input_folder_xmls = /home/stephan/Repos/master_thesis_data/inputs/excluded_periods/
output_folder = /home/stephan/Desktop/output

# These paths will be set while running the program.
nlp_output = /home/stephan/Desktop/nlp_output
nlp_input = /home/stephan/Desktop/protocols/
nlp_lemmatized_tokenized = /home/stephan/Desktop/nlp_output/lemmatized
tmp_path = /home/stephan/Desktop/nlp_output/lemmatized/tmp
nlp_beuatiful_xml = /home/stephan/Desktop/nlp_output/nlp_beuatiful_xml
new_metadata = /home/stephan/Desktop/output/new_metadata
new_simple_markup = /home/stephan/Desktop/output/simple_xml
complex_markup = /home/stephan/Desktop/output/complex_markup
clear_speech_markup = /home/stephan/Desktop/output/clear_speech_markup
beautiful_xml = /home/stephan/Desktop/output/beautiful_xml
fixed_markup = /home/stephan/Repos/master_thesis/data/working_data/id_fixed/fixed_markup