[Regular expressions time extraction]
# These regular expressions are used to extract the start and end time of one
# session. The regular expressions are rather complex because they have to catch
# a lot of human errors. To catch those errors the expression is repeatedly
# "chained" using the or operator (|) with only minor differences between each
# alternative. This is the easiest way to catch as many times as possible.
# The expressions match the partial strings where the start or end time is
# mentioned. The capturing groups then hold the hours and minutes.

# START TIME: Matches the start time.
# Three alternatives are chained: "Die Sitzung wird um ... eröffnet/eingeleitet",
# "Beginn: ... Uhr", and the short form "Die Sitzung wird um <h> Uhr eröffnet.".
session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)

# END TIME: Matches the end time.
# Four chained alternatives of "(Schluss/Schluß [der Sitzung] ... Uhr ...)".
session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))

[Regular expressions splits]
# These expressions are used for splitting the protocols at the location where
# they match.
# All groups are non-capturing except the one group capturing the entire regex,
# so that the matched text can be inserted again later on. This is the main
# difference to the time extraction expressions.
# These splits are needed to automatically separate the actual session content
# from the table of contents and the attachments.

# Split at first president occurrence.
session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)

# Split at the end time of session.
attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))

[Regular expressions speakers]
# These are the regular expressions for matching the speakers in the protocols.
# Each value is a tuple with three elements:
#   1. the regex,
#   2. a case telling whether this regex should be used as a first, middle, or
#      last element/match during the markup process,
#   3. the type of speech the speaker is holding, in German, to use it as an
#      attribute later on.
# The elements of the tuple are separated by " ; " so the value can be converted
# into a list later on. It is similar to CSV syntax. If needed the user can add
# more key, value pairs following the same pattern to automatically identify
# even more speaker roles.
# NOTE(review): because " ; " is part of every value, the consuming parser must
# not treat ";" as an inline-comment prefix — confirm against the loader.
speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt

[Additional name features]
# In this section the user can add additional strings which are not part of the
# Stammdatenbank but are used inside the protocols.
# Multiple entries are separated by " ; ".
academic_titles = Dr. Dr. h. c. ; Dr. h. c.
parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.
[Regular expressions speeches]
# These regular expressions are used to mark up some entities inside of the
# actual speeches. The value of any given key is a tuple with two values
# separated by " ; " like in the section [Regular expressions speakers].
# First value is the regex and the second value is the tag name written as a
# string. This list of key, value pairs can also be extended by the user to
# identify even more entities inside of the speeches. Just add key, value pairs
# following the same pattern.
# These expressions are only used to identify entities which are present in one
# element without linebreaks.
# NOTE(review): the original comment is missing a word after "one" — presumably
# the name of an XML element; confirm.
comments = \B\([^\(\)]*\)\B ; kommentar
date_string_with_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) \d{4}[\d\t ]* ; metadata
date_string_without_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) \d{4}[\d\t ]* ; metadata

[Multiline entities]
# These regular expressions are used to identify entities in speeches which
# span over multiple elements. The value of any given key is a tuple with three
# values separated by " ; " like in the section [Regular expressions speakers].
# First value is a regex describing what the start of the entity string looks
# like. The second value is a regex describing what the end of the entity
# string looks like. Third value is the tag name written as a normal string.
multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar

[File paths]
# This is where the paths for input and output folders are set. The input folder
# path should contain the XML protocols that will be processed.
# The output folder path specifies the place where all the intermediate files
# and the final new XML protocols with the new automatically created markup
# will be saved.
# Each key appears exactly once; the values kept here are the ones that were
# defined last.
input_folder_xmls = /home/stephan/Repos/master_thesis_data/inputs/excluded_periods/
output_folder = /home/stephan/Desktop/output

# These paths will be set while running the program.
nlp_output = /home/stephan/Desktop/nlp_output
nlp_input = /home/stephan/Desktop/protocols/
nlp_lemmatized_tokenized = /home/stephan/Desktop/nlp_output/lemmatized
tmp_path = /home/stephan/Desktop/nlp_output/lemmatized/tmp
nlp_beuatiful_xml = /home/stephan/Desktop/nlp_output/nlp_beuatiful_xml
new_metadata = /home/stephan/Desktop/output/new_metadata
new_simple_markup = /home/stephan/Desktop/output/simple_xml
complex_markup = /home/stephan/Desktop/output/complex_markup
clear_speech_markup = /home/stephan/Desktop/output/clear_speech_markup
beautiful_xml = /home/stephan/Desktop/output/beautiful_xml
fixed_markup = /home/stephan/Repos/master_thesis/data/working_data/id_fixed/fixed_markup