106 lines
8.6 KiB
Markdown
106 lines
8.6 KiB
Markdown
|
[Regular expressions time extraction]
|
|||
|
# These regular expressions are used to extract the start and ending time of one
|
|||
|
# session. The regular expressions are kind of complex because they have to catch
|
|||
|
# a lot of human errors. To catch those errors the expression is repeatedly
|
|||
|
# "chained" by using the or statement with only minor differences between each
|
|||
|
# expression. This is the easiest way though to catch as many times as possible.
|
|||
|
# The expressions match the partial strings where the start or end time is mentioned.
|
|||
|
# With different match groups the hours and minutes will then be extracted.
|
|||
|
|
|||
|
# START TIME: Matches the start time.
|
|||
|
# Three alternatives chained with "|":
#   1. "Die Sitzung wird um H Uhr [M]" or "... H.M Uhr", optionally "durch ...", ending in
#      "eröffnet." or "eingeleitet ..." (groups 1-2: hour/minute, groups 3-4: hour/minute of the dotted form)
#   2. "Begi...: H[.M] Uhr" (groups 5-6)
#   3. "Die Sitzung wird um H Uhr eröffnet." (group 7)
# Only the groups of the alternative that matched are filled.
session_start_time = (?:Die Sitzung wird [umrn]+ (?:(?:(\d{1,2}) Uhr (?:(\d{1,2})?)|(?:(\d{1,2})\.(?:(\d{1,2})) Uhr)) ?(?:Minuten?)?.?)? ?(?:durch\n*[\w \.;'\(\)]*)?[\s \. A-z]*(?:(?:eröffnet\.)|(?:eingeleitet[\w „\",\.]+)))|(?:Begi[\w]+:? (\d{1,2})(?:[, \.]*)?(?:(\d{1,2}))? ?Uhr\.?)|(?:Die Sitzung wird [umrn]+ (\d{1,2}) Uhr eröffnet.)
|
|||
|
|
|||
|
# END TIME: Matches the end time.
|
|||
|
# Four alternatives chained with "|", each starting at the literal "(Schluss" / "(Schluß".
# Every alternative carries its own hour/minute capture groups (18 groups in total), so the
# groups holding the time depend on which alternative matched.
session_end_time = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr\s(\d{1,2}).?\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(\d{1,2})\sUhr (\d{1,2})\sMinuten?)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr)|(?:(\d{1,2})[\., ]+(\d{1,2})\sUhr\))|(?:(\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(\d{1,2})\sUhr und\s.?(\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (\d{1,2}) Uhr (\d{1,2})\.\))
|
|||
|
|
|||
|
|
|||
|
[Regular expressions splits]
|
|||
|
# These expressions are used for splitting the protocols at the location if
|
|||
|
# matched.
|
|||
|
# All match groups are non-capturing except the group capturing the entire regex
|
|||
|
# to insert it later on again. This is the main difference to the time extractions.
|
|||
|
# These splits are needed to automatically separate the actual session content
|
|||
|
# from the table of contents and the attachments.
|
|||
|
|
|||
|
# Split at first president occurrence.
|
|||
|
# Matches a line break followed by a word containing "Präsident"/"präsident", a name and ":".
# The single capture group wraps the whole match.
session_start_president_split = (\n\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?:)
|
|||
|
|
|||
|
# Split at the end time of session.
|
|||
|
# Same four "(Schluss ...)" alternatives as session_end_time, but the hour/minute groups are
# non-capturing and one outer group captures the whole match.
attachment_split = ((?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)))
|
|||
|
|
|||
|
|
|||
|
[Regular expressions speakers]
|
|||
|
# These are the regular expressions for matching the speakers in the protocols.
|
|||
|
# They consist of tuples with three values.
|
|||
|
# First element of the tuple is the regex.
|
|||
|
# Second element is a case that tells if this regex should be used as a
|
|||
|
# First, middle, or last element/match during the markup process.
|
|||
|
# Third element describes the type of speech the speaker is holding in German, to use it as an attribute later on.
|
|||
|
# The value tuple is divided with " ; " to convert it into a list later on.
|
|||
|
# It is similar to csv syntax. If needed the user can add more key, value pairs following the same
|
|||
|
# pattern to automatically identify even more speaker roles.
|
|||
|
|
|||
|
# Line beginning with a word that contains "Präsident"/"präsident", followed by a name and ":".
# Position "first", role "Präsident".
speaker_president_first = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; first ; Präsident
|
|||
|
# Line beginning with a name, then " Staatssekretär" plus trailing text (line breaks allowed) up to ":".
# Position "middle", role "Staatssekretär".
speaker_state_secretary = ^[ \-\.,\w]+ Staatssekretär[\-\w\n, \n]+: ; middle ; Staatssekretär
|
|||
|
# Line beginning with a name, then " Staatsminister" plus trailing text (line breaks allowed) up to ":".
# Position "middle", role "Staatsminister".
speaker_minister_of_state = ^[ \-\.,\w]+ Staatsminister[\-\w\n, \n]+: ; middle ; Staatsminister
|
|||
|
# Name, optional parenthesised addition, then ", Antragsteller"/", antragsteller" (optionally
# with "in") and trailing text up to ":". Position "middle", role "Antragsteller".
# NOTE(review): unlike the other speaker_* patterns this one has no leading "^" anchor - confirm that is intended.
speaker_applicant = [ \-\.,\w]+ (\([\w ]+\))?, (?:A|a)ntragsteller(?:in)?[\-\w\n, \n]*: ; middle ; Antragsteller
|
|||
|
# Same regex and role as speaker_president_first, but with position "middle".
speaker_president = ^\w*(?:P|p)räsident\w* [ÜÖÄA-züöäß \-\.,]+ ?: ; middle ; Präsident
|
|||
|
# Line beginning with a name, an optional parenthesised addition, then a second parenthesised
# part (presumably the parliamentary group - confirm) and ":". Position "middle", role "MdB".
speaker_undefined = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-\.]+\) ?: ; middle ; MdB
|
|||
|
# Line beginning with text, then "Bundesminister"/"Bundesministerin", optional further text and ":".
# Position "middle", role "Bundesminister".
speaker_defined = ^[\w \-\.,]+ ?Bundesminister(in)? [\w\-\., ]* ?: ; middle ; Bundesminister
|
|||
|
# Line beginning with text directly followed by "Bundeskanzler"/"Bundeskanzlerin" and ":".
# Position "middle", role "Bundeskanzler".
speaker_chancellor = ^[\w \-\.\,]+Bundeskanzler(in)? ?: ; middle ; Bundeskanzler
|
|||
|
# Line beginning with text, then "Schriftführer"/"Schriftführerin" and ":".
# Position "middle", role "Schriftführer".
speaker_secretary = ^[\w \-\.,]+ ?Schriftführer(in)? ?: ; middle ; Schriftführer
|
|||
|
# Line beginning with a name, an optional parenthesised addition, a second parenthesised part,
# then ", Berichterstatter:"/", berichterstatter:". Position "middle", role "Berichterstatter".
speaker_rapporteur = ^[ \-\.,\w]+ ?(\([\w ]+\))? ?\([\w\/ \d\-]+\) ?, (?:B|b)erichterstatter: ; middle ; Berichterstatter
|
|||
|
# The four "(Schluss ...)" alternatives of attachment_split without the outer capture group.
# Position "last", tag "Zeitpunkt".
end_of_session = (?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr\s(?:\d{1,2}).?\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;: ]*)\s?(?:(?:(?:\d{1,2})\sUhr (?:\d{1,2})\sMinuten?)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr)|(?:(?:\d{1,2})[\., ]+(?:\d{1,2})\sUhr\))|(?:(?:\d{1,2})\sUhr))[\., ]*\))|(?:\(Schlu(?:(?:ss)|ß)(?: .?der .?Sitzung)?(?:[;:]*)[\w\n,\. ]*\s(?:(?:(?:\d{1,2})\sUhr und\s.?(?:\d{1,2}).?\sMinuten?\.\))))|(?:\(Schlu(?:(?:ss)|ß):? (?:\d{1,2}) Uhr (?:\d{1,2})\.\)) ; last ; Zeitpunkt
|
|||
|
|
|||
|
[Additional name features]
|
|||
|
# In this section the user can add additional strings which are not part of the
|
|||
|
# Stammdatenbank but are used inside the protocols.
|
|||
|
# Two entries separated by " ; ": "Dr. Dr. h. c." and "Dr. h. c.".
academic_titles = Dr. Dr. h. c. ; Dr. h. c.
|
|||
|
# Five entries separated by " ; ".
parties = DIE LINKE ; CDU/CSU ; PDS/Linke Liste ; Fraktionslos ; F.D.P.
|
|||
|
|
|||
|
[Regular expressions speeches]
|
|||
|
# These regular expressions are used to markup some entities inside of the actual speeches.
|
|||
|
# The value of any given key is a tuple with two values separated by " ; " like in the section
|
|||
|
# [Regular expressions speakers]. First value is the regex and the second value is the tag name
|
|||
|
# written as a string. This list of key, value pairs can also be extended by the user to identify
|
|||
|
# even more entities inside of the speeches. Just add key, value pairs following the same pattern.
|
|||
|
# These expressions are only used to identify entities which are present in one <p> without
|
|||
|
# linebreaks.
|
|||
|
|
|||
|
# Parenthesised text that contains no further parentheses; tag "kommentar".
comments = \B\([^\(\)]*\)\B ; kommentar
|
|||
|
# Page header of the form "Deutscher Bundestag – <n>. Wahlperiode – <n>. Sitzung. <Bonn|Berlin>,
# <weekday>, den <day>. <month> <year>", optionally surrounded by digits/tabs/spaces; tag "metadata".
# The month alternation lists all twelve German month names.
date_string_with_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,2} ?\. Wahlperiode (?:–|—|-|--) \d{1,3} ?\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) \d{4}[\d\t ]* ; metadata
|
|||
|
# Page header of the form "Deutscher Bundestag – <n>. Sitzung. <Bonn|Berlin>, <weekday>, den
# <day>. <month> <year>" (no "Wahlperiode" part), optionally surrounded by digits/tabs/spaces;
# tag "metadata". The month alternation lists all twelve German month names.
date_string_without_periode = [\d\t ]*Deutscher Bundestag (?:–|—|-|--) \d{1,3}\. Sitzung ?\. (?:Bonn|Berlin), (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), den \d{1,2} ?\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) \d{4}[\d\t ]* ; metadata
|
|||
|
|
|||
|
[Multiline entities]
|
|||
|
# These regular expressions are used to identify entities in speeches which span multiple <p>
|
|||
|
# elements. The value of any given key is a tuple with three values separated by " ; " like in the
|
|||
|
# section [Regular expressions speakers]. First value is a regex describing how the start of the
|
|||
|
# entity string looks like. The second value is a regex describing how the end of the entity string
|
|||
|
# looks like. Third value is the tagname written as a normal string.
|
|||
|
# Start regex: "(" followed by text without parentheses. End regex: text without parentheses
# followed by ")". Tag "kommentar".
multiline_comment = \B\([^\(\)]* ; [^\(\)]*\)\B ; kommentar
|
|||
|
|
|||
|
[File paths]
|
|||
|
# This is where the paths for input and output folders are set. The input folder
|
|||
|
# path should contain the XML-protocols that will be processed.
|
|||
|
# The output folder path specifies the place where all the intermediate files
|
|||
|
# and the final new XML protocols with the new automatic created markup will be
|
|||
|
# saved.
|
|||
|
|
|||
|
# NOTE(review): both values are machine-specific absolute paths (/home/stephan/...); adjust them before running elsewhere.
input_folder_xmls = /home/stephan/Repos/master_thesis/data/working_data/development_data_xml
|
|||
|
output_folder = /home/stephan/Repos/master_thesis/data/working_data/
|
|||
|
|
|||
|
# These paths will be set while running the program.
|
|||
|
# The lemmatized folder, its tmp folder and nlp_beuatiful_xml all sit below the nlp_output directory.
# NOTE(review): "beuatiful" in nlp_beuatiful_xml looks like a misspelling of "beautiful"; the key is
# left unchanged because the consuming code presumably looks it up by this exact name.
nlp_output = /home/stephan/Desktop/nlp_output
|
|||
|
nlp_input = /home/stephan/Desktop/protocols/
|
|||
|
nlp_lemmatized_tokenized = /home/stephan/Desktop/nlp_output/lemmatized
|
|||
|
tmp_path = /home/stephan/Desktop/nlp_output/lemmatized/tmp
|
|||
|
nlp_beuatiful_xml = /home/stephan/Desktop/nlp_output/nlp_beuatiful_xml
|
|||
|
# NOTE(review): input_folder_xmls and output_folder were assigned a second time here (/home/stephan/Repos/master_thesis_data/inputs/excluded_periods/ and /home/stephan/Desktop/output). A key repeated inside one section is rejected by strict parsers such as Python's configparser (DuplicateOptionError) and resolved first-wins or last-wins elsewhere, so only the definitions at the top of this section remain - confirm they hold the intended values.
|
|||
|
# Presumably one folder per processing step (confirm against the code that reads these keys).
# All of them except fixed_markup are sub-folders of /home/stephan/Desktop/output.
new_metadata = /home/stephan/Desktop/output/new_metadata
|
|||
|
new_simple_markup = /home/stephan/Desktop/output/simple_xml
|
|||
|
complex_markup = /home/stephan/Desktop/output/complex_markup
|
|||
|
clear_speech_markup = /home/stephan/Desktop/output/clear_speech_markup
|
|||
|
beautiful_xml = /home/stephan/Desktop/output/beautiful_xml
|
|||
|
fixed_markup = /home/stephan/Repos/master_thesis/data/working_data/id_fixed/fixed_markup
|