Initial commit

2019-02-28 14:09:53 +01:00
commit 96e84d083d
97 changed files with 66293 additions and 0 deletions
--- a/app/speeches/management/commands/import_protocols.py
+++ b/app/speeches/management/commands/import_protocols.py
@ -0,0 +1,120 @@
+from django.core.management.base import BaseCommand
+from speeches.models import Protocol, Speech
+from speakers.models import Speaker
+from lxml import etree
+import os
+import fnmatch
+import datetime
+from tqdm import tqdm
+
+
+class Command(BaseCommand):
+    """Import XML protocol files into the Protocol and Speech models.
+
+    Walks the input directory for ``*.xml`` files. For each file one
+    Protocol row is saved (metadata, toc, attachment), followed by one
+    Speech row per ``sitzungsbeginn``/``rede`` element.
+    """
+
+    help = ("Adds protocols to the database using the django models"
+            " syntax. Protocols will be added from the xml protocol files."
+            " Input is a path pointing to all/multiple protocols in one"
+            " directory with one level of subdirectories. First imports"
+            " toc, attachments and metadata with model Protocol. Speeches"
+            " are then imported with model Speech and put into relation"
+            " to their protocol.")
+
+    def add_arguments(self, parser):
+        """Register the positional ``input_path`` argument."""
+        parser.add_argument("input_path",
+                            type=str)
+
+    @staticmethod
+    def _parse_time(value):
+        """Return *value* parsed as ``%H:%M``, or None if malformed."""
+        try:
+            return datetime.datetime.strptime(value, "%H:%M")
+        except ValueError:
+            return None
+
+    def handle(self, *args, **options):
+        """Import every XML protocol found below ``input_path``.
+
+        Raises IndexError if a file lacks one of the expected nodes or
+        attributes, and ValueError if its date is not ``%d.%m.%Y``.
+        """
+        list_of_files = []
+        # Distinct loop name so the input path is not shadowed.
+        for root, _subdirs, files in os.walk(options["input_path"]):
+            for name in files:
+                if fnmatch.fnmatch(name, "*.xml"):
+                    list_of_files.append(os.path.join(root, name))
+        for file_path in tqdm(sorted(list_of_files), desc="Importing protocol data"):
+            tree = etree.parse(file_path)
+            protocol = Protocol()
+
+            # The protocol ID is the file name without its ".xml" suffix.
+            protocol.protocol_id = os.path.basename(file_path)[:-4]
+
+            protocol.protocol_period = tree.xpath("@wahlperiode")[0]
+
+            session_date = tree.xpath("//@date")[0]
+            protocol.session_date_str = session_date
+            session_date = datetime.datetime.strptime(session_date, "%d.%m.%Y")
+            protocol.session_date = session_date.strftime("%Y-%m-%d")
+
+            # Malformed session times are stored as None.
+            protocol.start_of_session = self._parse_time(
+                tree.xpath("//@sitzung-start-uhrzeit")[0])
+            protocol.end_of_session = self._parse_time(
+                tree.xpath("//@sitzung-ende-uhrzeit")[0])
+
+            protocol.session_nr = tree.xpath("//sitzungsnr")[0].text
+            protocol.election_period = tree.xpath("//wahlperiode")[0].text
+            protocol.toc = tree.xpath("//inhaltsverzeichnis")[0].text
+            protocol.attachment = tree.xpath("//anlagen")[0].text
+            protocol.save()
+
+            speeches = tree.xpath("//sitzungsbeginn | //rede")
+            # Pair each element with its neighbours (None at both ends).
+            neighbours = zip([None] + speeches[:-1], speeches, speeches[1:] + [None])
+            for previous_e, current_e, next_e in neighbours:
+                speech = Speech()
+                speech.foreign_protocol = protocol
+                if previous_e is not None:
+                    speech.previous_speech_id = previous_e.xpath("@id")[0]
+                speech.speech_id = current_e.xpath("@id")[0]
+                if next_e is not None:
+                    speech.next_speech_id = next_e.xpath("@id")[0]
+                # Relative path: "//@typ" would match document-wide and
+                # give every speech the first typ of the whole file.
+                speech.speaker_type = current_e.xpath(".//@typ")[0]
+                speaker_id = current_e.xpath(".//redner/@id")[0]
+                # Speeches whose speaker ID is the literal string "None"
+                # are saved without a linked Speaker.
+                if speaker_id != "None":
+                    speech.foreign_speaker = Speaker.objects.filter(pk=speaker_id)[0]
+                # encoding="unicode" yields str; str(bytes) would store
+                # the "b'...'" repr instead of the markup.
+                speech.speech_content = "".join(
+                    etree.tostring(p, encoding="unicode")
+                    for p in current_e.xpath(".//p"))
+                speech.original_string = current_e.xpath(".//redner/name")[0].tail
+                speech.save()