Initial commit

This commit is contained in:
Stephan Porada
2019-02-28 14:09:53 +01:00
commit 96e84d083d
97 changed files with 66293 additions and 0 deletions

View File

@@ -0,0 +1,120 @@
from django.core.management.base import BaseCommand
from speeches.models import Protocol, Speech
from speakers.models import Speaker
from lxml import etree
import os
import fnmatch
import datetime
from tqdm import tqdm
class Command(BaseCommand):
    """Import protocol XML files and their speeches into the database.

    Every ``*.xml`` file found below the given input path is parsed with
    lxml. One ``Protocol`` row is saved per file, followed by one ``Speech``
    row for every ``<sitzungsbeginn>`` and ``<rede>`` element in that file.
    """

    help = ("Adds protocols to the database using the django models"
            " syntax. Protocols will be added from the xml protocol files."
            " Input is a path pointing to all/multiple protocols in one"
            " directory with one level of subdirectories. First imports"
            " toc, attachments and metadata with model Protocol. Speeches"
            " are then put into relation to the protocols with the model"
            " Speech.")

    def add_arguments(self, parser):
        """Register the positional ``input_path`` command line argument."""
        parser.add_argument("input_path",
                            type=str)

    def handle(self, *args, **options):
        """Import all protocol files found below ``options["input_path"]``.

        Files are processed in sorted path order. For each file the protocol
        is saved first so that its speeches can reference it.

        Raises:
            IndexError: if a file lacks one of the XML nodes or attributes
                that are read unconditionally, or if a referenced speaker
                id has no matching ``Speaker`` row.
        """
        xml_files = self._collect_xml_files(options["input_path"])
        for file_path in tqdm(sorted(xml_files),
                              desc="Importing protocol data"):
            tree = etree.parse(file_path)
            protocol = self._import_protocol(tree, file_path)
            self._import_speeches(tree, protocol)

    @staticmethod
    def _collect_xml_files(input_path):
        """Return the paths of all ``*.xml`` files below ``input_path``."""
        xml_files = []
        for dirpath, _subdirs, files in os.walk(input_path):
            for name in files:
                if fnmatch.fnmatch(name, "*.xml"):
                    xml_files.append(os.path.join(dirpath, name))
        return xml_files

    @staticmethod
    def _parse_time(value):
        """Parse an ``HH:MM`` string; return ``None`` if it does not match."""
        try:
            return datetime.datetime.strptime(value, "%H:%M")
        except ValueError:
            return None

    def _import_protocol(self, tree, file_path):
        """Build, save and return the ``Protocol`` described by ``tree``."""
        protocol = Protocol()
        # The protocol id is the file name without its extension.
        protocol.protocol_id = os.path.splitext(
            os.path.basename(file_path))[0]
        protocol.protocol_period = tree.xpath("@wahlperiode")[0]

        session_date = tree.xpath("//@date")[0]
        protocol.session_date_str = session_date
        # Keep the raw string above and store an ISO formatted date as well.
        protocol.session_date = datetime.datetime.strptime(
            session_date, "%d.%m.%Y").strftime("%Y-%m-%d")

        # Start/end times that are not in HH:MM form are stored as None.
        protocol.start_of_session = self._parse_time(
            tree.xpath("//@sitzung-start-uhrzeit")[0])
        protocol.end_of_session = self._parse_time(
            tree.xpath("//@sitzung-ende-uhrzeit")[0])

        protocol.session_nr = tree.xpath("//sitzungsnr")[0].text
        protocol.election_period = tree.xpath("//wahlperiode")[0].text
        protocol.toc = tree.xpath("//inhaltsverzeichnis")[0].text
        protocol.attachment = tree.xpath("//anlagen")[0].text
        protocol.save()
        return protocol

    def _import_speeches(self, tree, protocol):
        """Save one ``Speech`` per speech element, linked to ``protocol``."""
        speeches = tree.xpath("//sitzungsbeginn | //rede")
        # Walk the speeches together with their neighbours so every speech
        # can store the ids of the previous and the next one.
        neighbours = zip([None] + speeches[:-1],
                         speeches,
                         speeches[1:] + [None])
        for previous_e, current_e, next_e in neighbours:
            speech = Speech()
            speech.foreign_protocol = protocol
            if previous_e is not None:
                speech.previous_speech_id = previous_e.xpath("@id")[0]
            speech.speech_id = current_e.xpath("@id")[0]
            if next_e is not None:
                speech.next_speech_id = next_e.xpath("@id")[0]

            # The path must be relative (".//"): an absolute "//@typ" is
            # evaluated against the whole document and would hand every
            # speech the first "typ" attribute of the file.
            speaker_types = current_e.xpath(".//@typ")
            if speaker_types:
                speech.speaker_type = speaker_types[0]
            # NOTE(review): without a "typ" attribute the field keeps its
            # model default - confirm this is acceptable for Speech.

            speaker_id = current_e.xpath(".//redner/@id")[0]
            # The literal string "None" means no speaker is referenced.
            if speaker_id != "None":
                speech.foreign_speaker = Speaker.objects.filter(
                    pk=speaker_id)[0]

            # Serialize straight to str; str(bytes) would store the
            # "b'...'" repr with escaped newlines instead of the markup.
            speech.speech_content = "".join(
                etree.tostring(paragraph, encoding="unicode")
                for paragraph in current_e.xpath(".//p"))

            # The text following the <name> element is kept as the
            # original speaker string.
            speech.original_string = current_e.xpath(
                ".//redner/name")[0].tail
            speech.save()