Initial commit
This commit is contained in:
120
app/speeches/management/commands/import_protocols.py
Executable file
120
app/speeches/management/commands/import_protocols.py
Executable file
@ -0,0 +1,120 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from speeches.models import Protocol, Speech
|
||||
from speakers.models import Speaker
|
||||
from lxml import etree
|
||||
import os
|
||||
import fnmatch
|
||||
import datetime
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Import protocol XML files and the speeches they contain.

    For every ``*.xml`` file found below the given input path one
    ``Protocol`` row is saved, followed by one ``Speech`` row for every
    ``<sitzungsbeginn>`` / ``<rede>`` element of that file.
    """

    help = ("Adds protocols to the database using the django models"
            " syntax. Protocols will be added from the xml protocol files."
            " Input is a path pointing to all/multiple protocols in one"
            " directory with one level of subdirectories. First imports"
            " toc, attachments and metadata with model Protocol. Speeches"
            " will be put into relation to the protocols later on with the"
            " model Speech.")

    def add_arguments(self, parser):
        """Register the positional ``input_path`` argument."""
        parser.add_argument("input_path", type=str)

    def handle(self, *args, **options):
        """Import every XML protocol found below ``options["input_path"]``.

        Files are processed in sorted path order. Lookups of mandatory
        elements/attributes use ``[0]`` and therefore raise ``IndexError``
        when the XML does not contain them.
        """
        xml_files = self._collect_xml_files(options["input_path"])
        for file_path in tqdm(sorted(xml_files),
                              desc="Importing protocol data"):
            tree = etree.parse(file_path)
            protocol = self._import_protocol(tree, file_path)
            self._import_speeches(tree, protocol)

    @staticmethod
    def _collect_xml_files(root_path):
        """Return the paths of all ``*.xml`` files found by walking root_path."""
        found = []
        # Distinct loop names: the original rebound ``path`` while walking it.
        for dir_path, _subdirs, file_names in os.walk(root_path):
            for name in file_names:
                if fnmatch.fnmatch(name, "*.xml"):
                    found.append(os.path.join(dir_path, name))
        return found

    @staticmethod
    def _parse_clock_time(raw_value):
        """Parse an ``HH:MM`` string; return None when it does not match."""
        try:
            return datetime.datetime.strptime(raw_value, "%H:%M")
        except ValueError:
            return None

    def _import_protocol(self, tree, file_path):
        """Create, fill and save the ``Protocol`` for one parsed XML file.

        Returns the saved ``Protocol`` instance.
        """
        protocol = Protocol()
        # The protocol id is the file name without its 4-character ".xml"
        # suffix.
        protocol.protocol_id = os.path.basename(file_path)[:-4]

        protocol.protocol_period = tree.xpath("@wahlperiode")[0]

        # Keep the raw "DD.MM.YYYY" text and also store it as "YYYY-MM-DD".
        raw_date = tree.xpath("//@date")[0]
        protocol.session_date_str = raw_date
        parsed_date = datetime.datetime.strptime(raw_date, "%d.%m.%Y")
        protocol.session_date = parsed_date.strftime("%Y-%m-%d")

        # Times that are not in HH:MM form are stored as None.
        protocol.start_of_session = self._parse_clock_time(
            tree.xpath("//@sitzung-start-uhrzeit")[0])
        protocol.end_of_session = self._parse_clock_time(
            tree.xpath("//@sitzung-ende-uhrzeit")[0])

        protocol.session_nr = tree.xpath("//sitzungsnr")[0].text
        protocol.election_period = tree.xpath("//wahlperiode")[0].text
        protocol.toc = tree.xpath("//inhaltsverzeichnis")[0].text
        protocol.attachment = tree.xpath("//anlagen")[0].text
        protocol.save()
        return protocol

    def _import_speeches(self, tree, protocol):
        """Save one ``Speech`` per ``<sitzungsbeginn>``/``<rede>`` element.

        Each speech is linked to ``protocol`` and stores the ids of the
        element before and after it in document order (left unset at the
        first/last element).
        """
        speeches = tree.xpath("//sitzungsbeginn | //rede")
        neighbours = zip([None] + speeches[:-1],
                         speeches,
                         speeches[1:] + [None])
        for previous_e, current_e, next_e in neighbours:
            speech = Speech()
            speech.foreign_protocol = protocol
            if previous_e is not None:
                speech.previous_speech_id = previous_e.xpath("@id")[0]
            speech.speech_id = current_e.xpath("@id")[0]
            if next_e is not None:
                speech.next_speech_id = next_e.xpath("@id")[0]

            # Relative path on purpose: "//@typ" is absolute and would
            # return the first ``typ`` of the whole document for every
            # speech instead of the one belonging to this element.
            speaker_types = current_e.xpath(".//@typ")
            if speaker_types:
                speech.speaker_type = speaker_types[0]

            # The literal id "None" marks a speech without a known speaker.
            speaker_id = current_e.xpath(".//redner/@id")[0]
            if speaker_id != "None":
                speech.foreign_speaker = Speaker.objects.filter(
                    pk=speaker_id)[0]

            # NOTE(review): etree.tostring() returns bytes by default, so
            # str() stores the bytes repr ("b'<p>...</p>'") of each
            # paragraph. Kept unchanged because readers of speech_content
            # may rely on this format - confirm before switching to
            # encoding="unicode".
            paragraphs = current_e.xpath(".//p")
            speech.speech_content = "".join(
                str(etree.tostring(p)) for p in paragraphs)

            # The text following the <name> element is stored verbatim.
            speech.original_string = current_e.xpath(
                ".//redner/name")[0].tail
            speech.save()
|
Reference in New Issue
Block a user