Initial commit
This commit is contained in:
0
app/ngram_viewer/__init__.py
Executable file
0
app/ngram_viewer/__init__.py
Executable file
3
app/ngram_viewer/admin.py
Executable file
3
app/ngram_viewer/admin.py
Executable file
@ -0,0 +1,3 @@
|
||||
from django.contrib import admin
|
||||
|
||||
# Register your models here.
|
6
app/ngram_viewer/apps.py
Executable file
6
app/ngram_viewer/apps.py
Executable file
@ -0,0 +1,6 @@
|
||||
from django.apps import AppConfig
|
||||
from watson import search as watson
|
||||
|
||||
|
||||
class NgramViewerConfig(AppConfig):
    """Application configuration of the ``ngram_viewer`` Django app."""

    # Label under which the app is registered.
    name = "ngram_viewer"
|
92
app/ngram_viewer/charts.py
Executable file
92
app/ngram_viewer/charts.py
Executable file
@ -0,0 +1,92 @@
|
||||
from jchart import Chart
|
||||
from jchart.config import Axes, DataSet, rgba, Tick
|
||||
from random import randint
|
||||
|
||||
|
||||
class TimeChart(Chart):
    """
    Line chart of the N-gram viewer showing n-gram counts over time.

    The data is handed over through ``get_datasets(data_sets=...)`` and kept
    on the instance, so a later call without arguments reuses it. One chart
    data set is created per label found in the data.
    """
    chart_type = "line"
    responsive = True
    scales = {
        'xAxes': [Axes(type='time', position="bottom")],
    }

    def __init__(self):
        super(TimeChart, self).__init__()
        # List of ``{label: [{"x": ..., "y": ...}, ...]}`` mappings, or None
        # as long as get_datasets() has not been given any data.
        self.data_sets = None

    def get_datasets(self, **kwargs):
        """
        Build one line ``DataSet`` per label of the stored data.

        Any keyword argument value replaces the stored data (the views pass
        ``data_sets=...``); if several are given, the last one wins.

        Returns:
            A list of ``DataSet`` objects in the order of the stored data,
            each with a random border colour. The list is empty when no
            data has been set yet.
        """
        for value in kwargs.values():
            self.data_sets = value

        data_set_objects = []
        # ``or []`` keeps a chart without data from raising a TypeError.
        for series in self.data_sets or []:
            for label_name, data_points in series.items():
                border_color = rgba(randint(0, 255),
                                    randint(0, 255),
                                    randint(0, 255))
                data_set_objects.append(DataSet(type="line",
                                                label=label_name,
                                                borderColor=border_color,
                                                data=data_points,
                                                lineTension=0))
        return data_set_objects
|
||||
|
||||
|
||||
class BarChart(Chart):
    """
    Horizontal bar chart of the N-gram viewer showing counts per speaker.

    ``create_data(data_sets=...)`` splits the data into labels, bar names
    and bar values; ``get_labels()`` and ``get_datasets()`` return them,
    limited to the first ``speaker_range`` entries.
    """
    chart_type = "horizontalBar"
    responsive = True

    def __init__(self, speaker_range=10):
        """
        Args:
            speaker_range: Maximum number of bars to show; anything that
                ``int()`` accepts (the views also pass it as a string).
        """
        super(BarChart, self).__init__()
        self.data_sets = None
        self.speaker_range = int(speaker_range)
        # One list of labels per series until get_labels() flattens them.
        self.lable_names = []
        self.bar_data = []
        self.bar_names = []

    def get_labels(self):
        """
        Return the bar labels as one flat list of at most ``speaker_range``.

        The first call flattens the per-series label lists collected by
        create_data() and stores the result; later calls find the labels
        already flat and return them unchanged.
        """
        if all(isinstance(entry, list) for entry in self.lable_names):
            flat = [label for entry in self.lable_names for label in entry]
            self.lable_names = flat[:self.speaker_range]
        return self.lable_names

    def create_data(self, **kwargs):
        """
        Split the given data into labels, bar names and bar values.

        Any keyword argument value replaces the stored data (the views pass
        ``data_sets=...``). The data is a list of
        ``{name: [{"x": label, "y": value}, ...]}`` mappings; each mapping
        is expected to hold a single series, as the lists built by
        ``NgramSearch.convert_to_data_set()`` do. Results are appended to
        ``lable_names``, ``bar_names`` and ``bar_data``; nothing is
        returned.
        """
        for value in kwargs.values():
            self.data_sets = value

        # ``or []`` keeps a chart without data from raising a TypeError.
        for series in self.data_sets or []:
            for bar_name, data_points in series.items():
                self.lable_names.append(
                    [point["x"] for point in data_points])
                self.bar_names.append(bar_name)
                self.bar_data.append(
                    [point["y"] for point in data_points][:self.speaker_range])

    def get_datasets(self):
        """
        Build one horizontal bar ``DataSet`` per series from create_data().

        Returns:
            A list of ``DataSet`` objects, each with a random background
            colour and at most ``speaker_range`` values.
        """
        data_set_objects = []
        for bar_data, bar_name in zip(self.bar_data, self.bar_names):
            background_color = rgba(randint(0, 255),
                                    randint(0, 255),
                                    randint(0, 255))
            data_set_objects.append(
                DataSet(type="horizontalBar",
                        label=bar_name,
                        backgroundColor=background_color,
                        data=bar_data[:self.speaker_range]))
        return data_set_objects
|
39
app/ngram_viewer/forms.py
Executable file
39
app/ngram_viewer/forms.py
Executable file
@ -0,0 +1,39 @@
|
||||
from django import forms
|
||||
|
||||
|
||||
class NgramForm(forms.Form):
    """
    Describes and configures the input html form for the Ngram Viewer per year.
    """
    # The first item of each choice is the corpus identifier that the search
    # uses as the last part of the model class name.
    CORPUS_CHOICE = [('lm_ns_year', 'Lemmatisiert ohne Stoppwörter'),
                     ('tk_ws_year', 'Nicht lemmatisiert mit Stoppwörter'), ]

    # Comma separated list of n-grams to search for.
    query = forms.CharField(label="Suche Ngramme", max_length=200)
    # Search options; an unchecked box is not submitted, hence not required.
    case_sensitive = forms.BooleanField(label="case-sensitive", required=False)
    search_plus = forms.BooleanField(label="search-plus", required=False)
    ignore_missing = forms.BooleanField(label="fill-zeros", required=False)

    corpus_choice = forms.ChoiceField(label="Wählen Sie einen Corpus",
                                      choices=CORPUS_CHOICE)
|
||||
|
||||
class NgramFormSpeaker(forms.Form):
    """
    Describes and configures the input html form for the Ngram Viewer per speaker.
    """
    # The first item of each choice is the corpus identifier that the search
    # uses as the last part of the model class name.
    CORPUS_CHOICE = [('lm_ns_speaker', 'Lemmatisiert ohne Stoppwörter'),
                     ('tk_ws_speaker', 'Nicht lemmatisiert mit Stoppwörter'), ]

    # A single n-gram; clean_query() rejects comma separated lists.
    query = forms.CharField(label="Suche Ngramm", max_length=200)
    # Search options; an unchecked box is not submitted, hence not required.
    case_sensitive = forms.BooleanField(label="case-sensitive", required=False)
    search_plus = forms.BooleanField(label="search-plus", required=False)
    ignore_missing = forms.BooleanField(label="fill-zeros", required=False)
    # Number of speakers to show in the chart.
    range = forms.IntegerField(label="Anzahl an Rednern")

    corpus_choice = forms.ChoiceField(label="Wählen Sie einen Corpus",
                                      choices=CORPUS_CHOICE)

    def clean_query(self):
        """
        Validate that the query holds exactly one n-gram.

        Returns:
            The unchanged query string.

        Raises:
            forms.ValidationError: If the query contains a comma, i.e. more
                than one n-gram was entered.
        """
        data = self.cleaned_data["query"]
        if "," in data:
            raise forms.ValidationError(
                "Es kann nur ein Ngramm gleichzeitig abgefragt werden.")
        return data
|
107
app/ngram_viewer/management/commands/import_ngrams_bulk.py
Executable file
107
app/ngram_viewer/management/commands/import_ngrams_bulk.py
Executable file
@ -0,0 +1,107 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from ngram_viewer.models import *
|
||||
from itertools import islice
|
||||
from datetime import datetime
|
||||
from tqdm import tqdm
|
||||
import csv
|
||||
import fnmatch
|
||||
import os
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """
    Management command that bulk imports n-gram csv files into the database.

    Every ``*.csv`` file below the input folder is read as a tab separated
    file with the columns n-gram, key and count. The model class that
    receives the rows is derived from the first character of the file name,
    the n-gram size and the corpus type.
    """
    help = ("Adds n-grams to the database using the django models"
            " syntax. N-grams will be added from csv files with three columns."
            " First column is the n-gram string, second column is the key "
            " (e.g. year or speaker) and the third column is the counter."
            " Input is a path pointing to one n-gram file. The user must specify"
            " if the csv is containing 1-grams, 2-grams ... 5-grams with the"
            " parameter 'n_grams'.")

    # Maps the n-gram size to the word used in the model class names.
    N_GRAM_CLASS_NAMES = {1: "One", 2: "Two", 3: "Three", 4: "Four", 5: "Five"}

    def add_arguments(self, parser):
        """Register the positional arguments and the batch size option."""
        parser.add_argument("n_grams",
                            type=int,
                            choices=[1, 2, 3, 4, 5],
                            help="Tells the script to either import given"
                            " input csv as 1-grams 2-grams etc.")
        parser.add_argument("input_folder",
                            type=str,
                            help="File path to the csv containing one kind"
                            " of ngrams.")
        parser.add_argument("corpus_type",
                            choices=["lm_ns_year", "tk_ws_year",
                                     "lm_ns_speaker", "tk_ws_speaker"],
                            help="user has to choose what kind of ngrams will"
                            " be imported. lm_ns: Lemmatized without"
                            " stopwords or tk_ws not lemmatized with"
                            " stopwords.",
                            type=str)
        parser.add_argument(
            "--batch_size",
            "-bs",
            type=int,
            default=1000000,
            required=False,
            help="Int to set how many rows(entries) should be inserted via"
            " bulk at once. Default is 1 million.")

    def handle(self, *args, **options):
        """
        Import all csv files found below ``input_folder``.

        Each file is read twice: once to count its rows for the progress
        report and once to create the model instances, which are inserted
        in batches of ``batch_size`` rows.

        Raises:
            KeyError: If no model class exists for the name derived from
                the file name, n-gram size and corpus type.
        """
        start_time = datetime.now()
        self.stdout.write("Start time of script is: " + str(start_time))
        folder_path = options["input_folder"]
        main_class = self.N_GRAM_CLASS_NAMES[options["n_grams"]]
        corpus_type = options["corpus_type"]
        batch_size = options["batch_size"]

        list_of_files = sorted(
            os.path.join(path, name)
            for path, _subdirs, files in os.walk(folder_path)
            for name in files
            if fnmatch.fnmatch(name, "*.csv")
        )

        for file in tqdm(list_of_files, desc="File status"):
            # First pass only counts the rows. It exhausts the reader, so
            # the file is opened again for the import below.
            with open(file, newline="") as csvfile:
                n_gram_reader = csv.reader(csvfile, delimiter="\t")
                row_count = sum(1 for _row in n_gram_reader)
            # Ceiling division: an exact multiple needs no extra iteration.
            iterations = -(-row_count // batch_size)
            self.stdout.write("Number of rows in csv is: " + str(row_count))
            self.stdout.write("Batch size is " + str(batch_size))
            self.stdout.write((str(iterations)
                               + " iterations are needed to import the"
                               " data into the database."))

            # The first character of the file name selects the model class;
            # files starting with "_" go to the "_Non_ASCII" models.
            sort_key = os.path.basename(file)[0:1]
            if sort_key == "_":
                sort_key = "_Non_ASCII"
            model_name = "Key{}_{}Gram_{}".format(sort_key,
                                                  main_class,
                                                  corpus_type)
            self.stdout.write(model_name)
            # The model classes are in the module namespace through the
            # star import of ngram_viewer.models.
            model = globals()[model_name]

            with open(file, newline="") as csvfile:
                n_gram_reader = csv.reader(csvfile, delimiter="\t")
                while True:
                    batch = [model(ngram=row[0], key=row[1], count=row[2])
                             for row in tqdm(islice(n_gram_reader, batch_size),
                                             desc="Creating batch from row")]
                    if not batch:
                        break
                    self.stdout.write("Starting bulk insert.")
                    model.objects.bulk_create(batch, batch_size)
                    self.stdout.write("---------------------------------------")

        end_time = datetime.now()
        self.stdout.write("End time of script is: " + str(end_time))
        duration = end_time - start_time
        self.stdout.write("Duration of script is: " + str(duration))
|
0
app/ngram_viewer/migrations/__init__.py
Normal file
0
app/ngram_viewer/migrations/__init__.py
Normal file
9639
app/ngram_viewer/models.py
Executable file
9639
app/ngram_viewer/models.py
Executable file
File diff suppressed because it is too large
Load Diff
287
app/ngram_viewer/ngram_search.py
Executable file
287
app/ngram_viewer/ngram_search.py
Executable file
@ -0,0 +1,287 @@
|
||||
from datetime import datetime
|
||||
from ngram_viewer.models import *
|
||||
from speakers.models import Speaker
|
||||
from watson import search as watson
|
||||
from collections import defaultdict, OrderedDict
|
||||
import logging
|
||||
|
||||
|
||||
class NgramSearch(object):
    """
    Handles the search for n-grams per year.

    Inputs are the user query and the search options. The comma separated
    query is split and every part is used as a single query against the
    model class that matches its first character, its number of words and
    the chosen corpus. The resulting QuerySets are summed up per key and
    converted to chart.js style data sets.

    Typical call order: get_sub_querys(), enhanced_search(),
    query_sets_to_data(), convert_to_data_set(); the result is then
    available as ``data_set``.
    """

    # Years that get a zero count when an n-gram has no entry for them
    # (only if ignore_missing is off).
    YEARS = range(1949, 2018)

    # Maps the number of words of a sub-query to the word used in the
    # model class names.
    N_GRAM_CLASS_NAMES = {1: "One", 2: "Two", 3: "Three", 4: "Four", 5: "Five"}

    def __init__(self, clean_data):
        """
        Args:
            clean_data: Mapping with the keys ``query``, ``case_sensitive``,
                ``search_plus``, ``ignore_missing`` and ``corpus_choice``,
                e.g. the cleaned data of NgramForm.
        """
        super(NgramSearch, self).__init__()

        self.cs_query = clean_data["query"]
        self.case_sensitive = clean_data["case_sensitive"]
        self.search_plus = clean_data["search_plus"]
        self.ignore_missing = clean_data["ignore_missing"]
        self.corpus_choice = clean_data["corpus_choice"]
        self.sub_querys_dict = defaultdict(list)
        self.filtered_sets_dict = defaultdict(list)
        self.raw_data = []
        self.data_set = []

    def get_time_from_year_str(self, query_data, date_format="%Y"):
        """
        Replace the "x" value of every data point with an ISO like timestamp.

        Every "x" string is parsed with ``date_format`` and written back in
        the form ``%Y-%m-%dT%H:%M:%S``. The data is changed in place and
        also returned. Not needed for now.

        Raises:
            ValueError: If an "x" value does not match ``date_format``.
        """
        for ngram_dict in query_data:
            for data_series in ngram_dict.values():
                for value_pair in data_series:
                    valid_time = datetime.strptime(value_pair["x"],
                                                   date_format)
                    value_pair["x"] = valid_time.strftime("%Y-%m-%dT%H:%M:%S")
        return query_data

    def get_sub_querys(self):
        """
        Split the comma separated query and group the parts by model class.

        Surrounding whitespace is removed from every part and empty parts
        (e.g. from a leading, trailing or doubled comma) are dropped. Parts
        with more than five words are collected under the key "invalid".
        The result is stored in ``sub_querys_dict``.

        Raises:
            KeyError: If no model class exists for a derived class name.
        """
        logger = logging.getLogger(__name__)
        sub_querys = [part.strip() for part in self.cs_query.split(",")]
        sub_querys = [part for part in sub_querys if part]
        logger.info(sub_querys)

        sub_querys_dict = defaultdict(list)
        for sub_query in sub_querys:
            # Everything that does not start with an ASCII letter or digit
            # (umlauts, special characters like "§$%&") is kept in the
            # "_Non_ASCII" models. The length check catches characters
            # whose upper case form has several characters ("ß" -> "SS").
            sort_key = sub_query[0].upper()
            if (len(sort_key) != 1
                    or not (sort_key.isascii() and sort_key.isalnum())):
                sort_key = "_Non_ASCII"

            main_class = self.N_GRAM_CLASS_NAMES.get(len(sub_query.split()))
            if main_class is None:
                sub_querys_dict["invalid"].append(sub_query)
                continue

            model_name = "Key{}_{}Gram_{}".format(sort_key,
                                                  main_class,
                                                  self.corpus_choice)
            # The model classes are in the module namespace through the
            # star import of ngram_viewer.models.
            model = globals()[model_name]
            sub_querys_dict[model].append(sub_query)
        self.sub_querys_dict = sub_querys_dict

    def enhanced_search(self):
        """
        Create one QuerySet per sub-query and store them with their query.

        Without ``search_plus`` the n-gram has to match the sub-query as a
        whole. With ``search_plus`` the sub-query is used as a regular
        expression, unless it ends with "__": then the marker is removed
        and the rest has to match as a whole. ``case_sensitive`` selects
        the case sensitive variant of the lookup in all cases.

        The result is stored in ``filtered_sets_dict`` as
        ``{model: [(QuerySet, sub_query), ...]}``.
        """
        if self.case_sensitive:
            exact_lookup, regex_lookup = "ngram__exact", "ngram__regex"
        else:
            exact_lookup, regex_lookup = "ngram__iexact", "ngram__iregex"

        filtered_sets_dict = defaultdict(list)
        for model, sub_querys in self.sub_querys_dict.items():
            if model == "invalid":
                continue
            for sub_query in sub_querys:
                if not self.search_plus:
                    # Broad case-insensitive containment filter first, then
                    # narrowed down to the exact match.
                    query_set = (model.objects
                                 .filter(ngram__icontains=sub_query)
                                 .filter(**{exact_lookup: sub_query}))
                elif sub_query.endswith("__"):
                    query_set = model.objects.filter(
                        **{exact_lookup: sub_query[:-2]})
                else:
                    # NOTE(review): the pattern is the user input as-is; an
                    # invalid or expensive expression reaches the database.
                    query_set = model.objects.filter(
                        **{regex_lookup: sub_query})
                filtered_sets_dict[model].append((query_set, sub_query))

        self.filtered_sets_dict = filtered_sets_dict

    def query_sets_to_data(self):
        """
        Convert the QuerySets to ``{sub_query: {key: count}}`` dictionaries.

        Counts of entries with the same key are summed up, which merges
        e.g. upper and lower case variants of an n-gram. If
        ``ignore_missing`` is off, every year of ``YEARS`` without an entry
        gets a zero count. The counts are sorted by key. The result is
        stored in ``raw_data``.
        """
        data = []
        for query_sets in self.filtered_sets_dict.values():
            for query_set, sub_query in query_sets:
                counts = {}
                for ngram in query_set:
                    counts[ngram.key] = counts.get(ngram.key, 0) + ngram.count
                if not self.ignore_missing:
                    for year in self.YEARS:
                        counts.setdefault(str(year), 0)
                data.append({sub_query: dict(sorted(counts.items()))})
        self.raw_data = data

    def convert_to_data_set(self):
        """
        Convert ``raw_data`` into chart.js style data sets.

        Every ``{sub_query: {key: count}}`` entry becomes
        ``{sub_query: [{"y": count, "x": key}, ...]}``. A sub-query without
        any counts results in an empty mapping. The result is stored in
        ``data_set``.
        """
        data_set = []
        for data_line in self.raw_data:
            data_set_line = defaultdict(list)
            for label, values in data_line.items():
                for key, count in values.items():
                    data_set_line[label].append({"y": count, "x": key})
            data_set.append(data_set_line)
        self.data_set = data_set
|
||||
|
||||
|
||||
class NgramSearchSpeaker(NgramSearch):
    """
    Handles the search for n-grams per speaker. Inherits from NgramSearch.

    Only the first n-gram of the user query is searched. The counts are
    summed up per speaker and sorted with the highest count first;
    get_speaker_name() replaces the speaker IDs with readable labels.
    """

    def __init__(self, clean_data):
        """
        Args:
            clean_data: Mapping with the keys ``query``, ``case_sensitive``,
                ``search_plus``, ``ignore_missing`` and ``corpus_choice``,
                e.g. the cleaned data of NgramFormSpeaker.
        """
        super(NgramSearchSpeaker, self).__init__(clean_data)
        # Only one n-gram is searched: keep the part before the first comma.
        self.cs_query = clean_data["query"].split(",")[0]

    def get_speaker_name(self, query_data):
        """
        Replace the speaker ID of every data point with a readable label.

        The "x" value of every data point is looked up as primary key of
        Speaker and replaced with "<id>: <first name> <last name> (<party>)".
        The ID "None" becomes a fixed "not identified" text. An ID without
        a Speaker entry is left unchanged. The data is changed in place
        and also returned.
        """
        for ngram_dict in query_data:
            for data_series in ngram_dict.values():
                for value_pair in data_series:
                    speaker_id = value_pair["x"]
                    if speaker_id == "None":
                        value_pair["x"] = "Redner nicht identifiziert."
                        continue
                    try:
                        speaker_details = Speaker.objects.get(pk=speaker_id)
                    except Speaker.DoesNotExist:
                        # Keep the bare ID instead of failing the request.
                        continue
                    value_pair["x"] = "{}: {} {} ({})".format(
                        speaker_id,
                        speaker_details.first_name,
                        speaker_details.last_name,
                        speaker_details.party)
        return query_data

    def query_sets_to_data(self):
        """
        Convert the QuerySets to ``{sub_query: {key: count}}`` dictionaries.

        Counts of entries with the same key are summed up, which merges
        e.g. upper and lower case variants of an n-gram. The counts are
        sorted by count, highest first. The result is stored in
        ``raw_data``.
        """
        data = []
        for query_sets in self.filtered_sets_dict.values():
            for query_set, sub_query in query_sets:
                counts = {}
                for ngram in query_set:
                    counts[ngram.key] = counts.get(ngram.key, 0) + ngram.count
                ranked = sorted(counts.items(),
                                key=lambda item: item[1],
                                reverse=True)
                data.append({sub_query: dict(ranked)})
        self.raw_data = data
|
96
app/ngram_viewer/templates/ngram_viewer/ngram_viewer_speaker.html
Executable file
96
app/ngram_viewer/templates/ngram_viewer/ngram_viewer_speaker.html
Executable file
@ -0,0 +1,96 @@
|
||||
{% extends "blog/base.html" %}
|
||||
|
||||
{% block nav-tabs %}
|
||||
<div class="nav-content">
|
||||
<ul class="tabs tabs-transparent">
|
||||
<li class="tab"><a target="_self" href="{% url "ngram-viewer-jahr" %}">Pro Jahr</a></li>
|
||||
<li class="tab"><a target="_self" class="active" href="{% url "ngram-viewer-sprecher" %}">Pro MdB</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
{% endblock nav-tabs %}
|
||||
|
||||
{% block content %}
|
||||
<div class="row">
|
||||
<div class="col s12 m12 l4">
|
||||
<div class="card">
|
||||
<div class="card-content">
|
||||
<span class="card-title center-align">Suchoptionen</span>
|
||||
<div class="row">
|
||||
<form method="GET" class="col s12">
|
||||
{% csrf_token %}
|
||||
{% if errors %}
|
||||
|
||||
<p class="red-text text-darken-2">Es kann nur jeweils ein Ngramm gesucht werden.</p>
|
||||
|
||||
{% endif %}
|
||||
<div class="input-field col s12">
|
||||
<i class="material-icons prefix">search</i>
|
||||
<input id="id_query" type="text" name="{{form.query.html_name}}" class="autocomplete materialize-textarea validate" {% if form.query.value != None %}value = "{{form.query.value}}" {% else %}value = "Ausländer" {% endif %}>
|
||||
<label for="id_query">{{form.query.label}}</label>
|
||||
<button class="btn waves-effect waves-light right light-green darken-3" type="submit" name="ngram-search">Suche
|
||||
<i class="material-icons right">send</i>
|
||||
</button>
|
||||
</div>
|
||||
<br />
|
||||
<br />
|
||||
Corpus:{{form.corpus_choice}}
|
||||
<div class="section">
|
||||
<div class="switch section ">
|
||||
<span>Case-sensitive Suche:</span>
|
||||
<div style="float: right;">
|
||||
Aus
|
||||
<label>
|
||||
<input type="checkbox" name="{{form.case_sensitive.html_name}}" class="filled-in" {% if form.case_sensitive.value == True %}checked = "checked" {% endif %} />
|
||||
<span class="lever"></span>
|
||||
</label>
|
||||
Ein
|
||||
</div>
|
||||
</div>
|
||||
<div class="divider"></div>
|
||||
<div class="switch section">
|
||||
<span>Erweiterter Suchsyntax: <a class="tooltipped" data-position="bottom" data-tooltip="Ist diese Option aktiviert, kann die PostgreSQL interne regex-Syntax für die einzelnen Suchanfragen verwendet werden. Allerdings kann diese nur an Wortenden ('Asyl\w*') verwendet werden. Wörter können am Wortende mit '__' ('Krieg__') quasi escaped werden, so dass diese nicht als regulärer Ausdruck interpretiert werden."><i
|
||||
class="material-icons tiny blue-grey-text darken-4">info_outline</i></a></span>
|
||||
<div style="float: right;">
|
||||
Aus
|
||||
<label>
|
||||
<input type="checkbox" name="{{form.search_plus.html_name}}" class="filled-in" {% if form.search_plus.value == True %}checked = "checked" {% endif %} />
|
||||
<span class="lever"></span>
|
||||
</label>
|
||||
Ein
|
||||
</div>
|
||||
</div>
|
||||
<div class="divider"></div>
|
||||
<div class="section">
|
||||
<div class="input-field col s12">
|
||||
<i class="material-icons prefix">filter_9_plus</i>
|
||||
<input id="id_range" type="text" name="{{form.range.html_name}}" class="autocomplete materialize-textarea validate" {% if form.range.value != None %}value = "{{form.range.value}}" {% else %}value = "10" {% endif %}>
|
||||
<label for="id_range">{{form.range.label}}</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="divider"></div>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<ul class="collapsible">
|
||||
<li>
|
||||
<div class="collapsible-header"><i class="material-icons blue-grey-text darken-4">info_outline</i>Hilfe und Hinweise</div>
|
||||
<div class="collapsible-body white">
|
||||
<h6>Muster der Suchanfrage</h6>
|
||||
<p></p>
|
||||
<h6>Suchgeschwindigkeit</h6>
|
||||
<p></p>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="col s12 m12 l8">
|
||||
<div class="card">
|
||||
<div class="card-content">
|
||||
<span class="card-title">Graph</span>
|
||||
{{ bar_chart.as_html }}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endblock content %}
|
96
app/ngram_viewer/templates/ngram_viewer/ngram_viewer_year.html
Executable file
96
app/ngram_viewer/templates/ngram_viewer/ngram_viewer_year.html
Executable file
@ -0,0 +1,96 @@
|
||||
{% extends "blog/base.html" %}
|
||||
|
||||
{% block nav-tabs %}
|
||||
<div class="nav-content">
|
||||
<ul class="tabs tabs-transparent">
|
||||
<li class="tab"><a target="_self" class="active" href="{% url "ngram-viewer-jahr" %}">Pro Jahr</a></li>
|
||||
<li class="tab"><a target="_self" href="{% url "ngram-viewer-sprecher" %}">Pro MdB</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
{% endblock nav-tabs %}
|
||||
|
||||
{% block content %}
|
||||
<div class="row">
|
||||
<div class="col s12 m12 l4">
|
||||
<div class="card">
|
||||
<div class="card-content">
|
||||
<span class="card-title center-align">Suchoptionen</span>
|
||||
<div class="row">
|
||||
<form method="GET" class="col s12">
|
||||
{% csrf_token %}
|
||||
<div class="input-field col s12">
|
||||
<i class="material-icons prefix">search</i>
|
||||
<input id="id_query" type="text" name="{{form.query.html_name}}" class="autocomplete materialize-textarea validate" {% if form.query.value != None %}value = "{{form.query.value}}" {% else %}value = "Kroatien, Krieg, Asyl" {% endif %}>
|
||||
<label for="id_query">{{form.query.label}}</label>
|
||||
<button class="btn waves-effect waves-light right light-green darken-3" type="submit" name="ngram-search">Suche
|
||||
<i class="material-icons right">send</i>
|
||||
</button>
|
||||
</div>
|
||||
<br />
|
||||
<br />
|
||||
Corpus:{{form.corpus_choice}}
|
||||
<div class="section">
|
||||
<div class="switch section ">
|
||||
<span>Case-sensitive Suche:</span>
|
||||
<div style="float: right;">
|
||||
Aus
|
||||
<label>
|
||||
<input type="checkbox" name="{{form.case_sensitive.html_name}}" class="filled-in" {% if form.case_sensitive.value == True %}checked = "checked" {% endif %} />
|
||||
<span class="lever"></span>
|
||||
</label>
|
||||
Ein
|
||||
</div>
|
||||
</div>
|
||||
<div class="divider"></div>
|
||||
<div class="switch section">
|
||||
<span>Erweiterter Suchsyntax: <a class="tooltipped" data-position="bottom" data-tooltip="Ist diese Option aktiviert, kann die PostgreSQL interne regex-Syntax für die einzelnen Suchanfragen verwendet werden. Allerdings kann diese nur an Wortenden ('Asyl\w*') verwendet werden. Wörter können am Wortende mit '__' ('Krieg__') quasi escaped werden, so dass diese nicht als regulärer Ausdruck interpretiert werden."><i
|
||||
class="material-icons tiny blue-grey-text darken-4">info_outline</i></a></span>
|
||||
<div style="float: right;">
|
||||
Aus
|
||||
<label>
|
||||
<input type="checkbox" name="{{form.search_plus.html_name}}" class="filled-in" {% if form.search_plus.value == True %}checked = "checked" {% endif %} />
|
||||
<span class="lever"></span>
|
||||
</label>
|
||||
Ein
|
||||
</div>
|
||||
</div>
|
||||
<div class="divider"></div>
|
||||
<div class="switch section">
|
||||
<span>Fehlende Daten ignorieren: <a class="tooltipped" data-position="bottom" data-tooltip="Ist diese Option aus, werden Jahre, die das gesuchte Ngramm nicht enthalten mit Nullwerten auf gefüllt. Wird diese Option aktiviert, werden Jahre, die das gesuchte Ngramm nicht enthalten ignoriert."><i
|
||||
class="material-icons tiny blue-grey-text darken-4">info_outline</i></a></span>
|
||||
<div style="float: right;">
|
||||
Aus
|
||||
<label>
|
||||
<input type="checkbox" name="{{form.ignore_missing.html_name}}" class="filled-in" {% if form.ignore_missing.value == True %}checked="checked" {% endif %} />
|
||||
<span class="lever"></span>
|
||||
</label>
|
||||
Ein
|
||||
</div>
|
||||
</div>
|
||||
<div class="divider"></div>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<ul class="collapsible">
|
||||
<li>
|
||||
<div class="collapsible-header"><i class="material-icons blue-grey-text darken-4">info_outline</i>Hilfe und Hinweise</div>
|
||||
<div class="collapsible-body white">
|
||||
<h6>Muster der Suchanfrage</h6>
|
||||
<p></p>
|
||||
<h6>Suchgeschwindigkeit</h6>
|
||||
<p></p>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="col s12 m12 l8">
|
||||
<div class="card">
|
||||
<div class="card-content">
|
||||
<span class="card-title">Graph</span>
|
||||
{{ line_chart.as_html}}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endblock content %}
|
3
app/ngram_viewer/tests.py
Executable file
3
app/ngram_viewer/tests.py
Executable file
@ -0,0 +1,3 @@
|
||||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
7
app/ngram_viewer/urls.py
Executable file
7
app/ngram_viewer/urls.py
Executable file
@ -0,0 +1,7 @@
|
||||
from django.urls import path
|
||||
from . import views
|
||||
|
||||
# Routes of the n-gram viewer: one page per year and one per speaker (MdB).
urlpatterns = [
    path(
        "pro-jahr/",
        views.ngram_viewer_year,
        name="ngram-viewer-jahr",
    ),
    path(
        "pro-mdb/",
        views.ngram_viewer_speaker,
        name="ngram-viewer-sprecher",
    ),
]
|
94
app/ngram_viewer/views.py
Executable file
94
app/ngram_viewer/views.py
Executable file
@ -0,0 +1,94 @@
|
||||
from django.shortcuts import render
|
||||
from .charts import TimeChart, BarChart
|
||||
from .forms import NgramForm, NgramFormSpeaker
|
||||
from .ngram_search import NgramSearch, NgramSearchSpeaker
|
||||
# import logging
|
||||
|
||||
|
||||
def ngram_viewer_year(request):
    """
    Render the n-gram viewer per year.

    A valid search form in the query string is searched as submitted. In
    all other cases an empty form is shown together with the chart of a
    default query. Requests other than GET/HEAD are answered with 405.
    """
    if request.method not in ("GET", "HEAD"):
        # Imported here to keep this view self-contained.
        from django.http import HttpResponseNotAllowed
        return HttpResponseNotAllowed(["GET", "HEAD"])

    form = NgramForm(request.GET)
    if form.is_valid():
        clean_data = form.cleaned_data
        title = "Ngram Viewer für: " + clean_data["query"]
    else:
        # No or invalid input: show an empty form and a default search.
        form = NgramForm()
        clean_data = {'query': 'Asyl, Kroatien, Krieg',
                      'case_sensitive': False,
                      'search_plus': False,
                      'ignore_missing': False,
                      'corpus_choice': 'lm_ns_year'}
        title = "Ngram Viewer pro Jahr für: " + clean_data["query"]

    search = NgramSearch(clean_data)
    search.get_sub_querys()
    search.enhanced_search()
    search.query_sets_to_data()
    search.convert_to_data_set()

    line_chart = TimeChart()
    # Hands the data to the chart; the returned data sets are not needed
    # here.
    line_chart.get_datasets(data_sets=search.data_set)

    context = {"title": title, "form": form, "line_chart": line_chart}
    return render(request,
                  "ngram_viewer/ngram_viewer_year.html",
                  context)
|
||||
|
||||
|
||||
def ngram_viewer_speaker(request):
    """
    Render the n-gram viewer per speaker.

    A valid search form in the query string is searched as submitted. In
    all other cases an empty form is shown together with the chart of a
    default query; the errors of a submitted invalid form are passed to
    the template as ``errors``. Requests other than GET/HEAD are answered
    with 405.
    """
    if request.method not in ("GET", "HEAD"):
        # Imported here to keep this view self-contained.
        from django.http import HttpResponseNotAllowed
        return HttpResponseNotAllowed(["GET", "HEAD"])

    # Bind the form only if parameters were sent. Otherwise the first visit
    # without a query string would count as an invalid submission and show
    # the error message.
    form = NgramFormSpeaker(request.GET or None)
    if form.is_valid():
        clean_data = form.cleaned_data
        context = {"title": "Ngram Viewer für: " + clean_data["query"],
                   "form": form}
    else:
        errors = form.errors
        # No or invalid input: show an empty form and a default search.
        form = NgramFormSpeaker()
        clean_data = {'query': 'Ausländer',
                      'case_sensitive': False,
                      'search_plus': False,
                      'ignore_missing': False,
                      'corpus_choice': 'lm_ns_speaker',
                      'range': '10'}
        context = {"title": "Ngram Viewer pro MdB für: " + clean_data["query"],
                   "form": form, "errors": errors}

    search = NgramSearchSpeaker(clean_data)
    search.get_sub_querys()
    search.enhanced_search()
    search.query_sets_to_data()
    search.convert_to_data_set()
    speaker_data = search.get_speaker_name(search.data_set)

    bar_chart = BarChart(clean_data["range"])
    bar_chart.create_data(data_sets=speaker_data)
    # Flattens the collected labels and limits them to the chosen range.
    bar_chart.get_labels()

    context["bar_chart"] = bar_chart
    return render(request,
                  "ngram_viewer/ngram_viewer_speaker.html",
                  context)
|
Reference in New Issue
Block a user