Added some documentation.

This commit is contained in:
Stephan Porada
2019-03-01 20:55:41 +01:00
parent 96e84d083d
commit 27aa61d91a
37 changed files with 277 additions and 115 deletions

View File

@ -20,6 +20,10 @@ class TimeChart(Chart):
self.data_sets = None
def get_datasets(self, **kwargs):
"""
Takes n number of data sets as an input and creates one data-line per
data set.
"""
if kwargs is not None:
for key, value in kwargs.items():
self.data_sets = value
@ -42,8 +46,6 @@ class TimeChart(Chart):
class BarChart(Chart):
"""
Class to configure the Ngram Viewer bar chart per speaker.
The class function get_datasets() is used to get the data sets and creates
one data set for each.
"""
chart_type = "horizontalBar"
responsive = True
@ -57,6 +59,9 @@ class BarChart(Chart):
self.bar_names = []
def get_labels(self):
"""
Creates labels for the bar chart entries.
"""
try:
tmp_list = self.lable_names
self.lable_names = sum(tmp_list, [])[:self.speaker_range]
@ -65,6 +70,10 @@ class BarChart(Chart):
return self.lable_names
def create_data(self, **kwargs):
"""
Takes n number of data sets but only one is passed because the
Ngram Viewer per speaker is capped at one query at a time.
"""
if kwargs is not None:
for key, value in kwargs.items():
self.data_sets = value
@ -83,6 +92,10 @@ class BarChart(Chart):
self.bar_data.append(entry_bar_data[:self.speaker_range])
def get_datasets(self):
"""
Takes the data sets from self.bar_data plus self.bar_names and creates
one bar per speaker from this.
"""
data_set_objects = []
for bar_data, bar_name in zip(self.bar_data, self.bar_names):
data_set_objects.append(DataSet(type="horizontalBar",

View File

@ -6,7 +6,7 @@ class NgramForm(forms.Form):
Describes and configures the input html form for the Ngram Viewer per year.
"""
CORPUS_CHOICE = [('lm_ns_year', 'Lemmatisiert ohne Stoppwörter'),
('tk_ws_year', 'Nicht lemmatisiert mit Stoppwörter'),]
('tk_ws_year', 'Nicht lemmatisiert mit Stoppwörtern'),]
query = forms.CharField(label="Suche Ngramme", max_length="200")
case_sensitive = forms.BooleanField(label="case-sensitive", required=False)
search_plus = forms.BooleanField(label="search-plus", required=False)
@ -19,7 +19,7 @@ class NgramFormSpeaker(forms.Form):
Describes and configures the input html form for the Ngram Viewer per speaker.
"""
CORPUS_CHOICE = [('lm_ns_speaker', 'Lemmatisiert ohne Stoppwörter'),
('tk_ws_speaker', 'Nicht lemmatisiert mit Stoppwörter'),]
('tk_ws_speaker', 'Nicht lemmatisiert mit Stoppwörtern'),]
query = forms.CharField(label="Suche Ngramm", max_length="200")
case_sensitive = forms.BooleanField(label="case-sensitive", required=False)
search_plus = forms.BooleanField(label="search-plus", required=False)

View File

@ -13,26 +13,29 @@ class Command(BaseCommand):
" syntax. N-grams will be added from csv files with three columns."
" First column is the n-gram string, second column is the key "
" (e.g. year or speaker) and the third column is the counter."
" Input is a path pointing to one n-gram file. The user must specify"
" if the csv is containing 1-grams, 2-grams ... 5-grams with the"
" parameter 'n_grams'.")
" Input (input_path) is a path pointing to one folder containing all"
" 37 alphabetical sorted n-gram csv-files for one kind of n-gram."
" Thus the user must specify with the parameter n_grams if the"
" csv-files in the folder are 1-grams, 2-grams etc."
" parameter 'n_grams'. The user also need to specifiy the corpus_type.")
def add_arguments(self, parser):
parser.add_argument("n_grams",
type=int,
choices=[1, 2, 3, 4, 5],
help="Tells the script to either import given input\
csv as 1-grams 2-grams etc.")
csv-files as 1-grams 2-grams etc.")
parser.add_argument("input_folder",
type=str,
help="File path to the csv containing one kind of \
ngrams.")
help="File path to the csv-files containing one \
kind of ngrams.")
parser.add_argument("corpus_type",
choices=["lm_ns_year", "tk_ws_year", "lm_ns_speaker",
"tk_ws_speaker"],
help="user has to choose what kind of ngrams will \
be imported. lm_ns: Lemmatized without stopwords or\
tk_ws not lemmatized with stopwords.",
help="User has to choose what kind of ngrams will \
be imported. lm_ns_year: Lemmatized without \
stopwords per year, tk_ws_year: not lemmatized \
with stopwords per year etc.",
type=str)
parser.add_argument(
"--batch_size",
@ -41,7 +44,8 @@ class Command(BaseCommand):
default=1000000,
required=False,
help="Int to set how many rows(entries) should be \
inserted via bulk at once. Default is 1 million.")
inserted via bulk at once. Default is 1 million. \
Optional parameter.")
def handle(self, *args, **options):
start_time = datetime.now()

View File

@ -6,16 +6,16 @@ automatically generated with the utils/create_ngram_models.py script. One model
holds one kind of ngram. The name of the model follows a pattern describing the
specific kind of ngram.
For example: KeyA_TwoGram_lm_ns_year --> This model will create a table
contianing all lemmatized (lm) 2-grams without stopwords (ns) per year starting with the
letter "A" or "a".
containing all lemmatized (lm) 2-grams without stopwords (ns) per year starting
with the letter "A" or "a".
For example: Key_Non_ASCII_ThreeGram_tk_ws_speaker --> This model will create a
table containing all tokenized (tk) 3-grams with stopwords (ws) per speaker
starting with any non ASCII letter like ü, ö, ä or é.
The idea behind these splits and a single table for every kind of ngram is to
minimize search times for the user. It would have been possible to create a table
for every 1-gram, 2-gram etc. But these would have benn pretty long (millions of)
rows.
for every 1-gram, 2-gram etc. But these would have been pretty long (hundreds of
millions of) rows.
"""

View File

@ -10,9 +10,8 @@ class NgramSearch(object):
"""
Class that handles the search for ngrams per year. Inputs are the user query
and search options. User query will be split and every part will be used
as a single query. Every singel query returns a QuerySet which will be
searched again with a regex to either match full words or partial words.
New regex evaluated QuerySets will be returned. Data from those will be
as a single query.
Every single query returns a QuerySet. Data from those will be
retrieved and converted to valid chart.js data sets. Besides the query the
user can pass some search options to the class like case sensitive and case
insensitive. This class handles search per year which is kind of the default.
@ -163,8 +162,8 @@ class NgramSearch(object):
def query_sets_to_data(self):
"""
Converts QuerySets to data dictionaries. Fills missing years with zero
value counts for ngrams. Also sums upper and lower case n-grams to one ngram
with one count.
value counts for ngrams. Also sums upper and lower case n-grams to one
ngram with one count.
"""
data = []
for key, query_sets in self.filtered_sets_dict.items():
@ -216,11 +215,9 @@ class NgramSearch(object):
class NgramSearchSpeaker(NgramSearch):
"""
Class that handles the search for ngrams per speaker. Inputs are the user
query and search options. User query will be splitted and every split will
be used as a single query. Every singel query returns a QuerySet which will
be searched again with a regex to either match full words or partial words.
New regex evaluated QuerySets will be returned. Data from those will be
retrived and converted to valid chart.js data sets. Besides the query the
query and search options. User query can only contain one n-gram.
The query returns a QuerySet. Data from this will be
retrieved and converted to a valid chart.js data set. Besides the query the
user can pass some search options to the class like case sensitive and case
insensitive. Inherits from NgramSearch.
"""
@ -261,8 +258,8 @@ class NgramSearchSpeaker(NgramSearch):
def query_sets_to_data(self):
"""
Converts QuerySets to data dictionaries. Fills missing years with zero
value counts for ngrams. Also sums upper and lower case n-grams to one ngram
Converts QuerySets to data dictionaries.
Also sums upper and lower case n-grams to one ngram
with one count.
"""
data = []

View File

@ -1,5 +1,8 @@
{% extends "blog/base.html" %}
<!-- This template creates the ngram viewer page for the user according to the
query. This creates the ngram viewer per speaker. -->
{% block nav-tabs %}
<div class="nav-content">
<ul class="tabs tabs-transparent">
@ -33,7 +36,7 @@
</div>
<br />
<br />
Corpus:{{form.corpus_choice}}
Korpus:{{form.corpus_choice}}
<div class="section">
<div class="switch section ">
<span>Case-sensitive Suche:</span>

View File

@ -1,5 +1,8 @@
{% extends "blog/base.html" %}
<!-- This template creates the ngram viewer page for the user according to the
query. This creates the ngram viewer per year. -->
{% block nav-tabs %}
<div class="nav-content">
<ul class="tabs tabs-transparent">
@ -28,7 +31,7 @@
</div>
<br />
<br />
Corpus:{{form.corpus_choice}}
Korpus:{{form.corpus_choice}}
<div class="section">
<div class="switch section ">
<span>Case-sensitive Suche:</span>

View File

@ -1,6 +1,10 @@
from django.urls import path
from . import views
"""
URL paths for all ngram_viewer views.
"""
urlpatterns = [
path("pro-jahr/", views.ngram_viewer_year, name="ngram-viewer-jahr"),
path("pro-mdb/", views.ngram_viewer_speaker, name="ngram-viewer-sprecher")

View File

@ -6,6 +6,9 @@ from .ngram_search import NgramSearch, NgramSearchSpeaker
def ngram_viewer_year(request):
"""
This view creates the Ngram Viewer page per year.
"""
# logger = logging.getLogger(__name__)
if(request.method == "GET"):
form = NgramForm(request.GET)
@ -49,6 +52,9 @@ def ngram_viewer_year(request):
def ngram_viewer_speaker(request):
"""
This view creates the Ngram Viewer page per speaker.
"""
if(request.method == "GET"):
form = NgramFormSpeaker(request.GET)
if(form.is_valid()):