Initial commit

This commit is contained in:
Stephan Porada
2019-02-28 14:09:53 +01:00
commit 96e84d083d
97 changed files with 66293 additions and 0 deletions

View File

@ -0,0 +1,107 @@
from django.core.management.base import BaseCommand
from ngram_viewer.models import *
from itertools import islice
from datetime import datetime
from tqdm import tqdm
import csv
import fnmatch
import os
class Command(BaseCommand):
    """Bulk-import n-gram CSV files into the database.

    Recursively walks ``input_folder`` for ``*.csv`` files.  Each file must
    contain tab-separated rows of ``(ngram, key, count)``.  Rows are inserted
    with ``bulk_create`` in batches of ``--batch_size`` into the model whose
    name is built from the file's sort key (first character of the file name),
    the n-gram order (``n_grams``) and the ``corpus_type``.
    """

    help = ("Adds n-grams to the database using the django models"
            " syntax. N-grams will be added from csv files with three columns."
            " First column is the n-gram string, second column is the key "
            " (e.g. year or speaker) and the third column is the counter."
            " Input is a path pointing to one n-gram file. The user must specify"
            " if the csv is containing 1-grams, 2-grams ... 5-grams with the"
            " parameter 'n_grams'.")

    # Maps the validated ``n_grams`` argument to the model-name fragment.
    # Replaces the original if/elif chain, which silently left ``main_class``
    # unbound for any value outside 1-5; a dict lookup raises KeyError loudly
    # instead (argparse ``choices`` already restricts the input to 1-5).
    N_GRAM_CLASS_NAMES = {1: "One", 2: "Two", 3: "Three", 4: "Four", 5: "Five"}

    def add_arguments(self, parser):
        """Register the command's positional and optional arguments."""
        parser.add_argument("n_grams",
                            type=int,
                            choices=[1, 2, 3, 4, 5],
                            help="Tells the script to either import given input\
                            csv as 1-grams 2-grams etc.")
        parser.add_argument("input_folder",
                            type=str,
                            help="File path to the csv containing one kind of \
                            ngrams.")
        parser.add_argument("corpus_type",
                            choices=["lm_ns_year", "tk_ws_year", "lm_ns_speaker",
                                     "tk_ws_speaker"],
                            help="user has to choose what kind of ngrams will \
                            be imported. lm_ns: Lemmatized without stopwords or\
                            tk_ws not lemmatized with stopwords.",
                            type=str)
        parser.add_argument(
            "--batch_size",
            "-bs",
            type=int,
            default=1000000,
            required=False,
            help="Int to set how many rows(entries) should be \
                  inserted via bulk at once. Default is 1 million.")

    def handle(self, *args, **options):
        """Import every ``*.csv`` under ``input_folder`` into its model.

        For each file the row count is determined first (one full pass), then
        the file is reopened and consumed in ``batch_size`` chunks that are
        handed to ``bulk_create``.
        """
        start_time = datetime.now()
        self.stdout.write("Start time of script is: " + str(start_time))
        folder_path = options["input_folder"]
        n_grams = options["n_grams"]
        corpus_type = options["corpus_type"]
        batch_size = options["batch_size"]
        # Hoisted out of the file loop: the mapping is loop-invariant.
        main_class = self.N_GRAM_CLASS_NAMES[n_grams]
        list_of_files = []
        for path, _subdirs, files in os.walk(folder_path):
            for name in files:
                if fnmatch.fnmatch(name, "*.csv"):
                    list_of_files.append(os.path.join(path, name))
        list_of_files = sorted(list_of_files)
        for file in tqdm(list_of_files, desc="File status"):
            # NOTE(review): files are opened with the locale default encoding;
            # given the "_Non_ASCII" sort key below the data likely contains
            # non-ASCII text — confirm the CSVs are UTF-8 and pass
            # encoding="utf-8" explicitly if so.
            with open(file, newline="") as csvfile:
                n_gram_reader = csv.reader(csvfile, delimiter="\t")
                row_count = sum(1 for _ in n_gram_reader)  # closes csvfile
            # Ceiling division. The original ``int(row_count/batch_size) + 1``
            # reported one iteration too many whenever row_count was an exact
            # multiple of batch_size.
            iterations = -(-row_count // batch_size)
            self.stdout.write("Number of rows in csv is: " + str(row_count))
            self.stdout.write("Batch size is " + str(batch_size))
            self.stdout.write((str(iterations)
                               + " iterations are needed to import the"
                               " data into the database."))
            with open(file, newline="") as csvfile:  # reopens csvfile
                # Sort key is the first character of the file name; files for
                # non-ASCII-initial n-grams are named with a leading "_".
                sort_key = os.path.basename(file)[0:1]
                if(sort_key == "_"):
                    sort_key = "_Non_ASCII"
                n_gram_reader = csv.reader(csvfile, delimiter="\t")
                model = "Key{}_{}Gram_{}".format(sort_key, main_class, corpus_type)
                # Was a bare print(); use self.stdout so Django can capture
                # and redirect command output consistently.
                self.stdout.write(model)
                while True:
                    # Model classes come from ``from ngram_viewer.models
                    # import *`` above, hence the globals() lookup by name.
                    batch = [globals()[model](ngram=row[0],
                                              key=row[1],
                                              count=row[2])
                             for row in tqdm(islice(n_gram_reader, batch_size),
                                             desc="Creating batch from row")]
                    if not batch:
                        break
                    self.stdout.write("Starting bulk insert.")
                    globals()[model].objects.bulk_create(batch, batch_size)
                    self.stdout.write("---------------------------------------")
        end_time = datetime.now()
        self.stdout.write("End time of script is: " + str(end_time))
        duration = end_time - start_time
        self.stdout.write("Duration of script is: " + str(duration))