Initial commit

This commit is contained in:
Stephan Porada
2019-02-28 14:09:53 +01:00
commit 96e84d083d
97 changed files with 66293 additions and 0 deletions

View File

@ -0,0 +1,107 @@
from django.core.management.base import BaseCommand
from ngram_viewer.models import *
from itertools import islice
from datetime import datetime
from tqdm import tqdm
import csv
import fnmatch
import os
class Command(BaseCommand):
    """Bulk-import n-gram CSV files into the database.

    Recursively walks ``input_folder`` for ``*.csv`` files.  Each file must
    contain tab-separated rows of ``(ngram, key, count)``.  Rows are inserted
    with ``bulk_create`` in batches of ``--batch_size`` into the model whose
    name is built from the file's sort key (first character of the file name),
    the n-gram order (``n_grams``) and the ``corpus_type``.
    """

    help = ("Adds n-grams to the database using the django models"
            " syntax. N-grams will be added from csv files with three columns."
            " First column is the n-gram string, second column is the key "
            " (e.g. year or speaker) and the third column is the counter."
            " Input is a path pointing to one n-gram file. The user must specify"
            " if the csv is containing 1-grams, 2-grams ... 5-grams with the"
            " parameter 'n_grams'.")

    # Maps the validated ``n_grams`` argument to the model-name fragment.
    # Replaces the original if/elif chain, which silently left ``main_class``
    # unbound for any value outside 1-5; a dict lookup raises KeyError loudly
    # instead (argparse ``choices`` already restricts the input to 1-5).
    N_GRAM_CLASS_NAMES = {1: "One", 2: "Two", 3: "Three", 4: "Four", 5: "Five"}

    def add_arguments(self, parser):
        """Register the command's positional and optional arguments."""
        parser.add_argument("n_grams",
                            type=int,
                            choices=[1, 2, 3, 4, 5],
                            help="Tells the script to either import given input\
                            csv as 1-grams 2-grams etc.")
        parser.add_argument("input_folder",
                            type=str,
                            help="File path to the csv containing one kind of \
                            ngrams.")
        parser.add_argument("corpus_type",
                            choices=["lm_ns_year", "tk_ws_year", "lm_ns_speaker",
                                     "tk_ws_speaker"],
                            help="user has to choose what kind of ngrams will \
                            be imported. lm_ns: Lemmatized without stopwords or\
                            tk_ws not lemmatized with stopwords.",
                            type=str)
        parser.add_argument(
            "--batch_size",
            "-bs",
            type=int,
            default=1000000,
            required=False,
            help="Int to set how many rows(entries) should be \
                  inserted via bulk at once. Default is 1 million.")

    def handle(self, *args, **options):
        """Import every ``*.csv`` under ``input_folder`` into its model.

        For each file the row count is determined first (one full pass), then
        the file is reopened and consumed in ``batch_size`` chunks that are
        handed to ``bulk_create``.
        """
        start_time = datetime.now()
        self.stdout.write("Start time of script is: " + str(start_time))
        folder_path = options["input_folder"]
        n_grams = options["n_grams"]
        corpus_type = options["corpus_type"]
        batch_size = options["batch_size"]
        # Hoisted out of the file loop: the mapping is loop-invariant.
        main_class = self.N_GRAM_CLASS_NAMES[n_grams]
        list_of_files = []
        for path, _subdirs, files in os.walk(folder_path):
            for name in files:
                if fnmatch.fnmatch(name, "*.csv"):
                    list_of_files.append(os.path.join(path, name))
        list_of_files = sorted(list_of_files)
        for file in tqdm(list_of_files, desc="File status"):
            # NOTE(review): files are opened with the locale default encoding;
            # given the "_Non_ASCII" sort key below the data likely contains
            # non-ASCII text — confirm the CSVs are UTF-8 and pass
            # encoding="utf-8" explicitly if so.
            with open(file, newline="") as csvfile:
                n_gram_reader = csv.reader(csvfile, delimiter="\t")
                row_count = sum(1 for _ in n_gram_reader)  # closes csvfile
            # Ceiling division. The original ``int(row_count/batch_size) + 1``
            # reported one iteration too many whenever row_count was an exact
            # multiple of batch_size.
            iterations = -(-row_count // batch_size)
            self.stdout.write("Number of rows in csv is: " + str(row_count))
            self.stdout.write("Batch size is " + str(batch_size))
            self.stdout.write((str(iterations)
                               + " iterations are needed to import the"
                               " data into the database."))
            with open(file, newline="") as csvfile:  # reopens csvfile
                # Sort key is the first character of the file name; files for
                # non-ASCII-initial n-grams are named with a leading "_".
                sort_key = os.path.basename(file)[0:1]
                if(sort_key == "_"):
                    sort_key = "_Non_ASCII"
                n_gram_reader = csv.reader(csvfile, delimiter="\t")
                model = "Key{}_{}Gram_{}".format(sort_key, main_class, corpus_type)
                # Was a bare print(); use self.stdout so Django can capture
                # and redirect command output consistently.
                self.stdout.write(model)
                while True:
                    # Model classes come from ``from ngram_viewer.models
                    # import *`` above, hence the globals() lookup by name.
                    batch = [globals()[model](ngram=row[0],
                                              key=row[1],
                                              count=row[2])
                             for row in tqdm(islice(n_gram_reader, batch_size),
                                             desc="Creating batch from row")]
                    if not batch:
                        break
                    self.stdout.write("Starting bulk insert.")
                    globals()[model].objects.bulk_create(batch, batch_size)
                    self.stdout.write("---------------------------------------")
        end_time = datetime.now()
        self.stdout.write("End time of script is: " + str(end_time))
        duration = end_time - start_time
        self.stdout.write("Duration of script is: " + str(duration))