Initial commit
This commit is contained in:
107
app/ngram_viewer/management/commands/import_ngrams_bulk.py
Executable file
107
app/ngram_viewer/management/commands/import_ngrams_bulk.py
Executable file
@ -0,0 +1,107 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from ngram_viewer.models import *
|
||||
from itertools import islice
|
||||
from datetime import datetime
|
||||
from tqdm import tqdm
|
||||
import csv
|
||||
import fnmatch
|
||||
import os
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Bulk-import n-gram csv files into the database via the Django models.

    Each input file is read as tab-separated rows whose first three columns
    are used as ``ngram``, ``key`` and ``count``. The target model is looked
    up by name in this module's globals; the name is built from the first
    character of the file name, the n-gram size and the corpus type.
    """

    help = ("Adds n-grams to the database using the django models"
            " syntax. N-grams will be added from csv files with three columns."
            " First column is the n-gram string, second column is the key "
            " (e.g. year or speaker) and the third column is the counter."
            " Input is a path pointing to one n-gram file. The user must specify"
            " if the csv is containing 1-grams, 2-grams ... 5-grams with the"
            " parameter 'n_grams'.")

    # Maps the numeric n-gram size to the word used inside model class names.
    _N_GRAM_NAMES = {1: "One", 2: "Two", 3: "Three", 4: "Four", 5: "Five"}

    def add_arguments(self, parser):
        """Register the command line arguments on ``parser``.

        Positional: ``n_grams`` (1-5), ``input_folder`` and ``corpus_type``.
        Optional: ``--batch_size`` / ``-bs`` (default one million rows).
        """
        # Help texts use adjacent string literals instead of backslash
        # continuations so that no source indentation leaks into the text.
        parser.add_argument(
            "n_grams",
            type=int,
            choices=[1, 2, 3, 4, 5],
            help=("Tells the script to either import given input"
                  " csv as 1-grams 2-grams etc."))
        parser.add_argument(
            "input_folder",
            type=str,
            help=("Path to a folder that is searched recursively for csv"
                  " files, or path to a single csv file, containing one"
                  " kind of ngrams."))
        parser.add_argument(
            "corpus_type",
            choices=["lm_ns_year", "tk_ws_year", "lm_ns_speaker",
                     "tk_ws_speaker"],
            help=("user has to choose what kind of ngrams will"
                  " be imported. lm_ns: Lemmatized without stopwords or"
                  " tk_ws not lemmatized with stopwords."),
            type=str)
        parser.add_argument(
            "--batch_size",
            "-bs",
            type=int,
            default=1000000,
            required=False,
            help=("Int to set how many rows(entries) should be"
                  " inserted via bulk at once. Default is 1 million."))

    @staticmethod
    def _find_csv_files(input_path):
        """Return the sorted list of csv files to import for ``input_path``.

        A path to a regular file is returned as the only entry; otherwise
        the directory tree below ``input_path`` is walked and every file
        matching ``*.csv`` is collected.
        """
        if os.path.isfile(input_path):
            return [input_path]
        return sorted(
            os.path.join(path, name)
            for path, _subdirs, files in os.walk(input_path)
            for name in files
            if fnmatch.fnmatch(name, "*.csv"))

    @staticmethod
    def _count_rows(file_path):
        """Return the number of tab-separated csv rows in ``file_path``."""
        # The file is closed when the with block is left, not by sum().
        with open(file_path, newline="") as csvfile:
            return sum(1 for _row in csv.reader(csvfile, delimiter="\t"))

    @staticmethod
    def _model_name(file_path, main_class, corpus_type):
        """Build the model class name for one input file.

        The first character of the file's base name selects the key part of
        the name; a leading underscore is mapped to ``_Non_ASCII``.
        """
        sort_key = os.path.basename(file_path)[0:1]
        if sort_key == "_":
            sort_key = "_Non_ASCII"
        return "Key{}_{}Gram_{}".format(sort_key, main_class, corpus_type)

    def handle(self, *args, **options):
        """Import every csv file found for ``input_folder`` in batches.

        Reads ``input_folder``, ``n_grams``, ``corpus_type`` and
        ``batch_size`` from ``options``, reports progress and timing on
        ``self.stdout`` and inserts the rows with ``bulk_create``.

        Raises:
            CommandError: if ``batch_size`` is smaller than 1 or no model
                class with the derived name exists in this module.
        """
        # Imported locally so the module-level import order (and with it the
        # names provided by the star import of the models) stays untouched.
        from django.core.management.base import CommandError

        start_time = datetime.now()
        self.stdout.write("Start time of script is: " + str(start_time))
        folder_path = options["input_folder"]
        n_grams = options["n_grams"]
        corpus_type = options["corpus_type"]
        batch_size = options["batch_size"]

        # A batch size below 1 would divide by zero or break islice below.
        if batch_size < 1:
            raise CommandError("batch_size must be a positive integer.")

        # Loop-invariant: depends only on the n_grams argument.
        main_class = self._N_GRAM_NAMES[n_grams]

        list_of_files = self._find_csv_files(folder_path)
        if not list_of_files:
            self.stdout.write("No csv files found in: " + str(folder_path))

        for file in tqdm(list_of_files, desc="File status"):
            row_count = self._count_rows(file)
            # Ceiling division: exact multiples need no extra iteration.
            iterations = -(-row_count // batch_size)
            self.stdout.write("Number of rows in csv is: " + str(row_count))
            self.stdout.write("Batch size is " + str(batch_size))
            self.stdout.write((str(iterations)
                               + " iterations are needed to import the"
                               " data into the database."))

            model = self._model_name(file, main_class, corpus_type)
            self.stdout.write(model)
            # Model classes are expected in this module's namespace (they
            # are star-imported at the top of the file); resolve once per
            # file instead of once per row.
            model_class = globals().get(model)
            if model_class is None:
                raise CommandError(
                    "No model named '{}' found for file '{}'.".format(
                        model, file))

            with open(file, newline="") as csvfile:
                n_gram_reader = csv.reader(csvfile, delimiter="\t")
                while True:
                    # islice consumes at most batch_size rows per pass, so
                    # only one batch is held in memory at a time.
                    batch = [model_class(ngram=row[0],
                                         key=row[1],
                                         count=row[2])
                             for row in tqdm(islice(n_gram_reader, batch_size),
                                             desc="Creating batch from row")]
                    if not batch:
                        break
                    self.stdout.write("Starting bulk insert.")
                    model_class.objects.bulk_create(batch, batch_size)
                    self.stdout.write("---------------------------------------")

        end_time = datetime.now()
        self.stdout.write("End time of script is: " + str(end_time))
        duration = end_time - start_time
        self.stdout.write("Duration of script is: " + str(duration))
|
Reference in New Issue
Block a user