Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Drop Age and Gender from both splits. This simplifies the model and
# reduces noise from features we do not want the classifier to use.
train_data <- train_data %>% dplyr::select(-Age, -Gender)
test_data <- test_data %>% dplyr::select(-Age, -Gender)

# Drop the GradeClass target from the feature tables so it cannot leak
# into the predictors during scaling/encoding below.
train_scaled <- train_data %>% dplyr::select(-GradeClass)
test_scaled <- test_data %>% dplyr::select(-GradeClass)
# kNN is distance based, so the numeric features are standardized to
# keep large-valued columns from dominating the distance computation.
num_features <- c("StudyTimeWeekly", "Absences")

# Fit the scaler on the training set only, then apply the SAME center
# and scale to the test set — reusing the training statistics avoids
# train/test leakage.
sc <- scale(train_scaled[, num_features])
train_scaled[, num_features] <- sc
test_scaled[, num_features] <- scale(
  test_scaled[, num_features],
  center = attr(sc, "scaled:center"),
  scale = attr(sc, "scaled:scale")
)
# One-hot encode the categorical features with caret::dummyVars. The
# encoding is fit on the training features and applied to both splits
# so the two matrices share identical columns.
dummies <- dummyVars(~ ., data = train_scaled)
train_knn <- predict(dummies, train_scaled)
test_knn <- predict(dummies, test_scaled)

# Quick look at the raw StudyTimeWeekly distribution.
# NOTE(review): `df` is not defined in this section — presumably the
# original, unsplit dataset; confirm it is still in scope here.
summary(df$StudyTimeWeekly)
# Baseline kNN model with k = 1 (class::knn). Seed set for a
# reproducible tie-breaking order.
set.seed(100)
knn_pred <- knn(train_knn, test_knn, train_data$GradeClass, k = 1)

# Confusion matrix: accuracy of 59.45% and a mean F1 of ~0.444, both
# worse than the logistic regression models.
CM1 <- confusionMatrix(knn_pred, test_data$GradeClass)
CM1

# Macro-averaged F1 across classes. Classes with no predictions yield
# NA in caret's per-class F1; treat those as 0 before averaging so they
# penalize rather than silently drop out of the mean.
F1_per_class <- CM1$byClass[, "F1"]
F1_per_class[is.na(F1_per_class)] <- 0
F1_per_class
mean(F1_per_class)
# 0.4437909
Advertisement
Add Comment
Please, Sign In to add comment