Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Drop Age and Gender from both splits. This simplifies the model and
# reduces noise from features we do not want the classifier to use.
train_data <- train_data %>% dplyr::select(-Age, -Gender)
test_data <- test_data %>% dplyr::select(-Age, -Gender)

# Drop the GradeClass target from the feature tables so it cannot leak
# into the predictors during scaling/encoding below.
train_scaled <- train_data %>% dplyr::select(-GradeClass)
test_scaled <- test_data %>% dplyr::select(-GradeClass)
# kNN is distance based, so the numeric features are standardized to
# keep large-valued columns from dominating the distance computation.
num_features <- c("StudyTimeWeekly", "Absences")

# Fit the scaler on the training set only, then apply the SAME center
# and scale to the test set — reusing the training statistics avoids
# train/test leakage.
sc <- scale(train_scaled[, num_features])
train_scaled[, num_features] <- sc
test_scaled[, num_features] <- scale(
  test_scaled[, num_features],
  center = attr(sc, "scaled:center"),
  scale = attr(sc, "scaled:scale")
)
# One-hot encode the categorical features with caret::dummyVars. The
# encoding is fit on the training features and applied to both splits
# so the two matrices share identical columns.
dummies <- dummyVars(~ ., data = train_scaled)
train_knn <- predict(dummies, train_scaled)
test_knn <- predict(dummies, test_scaled)

# Quick look at the raw StudyTimeWeekly distribution.
# NOTE(review): `df` is not defined in this section — presumably the
# original, unsplit dataset; confirm it is still in scope here.
summary(df$StudyTimeWeekly)
# Baseline kNN model with k = 1 (class::knn). Seed set for a
# reproducible tie-breaking order.
set.seed(100)
knn_pred <- knn(train_knn, test_knn, train_data$GradeClass, k = 1)

# Confusion matrix: accuracy of 59.45% and a mean F1 of ~0.444, both
# worse than the logistic regression models.
CM1 <- confusionMatrix(knn_pred, test_data$GradeClass)
CM1

# Macro-averaged F1 across classes. Classes with no predictions yield
# NA in caret's per-class F1; treat those as 0 before averaging so they
# penalize rather than silently drop out of the mean.
F1_per_class <- CM1$byClass[, "F1"]
F1_per_class[is.na(F1_per_class)] <- 0
F1_per_class
mean(F1_per_class)
# 0.4437909
Advertisement
Add Comment
Please, Sign In to add comment