Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Drop rows with physiologically impossible zero values, then make a
# reproducible 80/20 train/test split of the CLEANED data.
df_clean1 <- filter(df1, glucose != 0, mass != 0, pedigree != 0, age != 0, pressure != 0)
set.seed(100)
training.idx <- sample(seq_len(nrow(df_clean1)), size = nrow(df_clean1) * 0.8)
# BUG FIX: the original indexed df1 here, which silently discarded the
# zero-value filtering above — the split must come from df_clean1.
train1.data <- df_clean1[training.idx, ]
test1.data <- df_clean1[-training.idx, ]
- ---
# Work on copies so the raw train/test splits stay untouched.
# NOTE(review): data.table::copy() is unnecessary here — everything below
# modifies these objects only through base `[<-` assignment, so R's
# copy-on-modify semantics already guarantee independent objects. Plain
# assignment also removes the implicit data.table dependency.
train1.kNN <- train1.data
test1.kNN <- test1.data
# Min-max normalization: linearly rescale a numeric vector onto [0, 1].
# Returns NaN for a constant vector (max == min), same as the original.
nor <- function(x) {
  rng <- range(x)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Normalize the 8 numeric predictors to [0, 1].
# BUG FIX: the original rescaled the test set with the TEST set's own
# min/max. The test set must be rescaled with the TRAINING set's
# statistics so both sets live on the same scale and no test information
# leaks into preprocessing.
train_min <- vapply(train1.data[, 1:8], min, numeric(1))
train_rng <- vapply(train1.data[, 1:8], max, numeric(1)) - train_min
train1.kNN[, 1:8] <- sweep(sweep(train1.data[, 1:8], 2, train_min, "-"), 2, train_rng, "/")
test1.kNN[, 1:8] <- sweep(sweep(test1.data[, 1:8], 2, train_min, "-"), 2, train_rng, "/")
# Keep only pregnant + glucose + pressure + mass + pedigree (columns
# 1, 2, 3, 6, 7) plus the class label (column 10).
train1.kNN <- train1.kNN[c(1, 2, 3, 6, 7, 10)]
test1.kNN <- test1.kNN[c(1, 2, 3, 6, 7, 10)]
- # Try different k to find the best classifier
# Sweep k = 1..30 and record the test-set accuracy of each kNN fit.
# Reseed before every knn() call so random tie-breaking is reproducible,
# exactly as the original loop did.
ac <- vapply(seq_len(30), function(k) {
  set.seed(123)
  pred <- knn(train1.kNN[, 1:5], test1.kNN[, 1:5], cl = train1.kNN$y, k = k)
  mean(pred == test1.kNN$y)
}, numeric(1))
- # Accuracy plot
- plot(ac, type="b", xlab="K", ylab="Accuracy")
- set.seed(123)
- knn1 = knn(train1.kNN[, 1:5], test1.kNN[, 1:5], cl=train1.kNN$y, k=30)
- mean(knn1 == test1.kNN$y)
- table(knn1, test1.kNN$y)
- confusionMatrix(knn1, test1.kNN$y)
- acc_kNN <- confusionMatrix(knn1, test1.kNN$y )$overall['Accuracy']
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement