knn

bank <- read.csv("data/bank/bank.csv")

bank$job=NULL
bank$marital=NULL
bank$education=NULL
bank$housing=NULL
bank$loan=NULL
bank$contact=NULL
bank$month=NULL
bank$poutcome=NULL
bank$default=NULL

# table of success/failure
table(bank$y)


# recode y as a factor
bank$y <- factor(bank$y,
                         levels = c("yes", "no"),
                         labels = c("Success", "Fail"))

# table or proportions with more informative labels
prop.table(table(bank$y))


# summarize three numeric features
summary(bank[c("age", "balance",
               "duration")])


# create normalization function
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}


# test normalization function - result should be identical
normalize(c(1, 2, 3, 4, 5))
normalize(c(10, 20, 30, 40, 50))


# note doesn’t include the labels
bank_n <- as.data.frame(lapply(bank[1:7], normalize))
summary(bank_n$age)
hist(bank_n$age)


# create training and test data (no labels)
bank_train <- bank_n[1:4421, ]
bank_test <- bank_n[4422:4521, ]


# create labels for training and test data
bank_train_labels <- bank_n[1:4421, 1]
bank_test_labels <- bank_n[4422:4521, 1]


## Step 3: Training a model on the data ----
library(class)
predictions <- knn(train = bank_train, test =
                     bank_test, cl = bank_train_labels, k=21)


## Step 4: Evaluating model performance ----
# load the "gmodels" library
#install.packages("gmodels")
library(gmodels)

# Create the cross tabulation of predicted vs. actual
CrossTable(predictions, bank_test_labels,
           prop.chisq = FALSE,
           prop.c = FALSE, prop.r = FALSE)