# Logistic Regression

rm(list = ls())

# Importing the dataset
dataset <- read.csv(file.path(getwd(), 'Data/Social_Network_Ads.csv'))
dataset <- dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1), labels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] <- scale(training_set[-3])
test_set[-3] <- scale(test_set[-3])
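
# Sketch (not in the original script): an alternative scaling step that reuses
# the training set's centre/scale values for the test set, to avoid leaking
# test-set statistics. Stored under *_alt names so the main flow is unchanged;
# it re-derives the raw split from the 'dataset' and 'split' objects above.
training_raw_alt <- subset(dataset, split == TRUE)[-3]
test_raw_alt <- subset(dataset, split == FALSE)[-3]
training_scaled_alt <- scale(training_raw_alt)
test_scaled_alt <- scale(test_raw_alt,
                         center = attr(training_scaled_alt, "scaled:center"),
                         scale = attr(training_scaled_alt, "scaled:scale"))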

# Fitting Logistic Regression to the Training set
classifier <- glm(formula = Purchased ~ .,
                  family = binomial,
                  data = training_set)

# Predicting the Test set results
prob_pred <- predict(classifier, type = 'response', newdata = test_set[-3])
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

# Making the Confusion Matrix
# cm <- table(test_set[, 3], y_pred)
# cm <- table(test_set[, 3] > 0.5, y_pred > 0.5) # we can't use > on factors
cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # create the confusion matrix

(accuracy <- mean(y_pred == test_set$Purchased))
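
# Sketch (beyond the original script): additional metrics read off the
# confusion matrix above; assumes class "1" (Purchased) is the positive class,
# i.e. the second row/column of cm.
precision <- cm[2, 2] / sum(cm[, 2]) # TP / (TP + FP)
recall <- cm[2, 2] / sum(cm[2, ])    # TP / (TP + FN)
f1 <- 2 * precision * recall / (precision + recall)
(c(precision = precision, recall = recall, F1 = f1))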

# Applying k-Fold Cross Validation
# install.packages('caret')
library(caret)
folds <- createFolds(training_set$Purchased, k = 10)
cv <- lapply(folds, function(x) {
  training_fold <- training_set[-x, ]
  test_fold <- training_set[x, ]
  classifier <- glm(formula = Purchased ~ .,
                    family = binomial,
                    data = training_fold)

  prob_pred <- predict(classifier, type = 'response', newdata = test_fold[-3])
  y_pred <- ifelse(prob_pred > 0.5, 1, 0)
  y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

  cm <- table(test_fold[, 3], y_pred)
  # accuracy <- (cm[1, 1] + cm[2, 2]) / (cm[1, 1] + cm[2, 2] + cm[1, 2] + cm[2, 1])
  accuracy <- sum(diag(cm)) / sum(cm)
  return(accuracy)
})
(accuracy_k_folds <- mean(as.numeric(cv)))
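
# Sketch of an equivalent cross-validation using caret's built-in resampling
# (caret is already loaded above; method = 'glm' fits the same binomial model).
# Shown as a cross-check, not as a replacement for the manual loop.
cv_control <- trainControl(method = "cv", number = 10)
cv_glm <- train(Purchased ~ ., data = training_set,
                method = "glm", family = binomial,
                trControl = cv_control)
cv_glm$results # accuracy and kappa averaged over the 10 folds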

####### Finding the Optimal Classification Threshold based on Accuracy for 1 fold
threshold <- seq(0, 1, 0.001) # classification thresholds
arr <- NULL
for (x in threshold) {
  y_pred <- ifelse(prob_pred > x, 1, 0) # classification
  y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1)) # using class labels
  cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # confusion matrix at this threshold
  # accuracy <- (cm[1, 1] + cm[2, 2]) / sum(cm)
  accuracy <- sum(diag(cm)) / sum(cm)
  arr <- rbind(arr, c(x, accuracy, cm[1, 1], cm[2, 2], cm[2, 1], cm[1, 2])) # record the accuracy for each threshold
}

# If we have more than one optimal threshold,
# we have to check which one is better.
if (sum(arr[, 2] == max(arr[, 2])) > 1) {
  m_threshold <- arr[arr[, 2] == max(arr[, 2]), , drop = FALSE]
  colnames(m_threshold) <- c("Threshold", "Accuracy", "TN", "TP", "FN", "FP")
  m_threshold # a matrix of the optimal thresholds with their accuracy
}

# If we have only one optimal threshold:
max_acc_pos <- which.max(arr[, 2]) # position of the maximum accuracy
(opt_threshold <- arr[max_acc_pos, 1]) # the optimal threshold value

# Hence, the optimal classification is:
y_pred <- ifelse(prob_pred > opt_threshold, 1, 0)
y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

# Making the Confusion Matrix
cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # create the confusion matrix

# Calculating accuracy
# accuracy <- sum(diag(cm)) / sum(cm)
(accuracy <- mean(y_pred == test_set$Purchased))
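
# Sketch (not in the original script): an ROC-based way to pick a threshold,
# assuming the pROC package is available; 'prob_pred' are the test-set
# probabilities computed earlier.
# install.packages('pROC')
library(pROC)
roc_obj <- roc(response = test_set$Purchased, predictor = prob_pred)
auc(roc_obj) # area under the ROC curve
coords(roc_obj, "best", ret = c("threshold", "accuracy")) # Youden-optimal threshold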


####### Finding the Optimal Classification Threshold based on Accuracy for K folds
# (note: the folds are re-drawn for every threshold, so the averaged accuracies
#  carry some resampling noise)
threshold <- seq(0, 1, 0.01) # classification thresholds
arr <- NULL
for (tx in threshold) {
  folds <- createFolds(training_set$Purchased, k = 10)
  cv <- lapply(folds, function(x) {
    training_fold <- training_set[-x, ]
    test_fold <- training_set[x, ]
    classifier <- glm(formula = Purchased ~ .,
                      family = binomial,
                      data = training_fold)

    prob_pred <- predict(classifier, type = 'response', newdata = test_fold[-3])
    y_pred <- ifelse(prob_pred > tx, 1, 0)
    y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

    cm <- table(test_fold[, 3], y_pred)
    # accuracy <- (cm[1, 1] + cm[2, 2]) / (cm[1, 1] + cm[2, 2] + cm[1, 2] + cm[2, 1])
    accuracy <- sum(diag(cm)) / sum(cm)
    # return the fold confusion matrix as well, so the counts recorded below
    # belong to this threshold (and not to a matrix left over from an earlier section)
    return(list(accuracy = accuracy, cm = cm))
  })
  accuracy_k_folds <- mean(sapply(cv, `[[`, "accuracy"))
  cm <- Reduce(`+`, lapply(cv, `[[`, "cm")) # confusion counts summed over the folds
  arr <- rbind(arr, c(tx, accuracy_k_folds, cm[1, 1], cm[2, 2], cm[2, 1], cm[1, 2])) # record the accuracy for each threshold
}

# If we have more than one optimal threshold,
# we have to check which one is better.
if (sum(arr[, 2] == max(arr[, 2])) > 1) {
  m_threshold <- arr[arr[, 2] == max(arr[, 2]), , drop = FALSE]
  colnames(m_threshold) <- c("Threshold", "Acc_K_Folds", "TN", "TP", "FN", "FP")
  m_threshold # a matrix of the optimal thresholds with their accuracy
} else {
  print("Only One Optimal Threshold")
}

# Taking the (first) optimal threshold:
opt_threshold <- arr[which.max(arr[, 2]), ] # row with the maximum k-fold accuracy
names(opt_threshold) <- c("Threshold", "Acc_K_Folds", "TN", "TP", "FN", "FP")
(round(opt_threshold, 3))

# Hence, the optimal classification is:
y_pred <- ifelse(prob_pred > opt_threshold[1], 1, 0)
y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

# Making the Confusion Matrix
cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # create the confusion matrix

# Calculating accuracy
# accuracy <- sum(diag(cm)) / sum(cm)
(accuracy <- mean(y_pred == test_set$Purchased))

# Applying Grid Search to find the best parameters
# Regularized Logistic Regression
# install.packages('caret')
# install.packages('LiblineaR') # backend used by caret's 'regLogistic' method
library(caret)
classifier <- train(form = Purchased ~ ., data = training_set, method = 'regLogistic')
classifier
classifier$bestTune
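
# Sketch: an explicit tuning grid and resampling scheme for the same
# 'regLogistic' method. The parameter names (cost, loss, epsilon) and the
# grid values below are illustrative assumptions, not tuned recommendations.
reg_grid <- expand.grid(cost = c(0.25, 0.5, 1, 2),
                        loss = "L2_primal",
                        epsilon = c(0.001, 0.01))
classifier_grid <- train(form = Purchased ~ ., data = training_set,
                         method = 'regLogistic',
                         trControl = trainControl(method = "cv", number = 10),
                         tuneGrid = reg_grid)
classifier_grid$bestTune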

# Applying Grid Search to find the best parameters
# Penalized Logistic Regression
# install.packages('stepPlr') # backend used by caret's 'plr' method
classifier <- train(form = Purchased ~ ., data = training_set, method = 'plr')
classifier
classifier$bestTune

# Visualising the Training set results
# (the grid-search sections above overwrite 'classifier' with a caret object,
#  so refit the plain glm model before plotting its decision boundary)
classifier <- glm(formula = Purchased ~ .,
                  family = binomial,
                  data = training_set)

library(ElemStatLearn)
set <- training_set
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('Age', 'EstimatedSalary')
prob_set <- predict(classifier, type = 'response', newdata = grid_set)

# y_grid <- ifelse(prob_set > 0.5, 1, 0) # traditional classification
y_grid <- ifelse(prob_set > opt_threshold[1], 1, 0) # optimal (k-fold) threshold

plot(set[, -3],
     main = 'Logistic Regression (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

# Visualising the Test set results
library(ElemStatLearn)
set <- test_set
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('Age', 'EstimatedSalary')
prob_set <- predict(classifier, type = 'response', newdata = grid_set)

# y_grid <- ifelse(prob_set > 0.5, 1, 0) # traditional classification
y_grid <- ifelse(prob_set > opt_threshold[1], 1, 0) # optimal (k-fold) threshold

plot(set[, -3],
     main = 'Logistic Regression (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
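
# Sketch (not in the original script): the same test-set decision regions drawn
# with ggplot2, assuming ggplot2 is installed. Uses the 'grid_set', 'y_grid',
# and 'set' objects from the block above.
library(ggplot2)
grid_df <- data.frame(grid_set, Region = factor(y_grid, levels = c(0, 1)))
ggplot() +
  geom_point(data = grid_df,
             aes(x = Age, y = EstimatedSalary, colour = Region),
             size = 0.2, alpha = 0.3) +
  geom_point(data = set,
             aes(x = Age, y = EstimatedSalary, fill = Purchased),
             shape = 21) +
  scale_colour_manual(values = c("0" = "tomato", "1" = "springgreen3")) +
  scale_fill_manual(values = c("0" = "red3", "1" = "green4")) +
  labs(title = "Logistic Regression (Test set)",
       x = "Age", y = "Estimated Salary")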