# Logistic Regression
rm(list = ls())

# Importing the dataset
dataset <- read.csv(file.path(getwd(), 'Data/Social_Network_Ads.csv'))
dataset <- dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1), labels = c(0, 1))
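# A quick sanity check (an addition, not in the original paste): look at the
# class balance, since accuracy is only meaningful relative to the
# majority-class rate.
table(dataset$Purchased)
prop.table(table(dataset$Purchased))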
# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] <- scale(training_set[-3])
test_set[-3] <- scale(test_set[-3])
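# Note: scale() above standardises the test set with its own mean and sd.
# A common alternative (a sketch, shown commented out so the script's
# behaviour is unchanged) is to scale BOTH sets with the training-set
# parameters, so they share one scale:
# sc <- scale(training_set[-3])
# training_set[-3] <- sc
# test_set[-3] <- scale(test_set[-3],
#                       center = attr(sc, 'scaled:center'),
#                       scale = attr(sc, 'scaled:scale'))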
# Fitting Logistic Regression to the Training set
classifier <- glm(formula = Purchased ~ .,
                  family = binomial,
                  data = training_set)
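# An addition: inspect the fitted coefficients and their significance
summary(classifier)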
# Predicting the Test set results
prob_pred <- predict(classifier, type = 'response', newdata = test_set[-3])
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))
# Making the Confusion Matrix
# cm <- table(test_set[, 3], y_pred)
# cm <- table(test_set[, 3] > 0.5, y_pred > 0.5) # won't work: `>` cannot be applied to factors
cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # create the confusion matrix
(accuracy <- mean(y_pred == test_set$Purchased))
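# An addition: precision and recall can be read off the same confusion matrix;
# with Actual in rows, Predicted in columns and "1" as the positive class:
(precision <- cm[2, 2] / sum(cm[, 2])) # TP / (TP + FP)
(recall <- cm[2, 2] / sum(cm[2, ]))    # TP / (TP + FN)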
# Applying k-Fold Cross Validation
# install.packages('caret')
library(caret)
folds <- createFolds(training_set$Purchased, k = 10)
cv <- lapply(folds, function(x) {
  training_fold <- training_set[-x, ]
  test_fold <- training_set[x, ]
  classifier <- glm(formula = Purchased ~ .,
                    family = binomial,
                    data = training_fold)
  prob_pred <- predict(classifier, type = 'response', newdata = test_fold[-3])
  y_pred <- ifelse(prob_pred > 0.5, 1, 0)
  y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))
  cm <- table(test_fold[, 3], y_pred)
  # accuracy <- (cm[1, 1] + cm[2, 2]) / (cm[1, 1] + cm[2, 2] + cm[1, 2] + cm[2, 1])
  accuracy <- sum(diag(cm)) / sum(cm)
  return(accuracy)
})
(accuracy_k_folds <- mean(as.numeric(cv)))
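# An addition: the spread of the ten fold accuracies shows how stable the
# cross-validated estimate is.
(accuracy_sd <- sd(as.numeric(cv)))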
####### Finding the Optimal Classification Threshold based on Accuracy (single train/test split)
# prob_pred below is still the Test-set probability vector computed earlier
threshold <- seq(0, 1, 0.001) # candidate classification thresholds
arr <- NULL
for (x in threshold) {
  y_pred <- ifelse(prob_pred > x, 1, 0) # classification at threshold x
  y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1)) # using class labels
  cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # creating the confusion matrix
  # accuracy <- (cm[1, 1] + cm[2, 2]) / sum(cm)
  accuracy <- sum(diag(cm)) / sum(cm)
  arr <- rbind(arr, c(x, accuracy, cm[1, 1], cm[2, 2], cm[2, 1], cm[1, 2])) # record the accuracy for each threshold
}
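# An addition: plotting accuracy against the threshold makes the optimum (and
# any plateau of equally good thresholds) easy to see.
plot(arr[, 1], arr[, 2], type = 'l',
     xlab = 'Threshold', ylab = 'Accuracy',
     main = 'Accuracy vs Classification Threshold (Test set)')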
# There may be more than one threshold achieving the maximum accuracy;
# collect them all (drop = FALSE keeps the result a matrix even when only one row matches)
m_threshold <- arr[arr[, 2] == max(arr[, 2]), , drop = FALSE]
colnames(m_threshold) <- c("Threshold", "Accuracy", "TN", "TP", "FN", "FP")
(m_threshold) # the optimal thresholds with their accuracy and confusion-matrix counts

# Take the first optimal threshold
max_acc_pos <- which.max(arr[, 2]) # position of the (first) maximum accuracy
(opt_threshold <- arr[max_acc_pos, 1]) # the optimal threshold value
# Hence, the optimal classification is:
y_pred <- ifelse(prob_pred > opt_threshold, 1, 0)
y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

# Making the Confusion Matrix
cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # create the confusion matrix

# Calculating accuracy
# accuracy <- sum(diag(cm)) / sum(cm)
(accuracy <- mean(y_pred == test_set$Purchased))
####### Finding the Optimal Classification Threshold based on Accuracy for K folds
threshold <- seq(0, 1, 0.01) # candidate classification thresholds
folds <- createFolds(training_set$Purchased, k = 10) # fix the folds once, so every threshold is scored on the same splits
arr <- NULL
for (tx in threshold) {
  cm_total <- matrix(0, 2, 2) # confusion-matrix counts summed over the folds
  cv <- lapply(folds, function(x) {
    training_fold <- training_set[-x, ]
    test_fold <- training_set[x, ]
    classifier <- glm(formula = Purchased ~ .,
                      family = binomial,
                      data = training_fold)
    prob_pred <- predict(classifier, type = 'response', newdata = test_fold[-3])
    y_pred <- ifelse(prob_pred > tx, 1, 0)
    y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))
    cm <- table(test_fold[, 3], y_pred)
    cm_total <<- cm_total + cm # accumulate the counts across folds (the local cm would otherwise be lost)
    sum(diag(cm)) / sum(cm) # fold accuracy
  })
  accuracy_k_folds <- mean(as.numeric(cv))
  arr <- rbind(arr, c(tx, accuracy_k_folds,
                      cm_total[1, 1], cm_total[2, 2], cm_total[2, 1], cm_total[1, 2])) # record accuracy and counts for each threshold
}
# There may be more than one threshold achieving the maximum k-fold accuracy;
# collect them all (drop = FALSE keeps the result a matrix even when only one row matches)
m_threshold <- arr[arr[, 2] == max(arr[, 2]), , drop = FALSE]
colnames(m_threshold) <- c("Threshold", "Acc_K_Folds", "TN", "TP", "FN", "FP")
(m_threshold) # the optimal thresholds with their accuracy and confusion-matrix counts

# Take the first optimal threshold
max_acc_pos <- which.max(arr[, 2]) # position of the (first) maximum accuracy
opt_threshold <- arr[max_acc_pos, ]
names(opt_threshold) <- c("Threshold", "Acc_K_Folds", "TN", "TP", "FN", "FP")
(round(opt_threshold, 3))
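# An addition: the same picture for the k-fold search; each point averages ten
# fold accuracies, so the curve is smoother than the single-split one above.
plot(arr[, 1], arr[, 2], type = 'l',
     xlab = 'Threshold', ylab = 'Mean k-fold Accuracy',
     main = 'Accuracy vs Classification Threshold (10-fold CV)')
abline(v = opt_threshold["Threshold"], lty = 2) # mark the chosen threshold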
# Hence, the optimal classification is:
y_pred <- ifelse(prob_pred > opt_threshold["Threshold"], 1, 0)
y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

# Making the Confusion Matrix
cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # create the confusion matrix

# Calculating accuracy
# accuracy <- sum(diag(cm)) / sum(cm)
(accuracy <- mean(y_pred == test_set$Purchased))
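# An addition: an alternative to the grid scan is reading a threshold off an
# ROC curve; a minimal sketch using the pROC package (an extra dependency, not
# part of the original script). Note it maximises Youden's J, not accuracy.
# install.packages('pROC')
library(pROC)
roc_obj <- roc(response = test_set$Purchased, predictor = prob_pred)
coords(roc_obj, 'best', ret = 'threshold') # ROC-optimal threshold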
# Applying Grid Search to find the best parameters
# Regularized Logistic Regression (caret method 'regLogistic', backed by the LiblineaR package)
# install.packages('caret')
library(caret)
classifier_reg <- train(form = Purchased ~ ., data = training_set, method = 'regLogistic')
classifier_reg
classifier_reg$bestTune
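# An addition: train() uses a small default grid; an explicit grid and CV
# control can be supplied instead. A sketch, assuming regLogistic's tuning
# parameters cost, loss and epsilon (the name classifier_reg_cv is mine):
classifier_reg_cv <- train(form = Purchased ~ ., data = training_set,
                           method = 'regLogistic',
                           trControl = trainControl(method = 'cv', number = 10),
                           tuneGrid = expand.grid(cost = c(0.1, 1, 10),
                                                  loss = c('L1', 'L2_dual', 'L2_primal'),
                                                  epsilon = c(0.001, 0.01)))
classifier_reg_cv$bestTune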
# Applying Grid Search to find the best parameters
# Penalized Logistic Regression (caret method 'plr', backed by the stepPlr package).
# A distinct name keeps the glm `classifier` fitted above available for the plots below.
classifier_plr <- train(form = Purchased ~ ., data = training_set, method = 'plr')
classifier_plr
classifier_plr$bestTune
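# An addition: hold-out accuracy of the tuned model. predict() on a caret
# model returns class labels by default, so this is directly comparable to
# the glm accuracies above.
mean(predict(classifier_plr, newdata = test_set) == test_set$Purchased)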
# Visualising the Training set results
library(ElemStatLearn) # note: ElemStatLearn has been archived; install from the CRAN archive if needed
set <- training_set
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('Age', 'EstimatedSalary')
prob_set <- predict(classifier, type = 'response', newdata = grid_set) # probabilities from the glm classifier
# y_grid <- ifelse(prob_set > 0.5, 1, 0) # traditional classification
y_grid <- ifelse(prob_set > opt_threshold["Threshold"], 1, 0) # optimal classification (the threshold value, not the whole named vector)
plot(set[, -3],
     main = 'Logistic Regression (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
# Visualising the Test set results
library(ElemStatLearn)
set <- test_set
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('Age', 'EstimatedSalary')
prob_set <- predict(classifier, type = 'response', newdata = grid_set)
# y_grid <- ifelse(prob_set > 0.5, 1, 0) # traditional classification
y_grid <- ifelse(prob_set > opt_threshold["Threshold"], 1, 0) # optimal classification
plot(set[, -3],
     main = 'Logistic Regression (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
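# An addition: the two visualisation blocks above differ only in the data and
# the title; a small helper (a refactoring sketch, not part of the original
# paste) removes the duplication.
plot_decision_boundary <- function(set, classifier, threshold, title) {
  X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
  X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
  grid_set <- expand.grid(X1, X2)
  colnames(grid_set) <- c('Age', 'EstimatedSalary')
  prob_set <- predict(classifier, type = 'response', newdata = grid_set)
  y_grid <- ifelse(prob_set > threshold, 1, 0)
  plot(set[, -3], main = title, xlab = 'Age', ylab = 'Estimated Salary',
       xlim = range(X1), ylim = range(X2))
  contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
  points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
  points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
}
# Usage:
# plot_decision_boundary(training_set, classifier, opt_threshold["Threshold"],
#                        'Logistic Regression (Training set)')
# plot_decision_boundary(test_set, classifier, opt_threshold["Threshold"],
#                        'Logistic Regression (Test set)')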