# Logistic Regression

rm(list = ls())

# Importing the dataset
dataset <- read.csv(file.path(getwd(), 'Data/Social_Network_Ads.csv'))
dataset <- dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1), labels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] <- scale(training_set[-3])
test_set[-3] <- scale(test_set[-3])
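
# Sketch (not in the original script): an alternative scaling step that reuses
# the training set's centre/scale values for the test set, to avoid leaking
# test-set statistics. Stored under *_alt names so the main flow is unchanged;
# it re-derives the raw split from the 'dataset' and 'split' objects above.
training_raw_alt <- subset(dataset, split == TRUE)[-3]
test_raw_alt <- subset(dataset, split == FALSE)[-3]
training_scaled_alt <- scale(training_raw_alt)
test_scaled_alt <- scale(test_raw_alt,
                         center = attr(training_scaled_alt, "scaled:center"),
                         scale = attr(training_scaled_alt, "scaled:scale"))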

# Fitting Logistic Regression to the Training set
classifier <- glm(formula = Purchased ~ .,
                  family = binomial,
                  data = training_set)

# Predicting the Test set results
prob_pred <- predict(classifier, type = 'response', newdata = test_set[-3])
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

# Making the Confusion Matrix
# cm <- table(test_set[, 3], y_pred)
# cm <- table(test_set[, 3] > 0.5, y_pred > 0.5) # we can't use > on factors
cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # create the confusion matrix

(accuracy <- mean(y_pred == test_set$Purchased))
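
# Sketch (beyond the original script): additional metrics read off the
# confusion matrix above; assumes class "1" (Purchased) is the positive class,
# i.e. the second row/column of cm.
precision <- cm[2, 2] / sum(cm[, 2]) # TP / (TP + FP)
recall <- cm[2, 2] / sum(cm[2, ])    # TP / (TP + FN)
f1 <- 2 * precision * recall / (precision + recall)
(c(precision = precision, recall = recall, F1 = f1))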

# Applying k-Fold Cross Validation
# install.packages('caret')
library(caret)
folds <- createFolds(training_set$Purchased, k = 10)
cv <- lapply(folds, function(x) {
  training_fold <- training_set[-x, ]
  test_fold <- training_set[x, ]
  classifier <- glm(formula = Purchased ~ .,
                    family = binomial,
                    data = training_fold)

  prob_pred <- predict(classifier, type = 'response', newdata = test_fold[-3])
  y_pred <- ifelse(prob_pred > 0.5, 1, 0)
  y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

  cm <- table(test_fold[, 3], y_pred)
  # accuracy <- (cm[1, 1] + cm[2, 2]) / (cm[1, 1] + cm[2, 2] + cm[1, 2] + cm[2, 1])
  accuracy <- sum(diag(cm)) / sum(cm)
  return(accuracy)
})
(accuracy_k_folds <- mean(as.numeric(cv)))
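
# Sketch of an equivalent cross-validation using caret's built-in resampling
# (caret is already loaded above; method = 'glm' fits the same binomial model).
# Shown as a cross-check, not as a replacement for the manual loop.
cv_control <- trainControl(method = "cv", number = 10)
cv_glm <- train(Purchased ~ ., data = training_set,
                method = "glm", family = binomial,
                trControl = cv_control)
cv_glm$results # accuracy and kappa averaged over the 10 folds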

####### Finding the Optimal Classification Threshold based on Accuracy for 1 fold
threshold <- seq(0, 1, 0.001) # classification thresholds
arr <- NULL
for (x in threshold) {
  y_pred <- ifelse(prob_pred > x, 1, 0) # classification
  y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1)) # using class labels
  cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # confusion matrix at this threshold
  # accuracy <- (cm[1, 1] + cm[2, 2]) / sum(cm)
  accuracy <- sum(diag(cm)) / sum(cm)
  arr <- rbind(arr, c(x, accuracy, cm[1, 1], cm[2, 2], cm[2, 1], cm[1, 2])) # record the accuracy for each threshold
}

# If we have more than one optimal threshold,
# we have to check which one is better.
if (sum(arr[, 2] == max(arr[, 2])) > 1) {
  m_threshold <- arr[arr[, 2] == max(arr[, 2]), , drop = FALSE]
  colnames(m_threshold) <- c("Threshold", "Accuracy", "TN", "TP", "FN", "FP")
  m_threshold # a matrix of the optimal thresholds with their accuracy
}

# If we have only one optimal threshold:
max_acc_pos <- which.max(arr[, 2]) # position of the maximum accuracy
(opt_threshold <- arr[max_acc_pos, 1]) # the optimal threshold value

# Hence, the optimal classification is:
y_pred <- ifelse(prob_pred > opt_threshold, 1, 0)
y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

# Making the Confusion Matrix
cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # create the confusion matrix

# Calculating accuracy
# accuracy <- sum(diag(cm)) / sum(cm)
(accuracy <- mean(y_pred == test_set$Purchased))
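
# Sketch (not in the original script): an ROC-based way to pick a threshold,
# assuming the pROC package is available; 'prob_pred' are the test-set
# probabilities computed earlier.
# install.packages('pROC')
library(pROC)
roc_obj <- roc(response = test_set$Purchased, predictor = prob_pred)
auc(roc_obj) # area under the ROC curve
coords(roc_obj, "best", ret = c("threshold", "accuracy")) # Youden-optimal threshold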


####### Finding the Optimal Classification Threshold based on Accuracy for K folds
# (note: the folds are re-drawn for every threshold, so the averaged accuracies
#  carry some resampling noise)
threshold <- seq(0, 1, 0.01) # classification thresholds
arr <- NULL
for (tx in threshold) {
  folds <- createFolds(training_set$Purchased, k = 10)
  cv <- lapply(folds, function(x) {
    training_fold <- training_set[-x, ]
    test_fold <- training_set[x, ]
    classifier <- glm(formula = Purchased ~ .,
                      family = binomial,
                      data = training_fold)

    prob_pred <- predict(classifier, type = 'response', newdata = test_fold[-3])
    y_pred <- ifelse(prob_pred > tx, 1, 0)
    y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

    cm <- table(test_fold[, 3], y_pred)
    # accuracy <- (cm[1, 1] + cm[2, 2]) / (cm[1, 1] + cm[2, 2] + cm[1, 2] + cm[2, 1])
    accuracy <- sum(diag(cm)) / sum(cm)
    # return the fold confusion matrix as well, so the counts recorded below
    # belong to this threshold (and not to a matrix left over from an earlier section)
    return(list(accuracy = accuracy, cm = cm))
  })
  accuracy_k_folds <- mean(sapply(cv, `[[`, "accuracy"))
  cm <- Reduce(`+`, lapply(cv, `[[`, "cm")) # confusion counts summed over the folds
  arr <- rbind(arr, c(tx, accuracy_k_folds, cm[1, 1], cm[2, 2], cm[2, 1], cm[1, 2])) # record the accuracy for each threshold
}

# If we have more than one optimal threshold,
# we have to check which one is better.
if (sum(arr[, 2] == max(arr[, 2])) > 1) {
  m_threshold <- arr[arr[, 2] == max(arr[, 2]), , drop = FALSE]
  colnames(m_threshold) <- c("Threshold", "Acc_K_Folds", "TN", "TP", "FN", "FP")
  m_threshold # a matrix of the optimal thresholds with their accuracy
} else {
  print("Only One Optimal Threshold")
}

# Taking the (first) optimal threshold:
opt_threshold <- arr[which.max(arr[, 2]), ] # row with the maximum k-fold accuracy
names(opt_threshold) <- c("Threshold", "Acc_K_Folds", "TN", "TP", "FN", "FP")
(round(opt_threshold, 3))

# Hence, the optimal classification is:
y_pred <- ifelse(prob_pred > opt_threshold[1], 1, 0)
y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))

# Making the Confusion Matrix
cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # create the confusion matrix

# Calculating accuracy
# accuracy <- sum(diag(cm)) / sum(cm)
(accuracy <- mean(y_pred == test_set$Purchased))

# Applying Grid Search to find the best parameters
# Regularized Logistic Regression
# install.packages('caret')
# install.packages('LiblineaR') # backend used by caret's 'regLogistic' method
library(caret)
classifier <- train(form = Purchased ~ ., data = training_set, method = 'regLogistic')
classifier
classifier$bestTune
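
# Sketch: an explicit tuning grid and resampling scheme for the same
# 'regLogistic' method. The parameter names (cost, loss, epsilon) and the
# grid values below are illustrative assumptions, not tuned recommendations.
reg_grid <- expand.grid(cost = c(0.25, 0.5, 1, 2),
                        loss = "L2_primal",
                        epsilon = c(0.001, 0.01))
classifier_grid <- train(form = Purchased ~ ., data = training_set,
                         method = 'regLogistic',
                         trControl = trainControl(method = "cv", number = 10),
                         tuneGrid = reg_grid)
classifier_grid$bestTune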

# Applying Grid Search to find the best parameters
# Penalized Logistic Regression
# install.packages('stepPlr') # backend used by caret's 'plr' method
classifier <- train(form = Purchased ~ ., data = training_set, method = 'plr')
classifier
classifier$bestTune

# Visualising the Training set results
# (the grid-search sections above overwrite 'classifier' with a caret object,
#  so refit the plain glm model before plotting its decision boundary)
classifier <- glm(formula = Purchased ~ .,
                  family = binomial,
                  data = training_set)

library(ElemStatLearn)
set <- training_set
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('Age', 'EstimatedSalary')
prob_set <- predict(classifier, type = 'response', newdata = grid_set)

# y_grid <- ifelse(prob_set > 0.5, 1, 0) # traditional classification
y_grid <- ifelse(prob_set > opt_threshold[1], 1, 0) # optimal (k-fold) threshold

plot(set[, -3],
     main = 'Logistic Regression (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

# Visualising the Test set results
library(ElemStatLearn)
set <- test_set
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('Age', 'EstimatedSalary')
prob_set <- predict(classifier, type = 'response', newdata = grid_set)

# y_grid <- ifelse(prob_set > 0.5, 1, 0) # traditional classification
y_grid <- ifelse(prob_set > opt_threshold[1], 1, 0) # optimal (k-fold) threshold

plot(set[, -3],
     main = 'Logistic Regression (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
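
# Sketch (not in the original script): the same test-set decision regions drawn
# with ggplot2, assuming ggplot2 is installed. Uses the 'grid_set', 'y_grid',
# and 'set' objects from the block above.
library(ggplot2)
grid_df <- data.frame(grid_set, Region = factor(y_grid, levels = c(0, 1)))
ggplot() +
  geom_point(data = grid_df,
             aes(x = Age, y = EstimatedSalary, colour = Region),
             size = 0.2, alpha = 0.3) +
  geom_point(data = set,
             aes(x = Age, y = EstimatedSalary, fill = Purchased),
             shape = 21) +
  scale_colour_manual(values = c("0" = "tomato", "1" = "springgreen3")) +
  scale_fill_manual(values = c("0" = "red3", "1" = "green4")) +
  labs(title = "Logistic Regression (Test set)",
       x = "Age", y = "Estimated Salary")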