# Decision Tree Classification

# We may need to prune the DT to avoid overfitting

rm(list = ls())

# Importing the dataset
dataset <- read.csv(file.path(getwd(), 'Data/Social_Network_Ads.csv'))
dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
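
# sample.split stratifies on the target; a quick check (not part of the
# original script) confirms both sets keep roughly the same class proportions:
prop.table(table(training_set$Purchased))
prop.table(table(test_set$Purchased))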

# Feature Scaling
# A DT does not require scaling because it does not use Euclidean distance.
# However, if we don't scale the data, the visualization below will take much longer.
# Note that I observed the performance of the DT degrades slightly on scaled data.
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])
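
# A minimal check of the scaling claim above (hypothetical names, not part of
# the original): rebuild unscaled copies from the same split and compare test
# accuracy. Since rescaling is a monotone transform, the tree's splits should
# in principle be equivalent, so this verifies whether a difference reproduces.
library(rpart)
training_unscaled <- subset(dataset, split == TRUE)
test_unscaled <- subset(dataset, split == FALSE)
tree_unscaled <- rpart(Purchased ~ ., data = training_unscaled,
                       method = "class", control = rpart.control(minsplit = 1))
mean(predict(tree_unscaled, test_unscaled[-3], type = "class") == test_unscaled$Purchased)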

# Fitting Decision Tree Classification to the Training set
# install.packages('rpart')
library(rpart)
# DT without pruning
classifier <- rpart(formula = Purchased ~ .,
                    data = training_set,
                    method = "class",
                    control = rpart.control(minsplit = 1))

# DT with post-pruning
pruned_DT <- prune(classifier,
                   cp = classifier$cptable[which.min(classifier$cptable[, "xerror"]), "CP"])
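
# An alternative pruning choice is the 1-SE rule (sketched here with
# hypothetical names): keep the simplest tree whose cross-validated error is
# within one standard error of the minimum, which is usually more conservative
# than taking the minimum xerror itself.
cpt <- classifier$cptable
xerr_min <- which.min(cpt[, "xerror"])
cp_1se <- cpt[which(cpt[, "xerror"] <= cpt[xerr_min, "xerror"] + cpt[xerr_min, "xstd"])[1], "CP"]
pruned_DT_1se <- prune(classifier, cp = cp_1se)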

# Predicting the Test set results
# The line below would give the class probabilities
# y_pred = predict(classifier, newdata = test_set[-3])

# The line below gives the predicted class
y_pred = predict(classifier, newdata = test_set[-3], type = 'class')

# Making the Confusion Matrix
y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1)) # using class labels
cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred)) # create the confusion matrix

# Calculating accuracy
(accuracy <- mean(y_pred == test_set$Purchased))
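
# Further metrics from the confusion matrix (a small addition, not in the
# original; treats class "1", i.e. Purchased, as the positive class):
precision <- cm["1", "1"] / sum(cm[, "1"]) # TP / (TP + FP)
recall <- cm["1", "1"] / sum(cm["1", ])    # TP / (TP + FN)
(f1 <- 2 * precision * recall / (precision + recall))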

# Model Evaluation
# Applying k-Fold Cross Validation
# The DT without pruning
# install.packages('caret')
library(caret)
folds = createFolds(training_set$Purchased, k = 10)
cv = lapply(folds, function(x) {
  training_fold = training_set[-x, ]
  test_fold = training_set[x, ]
  classifier <- rpart(formula = Purchased ~ .,
                      data = training_fold,
                      method = "class",
                      control = rpart.control(minsplit = 1))
  y_pred = predict(classifier, newdata = test_fold[-3], type = 'class')
  y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))
  cm = table(test_fold[, 3], y_pred)
  # accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
  accuracy = sum(diag(cm)) / sum(cm)
  return(accuracy)
})
(accuracy_k_folds <- mean(as.numeric(cv)))
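
# The same estimate can be obtained with caret's built-in resampling
# (a sketch, not in the original; note that method = 'rpart' tunes cp
# instead of fixing minsplit = 1 as above):
cv_model <- train(Purchased ~ ., data = training_set, method = 'rpart',
                  trControl = trainControl(method = 'cv', number = 10))
cv_model$results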

# Model Evaluation
# Applying k-Fold Cross Validation
# The DT after pruning
library(caret)
folds = createFolds(training_set$Purchased, k = 10)
cv = lapply(folds, function(x) {
  training_fold = training_set[-x, ]
  test_fold = training_set[x, ]
  # Refit on the fold before pruning; pruning the globally fitted classifier
  # here would leak the held-out observations into every fold
  fold_classifier <- rpart(formula = Purchased ~ .,
                           data = training_fold,
                           method = "class",
                           control = rpart.control(minsplit = 1))
  pruned_DT <- prune(fold_classifier,
                     cp = fold_classifier$cptable[which.min(fold_classifier$cptable[, "xerror"]), "CP"])
  y_pred = predict(pruned_DT, newdata = test_fold[-3], type = 'class')
  y_pred <- factor(y_pred, levels = c(0, 1), labels = c(0, 1))
  cm = table(test_fold[, 3], y_pred)
  accuracy = sum(diag(cm)) / sum(cm)
  return(accuracy)
})
(accuracy_k_folds <- mean(as.numeric(cv)))

# Applying Grid Search to find the best parameters (error loading package!)
# Logistic Model Trees
# install.packages('caret')
library(caret)
# Stored under a new name so the rpart classifier used in the plots below
# is not overwritten
lmt_classifier = train(form = Purchased ~ ., data = training_set, method = 'LMT')
lmt_classifier
lmt_classifier$bestTune
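
# If the LMT dependencies are unavailable, the same grid-search idea works on
# the rpart tree itself (a sketch, not in the original; the cp grid values
# are arbitrary choices):
rpart_grid <- train(Purchased ~ ., data = training_set, method = 'rpart',
                    trControl = trainControl(method = 'cv', number = 10),
                    tuneGrid = expand.grid(cp = seq(0, 0.1, by = 0.01)))
rpart_grid$bestTune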

# Visualising the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier, newdata = grid_set, type = 'class')
plot(set[, -3],
     main = 'Decision Tree Classification (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

# Visualising the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier, newdata = grid_set, type = 'class')
plot(set[, -3], main = 'Decision Tree Classification (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

# Plotting the tree
# These base-graphics calls generate poor plots
plot(classifier)
text(classifier)

# A better approach to plotting the DT

library(rpart)
library(rpart.plot)

prp(classifier)
prp(pruned_DT)

rpart.plot(classifier)
rpart.plot(pruned_DT)
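
# rpart.plot accepts display options; illustrative settings (not from the
# original): extra = 104 adds per-class probabilities and the percentage of
# observations at each node, and under = TRUE places that text below the box
rpart.plot(classifier, type = 2, extra = 104, under = TRUE)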

# Displaying and plotting the Complexity Parameter (cp)
printcp(classifier)
plotcp(classifier)
# Note: rsq.rpart is aimed at regression (anova) trees and warns that it may
# not be applicable for a classification method, but it still produces plots
rsq.rpart(classifier)