Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# Decision Tree Classification
# We may need to prune the DT to avoid overfitting.
# NOTE(review): the original script called rm(list = ls()) here; wiping the
# global environment from inside a script is an anti-pattern (it destroys the
# caller's workspace) and has been removed.

# Importing the dataset; keep columns 3-5 (Age, EstimatedSalary, Purchased)
dataset <- read.csv(file.path(getwd(), "Data/Social_Network_Ads.csv"))
dataset <- dataset[3:5]

# Encoding the target feature as a factor with explicit levels (0, 1)
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1))
# Splitting the dataset into the Training set and Test set (75% / 25%),
# stratified on the target via caTools::sample.split.
# install.packages('caTools')
library(caTools)
set.seed(123)  # reproducible split
is_training <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- subset(dataset, is_training)
test_set <- subset(dataset, !is_training)
# Feature Scaling
# A decision tree does not require scaling (it uses no Euclidean distances),
# but without scaling the grid visualization below takes much longer.
# Observed: scaling degrades the DT's performance slightly.
feature_cols <- -3  # all columns except the target (Purchased)
training_set[feature_cols] <- scale(training_set[feature_cols])
test_set[feature_cols] <- scale(test_set[feature_cols])
# Fitting Decision Tree Classification to the Training set
# install.packages('rpart')
library(rpart)

# Fully grown tree, no pre-pruning (minsplit = 1 allows every node to split)
classifier <- rpart(
  Purchased ~ .,
  data = training_set,
  method = "class",
  control = rpart.control(minsplit = 1)
)

# Post-pruning: cut the tree back at the complexity parameter with the
# lowest cross-validated error (xerror) in the CP table
best_cp <- classifier$cptable[which.min(classifier$cptable[, "xerror"]), "CP"]
pruned_DT <- prune(classifier, cp = best_cp)
# Predicting the Test set results
# The default predict() on an rpart classification tree returns class
# probabilities:
# y_pred <- predict(classifier, newdata = test_set[-3])
# type = 'class' returns the predicted class directly. It is already a factor
# carrying the training levels (0, 1), so the original's redundant
# factor(y_pred, levels = c(0, 1), labels = c(0, 1)) re-encoding is removed.
y_pred <- predict(classifier, newdata = test_set[-3], type = 'class')

# Making the Confusion Matrix (rows = actual labels, columns = predictions)
cm <- as.matrix(table(Actual = test_set[, 3], Predicted = y_pred))

# Calculating accuracy (outer parentheses assign AND print the value)
(accuracy <- mean(y_pred == test_set$Purchased))
# Model Evaluation
# Applying k-Fold Cross Validation to the unpruned DT (minsplit = 1)
# install.packages('caret')
library(caret)
folds <- createFolds(training_set$Purchased, k = 10)
cv <- lapply(folds, function(fold_idx) {
  # Train on everything outside this fold, evaluate on the fold itself
  training_fold <- training_set[-fold_idx, ]
  test_fold <- training_set[fold_idx, ]
  fold_model <- rpart(
    Purchased ~ .,
    data = training_fold,
    method = "class",
    control = rpart.control(minsplit = 1)
  )
  fold_pred <- predict(fold_model, newdata = test_fold[-3], type = 'class')
  fold_pred <- factor(fold_pred, levels = c(0, 1), labels = c(0, 1))
  fold_cm <- table(test_fold[, 3], fold_pred)
  # Accuracy = trace of the confusion matrix over its grand total
  sum(diag(fold_cm)) / sum(fold_cm)
})
(accuracy_k_folds <- mean(as.numeric(cv)))
# Model Evaluation
# Applying k-Fold Cross Validation to the post-pruned DT
library(caret)
folds <- createFolds(training_set$Purchased, k = 10)
cv <- lapply(folds, function(fold_idx) {
  training_fold <- training_set[-fold_idx, ]
  test_fold <- training_set[fold_idx, ]
  # BUG FIX: the original pruned the GLOBAL `classifier`, which was trained on
  # the full training set -- so every fold evaluated a model that had already
  # seen its own test fold (data leakage), and nothing was retrained per fold.
  # Retrain on the fold's training portion, then prune that fold's tree.
  fold_model <- rpart(
    Purchased ~ .,
    data = training_fold,
    method = "class",
    control = rpart.control(minsplit = 1)
  )
  best_cp <- fold_model$cptable[which.min(fold_model$cptable[, "xerror"]), "CP"]
  pruned_fold_model <- prune(fold_model, cp = best_cp)
  fold_pred <- predict(pruned_fold_model, newdata = test_fold[-3], type = 'class')
  fold_pred <- factor(fold_pred, levels = c(0, 1), labels = c(0, 1))
  fold_cm <- table(test_fold[, 3], fold_pred)
  # Accuracy = trace of the confusion matrix over its grand total
  sum(diag(fold_cm)) / sum(fold_cm)
})
(accuracy_k_folds <- mean(as.numeric(cv)))
# Applying Grid Search to find the best parameters
# Logistic Model Trees via caret (note: the backing package can fail to load)
# install.packages('caret')
library(caret)
# BUG FIX: the original assigned this caret `train` object to `classifier`,
# clobbering the rpart model that the visualization and plotting sections
# below still rely on (predict(type = 'class'), prp(), printcp(), plotcp()
# all expect an rpart object, not a caret train object). Store the tuned
# model under its own name instead.
lmt_classifier <- train(form = Purchased ~ ., data = training_set, method = 'LMT')
lmt_classifier
lmt_classifier$bestTune
# Visualising the Training set results
# Classify a fine grid over the (Age, EstimatedSalary) plane to draw the
# model's decision regions, then overlay the actual training points.
library(ElemStatLearn)
set <- training_set
axis_age <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
axis_salary <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(Age = axis_age, EstimatedSalary = axis_salary)
y_grid <- predict(classifier, newdata = grid_set, type = 'class')
plot(set[, -3],
     main = 'Decision Tree Classification (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(axis_age), ylim = range(axis_salary))
contour(axis_age, axis_salary,
        matrix(as.numeric(y_grid), length(axis_age), length(axis_salary)),
        add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
# Visualising the Test set results
# Same decision-region plot as above, but overlaying the held-out test points.
library(ElemStatLearn)
set <- test_set
axis_age <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
axis_salary <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(Age = axis_age, EstimatedSalary = axis_salary)
y_grid <- predict(classifier, newdata = grid_set, type = 'class')
plot(set[, -3],
     main = 'Decision Tree Classification (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(axis_age), ylim = range(axis_salary))
contour(axis_age, axis_salary,
        matrix(as.numeric(y_grid), length(axis_age), length(axis_salary)),
        add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
# Plotting the tree
# Base graphics on an rpart object produce poor-quality plots:
plot(classifier)
text(classifier)

# A better approach: rpart.plot provides prp() and rpart.plot() for
# readable tree diagrams of both the full and the pruned tree.
library(rpart)
library(rpart.plot)
prp(classifier)
prp(pruned_DT)
rpart.plot(classifier)
rpart.plot(pruned_DT)

# Displaying and plotting the Complexity Parameter (cp) table.
# NOTE(review): rsq.rpart is intended for anova (regression) trees; on a
# classification tree it may warn -- confirm it is wanted here.
printcp(classifier)
plotcp(classifier)
rsq.rpart(classifier)
Add Comment
Please sign in to add a comment.