#Check if packages are already installed; if not, install them
# https://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them
list.of.packages <- c("ggplot2", "MLmetrics", "ellipse", "reshape2", "Matrix", "e1071", "gbm", "rpart", "rattle", "GGally", "randomForest", "caret", "skimr", "pROC", "epiDisplay", "tidyverse", "xgboost", "ROCR", "DescTools")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
#install.packages('e1071', dependencies=TRUE) #This avoids a missing e1071 error when generating prediction models using caret
library(rpart)
library(rattle) #fancyRpartPlot chart
library(GGally)
library(randomForest)
library(caret)
library(skimr)
library(epiDisplay)
library(tidyverse) # Loads ggplot2, dplyr, tidyr, readr, stringr, tibble, etc.
library(pROC) ## For the AUC comparison
library(xgboost) ## classification algorithm
library(Matrix)
library(DescTools) ## Creates descriptive charts
library(gbm)
library(reshape2)
library(ellipse)
library(MLmetrics)
#switch off scientific notation for numbers
options(scipen=999)
set.seed(0)
# configure the format of DescTools so it uses a comma rather than an apostrophe for big numbers
options(fmt.abs=structure(list(digits=0, big.mark=","), class="fmt"))
#import data - change the path to where yours is sitting
phishing_website_data <- read_csv("c:\\Chris\\Ass3\\Dataset_PhishingWebsite_A2.csv")
str(phishing_website_data)
#remove any duplicate rows
phishing_website_data <- distinct(phishing_website_data)
#remove the index column from the predictors
phishing_website_data <- dplyr::select(phishing_website_data, -index)
#Check for missing values and a quick histogram of values
skim(phishing_website_data)
head(phishing_website_data)
#the dependent variable is converted to a factor later, where each model needs it for classification
#remove all columns with low variance, i.e. if they are always 1 or always -1
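#The low-variance removal above is described but never implemented; a minimal
#sketch using caret's nearZeroVar() (left commented out because the step was
#not run in the original, and the default freqCut/uniqueCut thresholds are an assumption):
#nzv <- nearZeroVar(phishing_website_data, saveMetrics = TRUE)
#nzv[nzv$nzv, ] #inspect the near-zero-variance columns before dropping them
#phishing_website_data <- phishing_website_data[ , !nzv$nzv]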
#Check the probabilities of the dependent variable
#results <- tab1(phishing_website_data$Result, sort.group = "decreasing", cum.percent = FALSE, main = " Distribution of Result in the whole dataset")
#knitr::kable(results)
Desc(as.factor(phishing_website_data$Result), plotit = TRUE)
#===============================================================================================
#Create a stratified data split for training and testing:
## Create training and testing data sets to use for all models.
## Randomly choose 70% of the data set as training data.
#===============================================================================================
#Using caret to create a stratified split on the Result column
trainIndex <- createDataPartition(phishing_website_data$Result, p = .7,
                                  list = FALSE,
                                  times = 1)
head(trainIndex)
train <- phishing_website_data[trainIndex, ]
test <- phishing_website_data[-trainIndex, ]
#Show the Result splits in training and test
Desc(as.factor(train$Result), plotit = TRUE)
Desc(as.factor(test$Result), plotit = TRUE)
#=====================================================
#Training data
train$Result <- ifelse(train$Result == -1, 0, 1)
trainY <- as.factor(train$Result)
label_name <- 'Result'
trainX <- train %>% select(-Result)
predictors <- names(train)[names(train) != label_name]
#Testing data
testX <- test %>% select(-Result)
test$Result <- ifelse(test$Result == -1, 0, 1)
testY <- as.factor(test$Result)
#===============================================================================================
#visualise the correlations (makes it easier to see the correlation strength)
ggcorr(phishing_website_data, label = TRUE, label_size = 2, size = 3, hjust = 1, legend.size = 5, legend.position = "none") +
  labs(title="Pearson Correlation matrix (All columns)")
#Find correlations between columns and save them to a new data frame
correlations <- as.data.frame(cor(train, method = "pearson"))
head(correlations)
#we are only interested in correlations relating to the Result column
correlations <- dplyr::select(correlations, Result)
#dplyr's filter omits rownames, so the workaround is to use functions from the tibble library:
#copy the rownames to a new column
correlations <- correlations %>% rownames_to_column('new_column')
#keep only columns whose correlation with Result exceeds 0.25
correlations <- filter_at(correlations, vars(-new_column), any_vars(. > 0.25))
#correlations <- filter_at(correlations, vars(-new_column), any_vars(. != 1)) #removes the Result-to-Result (1.0) correlation
#view the result
correlations$new_column
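#Note: the filter above keeps only positive correlations, so strongly negative
#predictors are dropped. A hedged alternative (not run here, since it would
#change which columns get selected) is to filter on the absolute value:
#correlations <- filter_at(correlations, vars(-new_column), any_vars(abs(.) > 0.25))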
#==============================================================================================
## We can see from the correlation matrix visualisation that
## only a small number of columns have a strong correlation with
## the Result column.
## Attributes with a correlation above the 0.25 filter threshold are:
## URL_of_Anchor
## SSLfinal_State
## Prefix_Suffix
## having_Sub_Domain
## Request_URL
## web_traffic
#Select the strongly correlated columns along with the Result column for the model.
#Another loaded library also defines select(), so dplyr:: ensures the right one is used;
#the correlations data set picks the columns automatically (Result itself passes the filter with a correlation of 1.0)
dt.train <- dplyr::select(train, all_of(correlations$new_column))
dt.test <- dplyr::select(test, all_of(correlations$new_column))
ggcorr(dt.train, label = TRUE, label_size = 5, size = 4, hjust = 0.75, legend.size = 5, legend.position = "none") +
  labs(title="Pearson Correlation matrix")
################################################################################
## BUILD MODELS
# Build a decision tree from the revised phishing_website_data dataset to predict Result
#convert the dependent variable Result into a factor
dt.train$Result <- as.factor(train$Result)
dt.test$Result <- as.factor(test$Result)
#build a decision tree with no tuning parameters (accuracy about 92%)
tree <- rpart(Result ~ ., data = dt.train, method = "class")
# Report the model
print(tree)
## plot the tree structure
fancyRpartPlot(tree, main = "Decision Tree Model of phishing website Data")
## print the tree structure
summary(tree)
################################################################################
## MODEL EVALUATION
## make predictions using the decision tree model
dt.predictions <- predict(tree, dt.test, type = "class")
## Using caret to obtain the model evaluation
ref <- as.factor(dt.test$Result)
dt.cm <- confusionMatrix(dt.predictions, ref)
dt.cm
## look at features whose density has a different shape when the Result changes
diff_features <- phishing_website_data[c(7, 8, 9, 13, 14, 15, 24, 26)]
featurePlot(x = diff_features,
            y = as.factor(phishing_website_data$Result),
            plot = "density",
            scales = list(x = list(relation="free"),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|"
            #layout = c(4, 8),
            #auto.key = list(columns = 3)
            )
################################################################################
## RANDOM FOREST
rf.train <- train
rf.test <- test
# change the Result column to a factor (since there are only 2 values, we need classification rather than regression)
rf.train$Result <- as.factor(rf.train$Result)
rf.test$Result <- as.factor(rf.test$Result)
# Create a Random Forest model with default parameters
model1 <- randomForest(Result ~ ., data = rf.train, importance = TRUE)
model1
# Use a for loop to identify the best mtry (number of variables per split) for the most accurate model.
a <- c()
#create variables to store the best mtry and accuracy across iterations
mtryResults <- as.data.frame(matrix(ncol = 2, nrow = 0))
colnames(mtryResults) <- c("mtry", "accuracy")
mtry.value <- 0
accuracy <- 0
for (i in 3:31) {
  print(i) #output line to see the current iteration
  #create the model
  model3 <- randomForest(Result ~ ., data = rf.train, ntree = 500, mtry = i, importance = TRUE)
  #run a prediction against the current model iteration
  predValid <- predict(model3, rf.test, type = "class")
  #save the accuracy to the vector
  a[i-2] <- mean(predValid == rf.test$Result)
  print(a[i-2])
  #save mtry and accuracy to the data frame; gives a better data set to visualise
  resultTemp <- data.frame(mtry = i, accuracy = mean(predValid == rf.test$Result))
  mtryResults <- rbind(mtryResults, resultTemp)
  rm(resultTemp)
  #check if the current iteration's accuracy is the highest and save its mtry to pass as the tuning parameter
  if(a[i-2] > accuracy)
  {
    mtry.value <- i
    accuracy <- a[i-2]
  }
}
#display the accuracy of each iteration
a
#plot the results using ggplot
mtry.plot <- ggplot(mtryResults, aes(x=mtry, y=accuracy)) +
  geom_point() +
  geom_line() +
  theme_bw() +
  labs(title="Plot of accuracy for each mtry value")
mtry.plot
#display the selected best mtry
print(mtry.value)
# Fine-tune the parameters of the Random Forest model
model2 <- randomForest(Result ~ ., data = rf.train, ntree = 500, mtry = mtry.value, importance = TRUE)
model2
## MODEL EVALUATION
## Predict the test set outcomes, reporting class labels
rf.predictions <- predict(model2, rf.test, type = "class")
rf.cm <- confusionMatrix(rf.predictions, rf.test$Result)
rf.cm
## calculate the confusion matrix (rows = predictions, columns = actuals)
rf.confusion <- table(rf.predictions, rf.test$Result)
print(rf.confusion)
## accuracy
rf.accuracy <- sum(diag(rf.confusion)) / sum(rf.confusion)
print(rf.accuracy)
#Precision per class (exactness %): true positives over everything *predicted* as that class, i.e. a row sum since rows hold the predictions
rf.precision.A <- rf.confusion[1,1] / sum(rf.confusion[1,])
print(rf.precision.A)
rf.precision.B <- rf.confusion[2,2] / sum(rf.confusion[2,])
print(rf.precision.B)
#Overall (macro-averaged) precision
overall.rf.precision <- (rf.precision.A + rf.precision.B)/2
print(overall.rf.precision)
#Recall per class (completeness %): true positives over everything *actually* in that class, i.e. a column sum
rf.recall.A <- rf.confusion[1,1] / sum(rf.confusion[,1])
print(rf.recall.A)
rf.recall.B <- rf.confusion[2,2] / sum(rf.confusion[,2])
print(rf.recall.B)
#Overall (macro-averaged) recall
overall.rf.recall <- (rf.recall.A + rf.recall.B)/2
print(overall.rf.recall)
#F1 score (harmonic mean of precision and recall)
rf.f1 <- 2 * overall.rf.precision * overall.rf.recall / (overall.rf.precision + overall.rf.recall)
print(rf.f1)
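# A cross-check on the hand-rolled metrics above (an addition, not in the
# original script): caret reports precision/recall/F1 directly from the
# confusion matrix object. Note these are for the positive class only,
# not the macro averages computed above.
confusionMatrix(rf.predictions, rf.test$Result, mode = "prec_recall")
rf.cm$byClass[c("Precision", "Recall", "F1")]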
# To check the important variables
#importance(model2)
#varImpPlot(model2)
###############################################################################################
#
# Logistic Regression
#quick look at the dependent variable (now coded 0/1)
train$Result
#
# From the correlation results, keep only the significant predictors
classifier <- glm(formula = Result ~ URL_of_Anchor + SSLfinal_State + having_Sub_Domain + Request_URL + web_traffic,
                  #having_IP_Address + having_Sub_Domain + SSLfinal_State + HTTPS_token + URL_of_Anchor + Links_in_tags + SFH + Redirect + DNSRecord + web_traffic + Google_Index + Links_pointing_to_page,
                  family = binomial,
                  data = train)
#show a summary of the model
summary(classifier)
##This shows the fields which have an impact on the prediction (remove those without any significance stars, as sketched below)
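# A hedged way to list the significant predictors programmatically instead of
# reading the stars by eye (an addition to the original; column 4 of the glm
# coefficient matrix holds the p-values, and lr.coefs is an illustrative name):
lr.coefs <- summary(classifier)$coefficients
rownames(lr.coefs)[lr.coefs[, 4] < 0.05]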
prob_pred <- predict(classifier, type = 'response', newdata = test[,predictors]) # this gives the probabilities
y_pred <- ifelse(prob_pred > 0.5, 1, 0) # use the probabilities to classify each site as phishing (1) or legitimate (0)
y_pred <- as.factor(y_pred)
#str(y_pred)
#y_pred
# Make the confusion matrix
lr.cm <- confusionMatrix(data = y_pred, reference = testY)
lr.cm
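# pROC was loaded "for the AUC comparison" but never applied to this model; a
# small addition using the probabilities already computed above:
lr.roc <- roc(response = testY, predictor = prob_pred)
auc(lr.roc)
plot(lr.roc, main = "ROC curve - Logistic Regression")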
#====================================================================================
#GBM Model
train$Result <- as.factor(train$Result)
test$Result <- as.factor(test$Result)
#caret needs factor outcomes with valid R names when classProbs=TRUE, so recode 0/1 as N/Y
trainY <- as.factor(ifelse(train$Result == 0, "N", "Y"))
head(trainY)
testY <- as.factor(ifelse(test$Result == 0, "N", "Y"))
#create an object to control the cross-validation performed
myControl <- trainControl(method='repeatedcv',
                          number=5,
                          returnResamp='none',
                          summaryFunction=twoClassSummary, # Use AUC to pick the best model
                          classProbs=TRUE,
                          allowParallel = TRUE)
#Create a grid of candidate values; caret selects the best combination
grid <- expand.grid(interaction.depth=c(1,2), # Depth of variable interactions
                    n.trees=c(250, 300), # Num trees to fit
                    shrinkage=c(0.01,0.1), # Try 2 values for the learning rate
                    n.minobsinnode = 20)
gbm.tune <- train(x=train[,predictors], y=trainY,
                  method = "gbm",
                  metric = "ROC",
                  trControl = myControl,
                  tuneGrid=grid,
                  verbose=FALSE)
#The best tuning settings. Could increasing the trees even further improve the scores? It has so far (see the sketch below).
gbm.tune$bestTune
plot(gbm.tune)
res <- gbm.tune$results
res
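# To probe the question above (would even more trees keep improving the ROC?),
# the grid could be extended - a hedged sketch, left commented out since it was
# not run in the original and the candidate values are illustrative:
#grid_more <- expand.grid(interaction.depth = 2, n.trees = c(300, 500, 1000),
#                         shrinkage = 0.1, n.minobsinnode = 20)
#gbm.tune2 <- train(x = train[,predictors], y = trainY, method = "gbm",
#                   metric = "ROC", trControl = myControl, tuneGrid = grid_more, verbose = FALSE)
#plot(gbm.tune2)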
gbm.pred <- predict(gbm.tune, testX)
str(gbm.pred)
gbm.cm <- confusionMatrix(gbm.pred, testY)
gbm.cm
#=======================================================================================
#Train the knn model (reusing myControl from the GBM section)
model_knn <- train(train[,predictors], trainY, method='knn', trControl=myControl, tuneLength=3)
#Predict using the knn model; keep the predictions in their own variable rather than mutating test
knn.pred <- predict(object = model_knn, test[,predictors])
#Check the accuracy of the knn model (predictions first, reference second)
knn.cm <- confusionMatrix(knn.pred, testY)
knn.cm
#========================================================================================
#XGBoost
input_x <- as.matrix(dplyr::select(train, -Result))
input_y <- train$Result
#Default tuning
grid_default <- expand.grid(
  nrounds = 300,
  max_depth = 30,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)
train_control <- caret::trainControl(
  method = "none",
  verboseIter = FALSE, # no training log
  allowParallel = TRUE # FALSE for reproducible results
)
xgb_base <- caret::train(
  x = input_x,
  y = input_y,
  trControl = train_control,
  tuneGrid = grid_default,
  method = "xgbTree",
  verbose = TRUE
)
xgb_base$bestTune
predictions <- predict(xgb_base, test)
cm <- table(predictions = predictions, actual = test$Result)
cm
xgb.accuracy <- sum(diag(cm)) / sum(cm)
print(xgb.accuracy)
# Tuned grid and cross-validation
tune_grid <- expand.grid(
  nrounds = c(30, 400, 500),
  max_depth = 19,
  eta = 0.3,
  gamma = 1,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)
tune_control <- caret::trainControl(
  method = "cv", # cross-validation
  number = 3, # with n folds
  #index = createFolds(tr_treated$Id_clean), # fix the folds
  verboseIter = FALSE, # no training log
  allowParallel = TRUE # FALSE for reproducible results
)
xgb_tune <- caret::train(
  x = input_x,
  y = input_y,
  trControl = tune_control,
  tuneGrid = tune_grid,
  method = "xgbTree",
  verbose = TRUE
)
xgb_tune$bestTune
predictions <- predict(xgb_tune, test)
xgb.cm <- confusionMatrix(predictions, test$Result)
xgb.cm
###############################################################################
## BAGGING
#set up cross-validation
cvcontrol <- trainControl(method="repeatedcv", number = 10,
                          allowParallel=TRUE)
#Using treebag
train.bagg <- train(Result ~ .,
                    data=train,
                    method="treebag",
                    trControl=cvcontrol,
                    importance=TRUE)
# show the bagging results
train.bagg
#plot the variable importance
plot(varImp(train.bagg))
# obtain class predictions
bagg.classTest <- predict(train.bagg,
                          newdata = test,
                          type="raw")
head(bagg.classTest)
# Create the confusion matrix (predictions first, reference second)
bagg.cm <- confusionMatrix(bagg.classTest, test$Result)
bagg.cm
#prepare the data for the AUC
bagg.probs <- predict(train.bagg,
                      newdata=test,
                      type="prob")
head(bagg.probs)
# Plot the ROC curve
rocCurve.bagg <- roc(test$Result, bagg.probs[,1])
plot(rocCurve.bagg, col=c(6))
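# The area under the curve itself is never printed; pROC's auc() reports it
# directly from the roc object built above (a small addition to the original):
auc(rocCurve.bagg)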
###############################################################################
# Model Comparison
# View the confusion matrices
dt.cm
rf.cm
lr.cm
gbm.cm
knn.cm
xgb.cm
bagg.cm
#Create a data frame to store the model results
ModelResults <- as.data.frame(matrix(ncol = 3, nrow = 0))
colnames(ModelResults) <- c("ModelType", "Metric", "Value")
#Save each result metric to the data frame
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Decision Tree", Metric = "Accuracy", Value = dt.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Random Forest", Metric = "Accuracy", Value = rf.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Logistic Regression", Metric = "Accuracy", Value = lr.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Gradient Boosting", Metric = "Accuracy", Value = gbm.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "K-Nearest Neighbour", Metric = "Accuracy", Value = knn.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "XGBoost", Metric = "Accuracy", Value = xgb.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Bagging", Metric = "Accuracy", Value = bagg.cm[["overall"]][["Accuracy"]] * 100))
ModelResults
# wrap the model type text so the plot axis labels stay readable
ModelResults$ModelType <- str_wrap(ModelResults$ModelType, width = 10)
#round the Value
ModelResults$Value <- round(ModelResults$Value, 2)
#plot the overall results
ggplot(data=ModelResults, aes(x=ModelType, y=Value, group=Metric)) +
  geom_line(aes(color=Metric)) +
  geom_point(aes(color=Metric)) +
  #geom_text(aes(label=Value)) +
  theme_bw() +
  labs(title="Prediction Model Accuracy Comparison") +
  facet_wrap(~Metric)
#clean up the environment and free the memory
rm(list = ls())
gc()