#Check if packages are already installed; if not, install them
# https://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them

list.of.packages <- c("ggplot2", "MLmetrics", "ellipse", "reshape2", "Matrix", "e1071", "gbm", "rpart", "rattle", "GGally", "randomForest", "caret", "skimr", "pROC", "epiDisplay", "tidyverse", "xgboost", "ROCR", "DescTools")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
#install.packages('e1071', dependencies=TRUE) # avoids a missing e1071 error when generating prediction models using caret

library(rpart)
library(rattle)        # fancyRpartPlot chart
library(GGally)
library(randomForest)
library(caret)
library(skimr)
library(epiDisplay)
library(tidyverse)     # loads ggplot2, dplyr, tibble, stringr and friends
library(pROC)          # for the AUC comparison
library(xgboost)       # classification algorithm
library(Matrix)
library(DescTools)     # creates descriptive charts
library(gbm)
library(reshape2)
library(ellipse)
library(MLmetrics)


#switch off scientific notation for numbers
options(scipen=999)
set.seed(0)

# configure the format of DescTools so it uses a comma rather than an apostrophe for big numbers
options(fmt.abs=structure(list(digits=0, big.mark=","), class="fmt"))

#import data - change the path to where yours is sitting
phishing_website_data <- read_csv("c:\\Chris\\Ass3\\Dataset_PhishingWebsite_A2.csv")
str(phishing_website_data)


#remove any duplicate rows
phishing_website_data <- distinct(phishing_website_data)

#remove the index column from the predictors
phishing_website_data <- dplyr::select(phishing_website_data, -index)

#check for missing values and a quick histogram of values
skim(phishing_website_data)

head(phishing_website_data)

#convert the dependent variable to a factor for easier prediction methods (done per model below)



#remove all columns with low variance, i.e. columns that are always 1 or always -1

#check the distribution of the dependent variable
#results <- tab1(phishing_website_data$Result, sort.group = "decreasing", cum.percent = FALSE, main = "Distribution of Result in the whole dataset")
#knitr::kable(results)
Desc(as.factor(phishing_website_data$Result), plotit = TRUE)


#===============================================================================================

#Create a stratified data split for training and testing:



## Create training and testing data sets to use for all models.
## Randomly choose 70% of the data set as training data.


#===============================================================================================
#Use caret to create a stratified split on the Result column
trainIndex <- createDataPartition(phishing_website_data$Result, p = .7,
                                  list = FALSE,
                                  times = 1)
head(trainIndex)

train <- phishing_website_data[ trainIndex, ]
test  <- phishing_website_data[-trainIndex, ]

#Show the Result splits in training and test

Desc(as.factor(train$Result), plotit = TRUE)

Desc(as.factor(test$Result), plotit = TRUE)

#=====================================================
#Training data
train$Result <- ifelse(train$Result == -1, 0, 1)
trainY <- as.factor(train$Result)

label_name <- 'Result'
trainX <- train %>% select(-Result)
predictors <- names(train)[names(train) != label_name]

#Testing data
testX <- test %>% select(-Result)
test$Result <- ifelse(test$Result == -1, 0, 1)
testY <- as.factor(test$Result)


#===============================================================================================

#visualise the correlations (makes it easier to see the correlation strength)
ggcorr(phishing_website_data, label = TRUE, label_size = 2, size = 3, hjust = 1, legend.size = 5, legend.position = "none") +
  labs(title="Pearson Correlation matrix (All columns)")

#find the correlations between columns and save them to a new data frame
correlations <- as.data.frame(cor(train, method = "pearson"))
head(correlations)

#we are only interested in correlations relating to the Result column
correlations <- dplyr::select(correlations, Result)

#dplyr's filter omits rownames, so a workaround is to use functions from the tibble library:
#copy the rownames to a new column
correlations <- correlations %>% rownames_to_column('new_column')

#keep only the columns with a positive correlation above 0.25
#(this drops strongly negative correlations; see the sketch below for an absolute-value variant)
correlations <- filter_at(correlations, vars(-new_column), any_vars(. > 0.25))
#correlations <- filter_at(correlations, vars(-new_column), any_vars(. != 1)) # removes the Result-to-Result (1.0) correlation

#view the result
correlations$new_column
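
# A minimal sketch (an assumption, not part of the original analysis): the filter above keeps
# only positive correlations, so strongly negative predictors are dropped. An absolute-value
# variant would keep both signs. Recomputed from train so it is independent of the already
# filtered frame above; the 0.25 threshold is carried over.
correlations_abs <- as.data.frame(cor(train, method = "pearson")) %>%
  dplyr::select(Result) %>%
  rownames_to_column('new_column') %>%
  filter_at(vars(-new_column), any_vars(abs(.) > 0.25 & . != 1))
correlations_abs$new_column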


#==============================================================================================
## We can see from the correlation matrix visualisation that only a small
## number of columns have a strong correlation with the Result column.

## Attributes with a correlation value above 0.25 are:
## URL_of_Anchor
## SSLfinal_State
## Prefix_Suffix
## having_Sub_Domain
## Request_URL
## web_traffic

#Select the strongly correlated columns along with the Result column for the model.

#another library has a select statement, so dplyr:: ensures that's the one used;
#the correlations data set picks the columns automatically
dt.train <- dplyr::select(train, correlations$new_column)
dt.test  <- dplyr::select(test, correlations$new_column)


ggcorr(dt.train, label = TRUE, label_size = 5, size = 4, hjust = 0.75, legend.size = 5, legend.position = "none") +
  labs(title="Pearson Correlation matrix")


################################################################################
## BUILD MODELS

# Build a decision tree from the revised phishing_website_data dataset to predict the Result

#convert the dependent variable Result into a factor
dt.train$Result <- as.factor(train$Result)
dt.test$Result <- as.factor(test$Result)

#build a decision tree with default parameters (accuracy about 92%)
tree <- rpart(Result ~ ., data = dt.train, method = "class")


# report the model
print(tree)


## plot the tree structure
fancyRpartPlot(tree, main = "Decision Tree Model of phishing website Data")

## print the detailed tree summary
summary(tree)
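
# A minimal sketch (an assumption, not part of the original analysis): rpart's built-in
# cross-validation results can show whether pruning the tree would help.
printcp(tree)   # complexity parameter table with cross-validated error
plotcp(tree)    # visualise error against the complexity parameter
tree.pruned <- prune(tree, cp = tree$cptable[which.min(tree$cptable[,"xerror"]), "CP"])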

################################################################################
## MODEL EVALUATION
## make predictions using the decision tree model

dt.predictions <- predict(tree, dt.test, type = "class")


## use caret to obtain the model evaluation
ref <- as.factor(dt.test$Result)
dt.cm <- confusionMatrix(dt.predictions, ref)
dt.cm
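
# A minimal sketch using MLmetrics (loaded above but otherwise unused) to pull out a
# single-number F1 score for the decision tree; "1" as the positive class is an assumption.
dt.f1 <- MLmetrics::F1_Score(y_true = ref, y_pred = dt.predictions, positive = "1")
print(dt.f1)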

## look at features whose density has a different shape when the Result changes
diff_features <- phishing_website_data[c(7, 8, 9, 13, 14, 15, 24, 26)]

featurePlot(x = diff_features,
            y = as.factor(phishing_website_data$Result),
            plot = "density",
            scales = list(x = list(relation="free"),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|"
            #layout = c(4, 8),
            #auto.key = list(columns = 3)
)

################################################################################
## RANDOM FOREST


rf.train <- train
rf.test  <- test

# change the Result column to a factor (there are only 2 values, so we need classification rather than regression)
rf.train$Result <- as.factor(rf.train$Result)
rf.test$Result  <- as.factor(rf.test$Result)

# create a Random Forest model with default parameters
model1 <- randomForest(Result ~ ., data = rf.train, importance = TRUE)
model1


# use a for loop to identify the best mtry (number of variables per split) for the most accurate model
a <- c()

#create variables to store the best mtry and accuracy across iterations
mtryResults <- as.data.frame(matrix(ncol = 2, nrow = 0))
x <- c("mtry", "accuracy")
colnames(mtryResults) <- x
rm(x)

mtry.value <- 0
accuracy <- 0
for (i in 3:31) {
  print(i) # show the current iteration

  #create the model
  model3 <- randomForest(Result ~ ., data = rf.train, ntree = 500, mtry = i, importance = TRUE)

  #run a prediction against the current model iteration
  predValid <- predict(model3, rf.test, type = "class")

  #save the accuracy to a vector
  a[i-2] <- mean(predValid == rf.test$Result)
  print(a[i-2])

  #save mtry and accuracy to a data frame; gives a better data set to visualise
  resultTemp <- data.frame(mtry = i, accuracy = mean(predValid == rf.test$Result))
  mtryResults <- rbind(mtryResults, resultTemp)
  rm(resultTemp)


  #check whether the current iteration's accuracy is the highest so far and, if so, save its mtry as the fine-tuning parameter
  if(a[i-2] > accuracy)
  {
    mtry.value <- i
    accuracy <- a[i-2]
  }
}

#display the accuracy of each iteration
a

#plot the results using ggplot
mtry.plot <- ggplot(mtryResults, aes(x=mtry, y=accuracy)) +
  geom_point() +
  geom_line() +
  theme_bw() +
  labs(title="Accuracy for each mtry value")
mtry.plot

#display the selected best mtry
print(mtry.value)

# fine-tune the parameters of the Random Forest model
model2 <- randomForest(Result ~ ., data = rf.train, ntree = 500, mtry = mtry.value, importance = TRUE)
model2


## MODEL EVALUATION

## predict test set outcomes, reporting class labels
rf.predictions <- predict(model2, rf.test, type="class")

rf.cm <- confusionMatrix(rf.predictions, rf.test$Result)
rf.cm

## calculate the confusion matrix (rows = predictions, columns = actuals)
rf.confusion <- table(rf.predictions, rf.test$Result)
print(rf.confusion)

## accuracy
rf.accuracy <- sum(diag(rf.confusion)) / sum(rf.confusion)
print(rf.accuracy)

#Precision per class (exactness %): correct predictions of a class out of everything predicted as that class (row sums)
rf.precision.A <- rf.confusion[1,1] / sum(rf.confusion[1,])
print(rf.precision.A)

rf.precision.B <- rf.confusion[2,2] / sum(rf.confusion[2,])
print(rf.precision.B)

#Overall precision (exactness %)
overall.rf.precision <- (rf.precision.A + rf.precision.B)/2
print(overall.rf.precision)

#Recall per class (completeness %): correct predictions of a class out of everything that actually is that class (column sums)
rf.recall.A <- rf.confusion[1,1] / sum(rf.confusion[,1])
print(rf.recall.A)

rf.recall.B <- rf.confusion[2,2] / sum(rf.confusion[,2])
print(rf.recall.B)


#Overall recall (completeness %)
overall.rf.recall <- (rf.recall.A + rf.recall.B)/2
print(overall.rf.recall)

#F1 score (harmonic mean of precision and recall)
rf.f1 <- 2 * overall.rf.precision * overall.rf.recall / (overall.rf.precision + overall.rf.recall)
print(rf.f1)

# to check the important variables
#importance(model2)
#varImpPlot(model2)
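
# A minimal sketch (an assumption, not part of the original analysis): pROC is loaded above
# for the AUC comparison, so build an ROC curve for the tuned forest from class probabilities;
# "1" as the phishing (positive) class is an assumption.
rf.probs <- predict(model2, rf.test, type = "prob")   # matrix of class probabilities
rocCurve.rf <- roc(rf.test$Result, rf.probs[, "1"])
plot(rocCurve.rf, col = c(4))
auc(rocCurve.rf)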



###############################################################################################
#
# Logistic Regression


#inspect the dependent variable
train$Result


# from the previous results, keep only the predictors that were significant
classifier = glm(formula = Result ~ URL_of_Anchor + SSLfinal_State + having_Sub_Domain + Request_URL + web_traffic,
                 #having_IPhaving_IP_Address + having_Sub_Domain + SSLfinal_State + HTTPS_token + URL_of_Anchor + Links_in_tags + SFH + Redirect + DNSRecord + web_traffic + Google_Index + Links_pointing_to_page,
                 family = binomial,
                 data = train)

#show a summary of the model
summary(classifier)

##This shows which fields have an impact on the Result (remove those without any stars)

prob_pred = predict(classifier, type = 'response', newdata = test[,predictors]) # gives the probabilities


y_pred = ifelse(prob_pred > 0.5, 1, 0) # use the probabilities to classify each site as phishing (1) or legitimate (0)
y_pred <- as.factor(y_pred)
#str(y_pred)
#y_pred

# make the confusion matrix

lr.cm <- confusionMatrix(data = y_pred, reference = testY)
lr.cm
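
# A minimal sketch (an assumption, not part of the original analysis): an ROC curve for the
# logistic model, using the raw probabilities rather than the 0.5-thresholded classes.
rocCurve.lr <- roc(testY, prob_pred)
plot(rocCurve.lr, col = c(2))
auc(rocCurve.lr)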

#====================================================================================
#GBM Model

train$Result <- as.factor(train$Result)
test$Result  <- as.factor(test$Result)

#recode the outcome with valid R factor level names, as caret's class probabilities require
trainY <- factor(ifelse(train$Result == 0, "N", "Y"))
head(trainY)
testY <- factor(ifelse(test$Result == 0, "N", "Y"))


#create an object to control the number of cross-validations performed
myControl <- trainControl(method='repeatedcv',
                          number=5,
                          returnResamp='none',
                          summaryFunction=twoClassSummary,  # use AUC to pick the best model
                          classProbs=TRUE,
                          allowParallel = TRUE)

#create a grid of candidate values and let caret select the best combination
grid <- expand.grid(interaction.depth=c(1,2),   # depth of variable interactions
                    n.trees=c(250, 300),        # number of trees to fit
                    shrinkage=c(0.01,0.1),      # try 2 values for the learning rate
                    n.minobsinnode = 20)

gbm.tune <- train(x=train[,predictors], y=trainY,
                  method = "gbm",
                  metric = "ROC",
                  trControl = myControl,
                  tuneGrid=grid,
                  verbose=FALSE)

#the best tuning settings; increasing the number of trees has improved the scores so far, so more trees may help further
gbm.tune$bestTune
plot(gbm.tune)
res <- gbm.tune$results
res

gbm.pred <- predict(gbm.tune, testX)
str(gbm.pred)

gbm.cm <- confusionMatrix(gbm.pred, testY)
gbm.cm
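
# A minimal sketch (an assumption, not part of the original analysis): class probabilities
# from the tuned GBM for an ROC curve comparable with the other models; "Y" is the positive class.
gbm.probs <- predict(gbm.tune, testX, type = "prob")
rocCurve.gbm <- roc(testY, gbm.probs[, "Y"])
plot(rocCurve.gbm, col = c(3))
auc(rocCurve.gbm)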

#=======================================================================================

#train the kNN model
model_knn <- train(train[,predictors], trainY, method='knn', trControl=myControl, tuneLength=3)

#predict using the kNN model
test$pred_knn <- predict(object = model_knn, test[,predictors])

#check the accuracy of the kNN model (predictions first, reference second)
knn.cm <- confusionMatrix(test$pred_knn, testY)
knn.cm
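
# A minimal sketch (an assumption, not part of the original analysis): inspect which k the
# tuneLength search settled on, and how the tuning metric varied with the number of neighbours.
model_knn$bestTune
plot(model_knn)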



#========================================================================================
#XGBoost


input_x <- as.matrix(dplyr::select(train, -Result))
input_y <- train$Result

#default tuning
grid_default <- expand.grid(
  nrounds = 300,
  max_depth = 30,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)
train_control <- caret::trainControl(
  method = "none",
  verboseIter = FALSE, # no training log
  allowParallel = TRUE # FALSE for reproducible results
)
xgb_base <- caret::train(
  x = input_x,
  y = input_y,
  trControl = train_control,
  tuneGrid = grid_default,
  method = "xgbTree",
  verbose = TRUE
)
xgb_base$bestTune
predictions <- predict(xgb_base, test)
cm <- table(predictions = predictions, actual = test$Result)
cm
xgb.accuracy <- sum(diag(cm)) / sum(cm)
print(xgb.accuracy)
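
# A minimal sketch (an assumption, not part of the original analysis): caret's varImp works
# on the fitted xgbTree model and shows which features drive the predictions.
plot(varImp(xgb_base), top = 10)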
# tuned grid and cross-validation

tune_grid <- expand.grid(
  nrounds = c(30, 400, 500),
  max_depth = 19,
  eta = 0.3,
  gamma = 1,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)
tune_control <- caret::trainControl(
  method = "cv", # cross-validation
  number = 3, # with n folds
  #index = createFolds(tr_treated$Id_clean), # fix the folds
  verboseIter = FALSE, # no training log
  allowParallel = TRUE # FALSE for reproducible results
)
xgb_tune <- caret::train(
  x = input_x,
  y = input_y,
  trControl = tune_control,
  tuneGrid = tune_grid,
  method = "xgbTree",
  verbose = TRUE
)
xgb_tune$bestTune
predictions <- predict(xgb_tune, test)

xgb.cm <- confusionMatrix(predictions, test$Result)
xgb.cm



###############################################################################
## BAGGING

#set up cross-validation
cvcontrol <- trainControl(method="repeatedcv", number = 10,
                          allowParallel=TRUE)

#use treebag
train.bagg <- train(Result ~ .,
                    data=train,
                    method="treebag",
                    trControl=cvcontrol,
                    importance=TRUE)

# show the bagging results
train.bagg

#plot variable importance
plot(varImp(train.bagg))

# obtain class predictions
bagg.classTest <- predict(train.bagg,
                          newdata = test,
                          type="raw")
head(bagg.classTest)

# create the confusion matrix (predictions first, reference second)
bagg.cm <- confusionMatrix(bagg.classTest, test$Result)
bagg.cm


#prepare the data for the AUC
bagg.probs <- predict(train.bagg,
                      newdata=test,
                      type="prob")
head(bagg.probs)

# plot the ROC curve
rocCurve.bagg <- roc(test$Result, bagg.probs[,1])
plot(rocCurve.bagg, col=c(6))
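
# A minimal sketch (an assumption, not part of the original analysis): the AUC values of
# the ROC curves built above, collected in one place for comparison.
auc(rocCurve.bagg)
auc(rocCurve.rf)
auc(rocCurve.lr)
auc(rocCurve.gbm)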

###############################################################################
# Model Comparison

# view the confusion matrices
dt.cm
rf.cm
lr.cm
gbm.cm
knn.cm
xgb.cm
bagg.cm


#create a data frame to store the model results
ModelResults <- as.data.frame(matrix(ncol = 3, nrow = 0))
colnames(ModelResults) <- c("ModelType", "Metric", "Value")


#save each model's accuracy to the data frame
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Decision Tree", Metric = "Accuracy", Value = dt.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Random Forest", Metric = "Accuracy", Value = rf.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Logistic Regression", Metric = "Accuracy", Value = lr.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Gradient Boosting", Metric = "Accuracy", Value = gbm.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "K-Nearest Neighbour", Metric = "Accuracy", Value = knn.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "XGBoost", Metric = "Accuracy", Value = xgb.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Bagging", Metric = "Accuracy", Value = bagg.cm[["overall"]][["Accuracy"]] * 100))

ModelResults

# wrap the text of the model type for plotting
ModelResults$ModelType <- str_wrap(ModelResults$ModelType, width = 10)

#round the values
ModelResults$Value <- round(ModelResults$Value, 2)

#plot the overall results
ggplot(data=ModelResults, aes(x=ModelType, y=Value, group=Metric)) +
  geom_line(aes(color=Metric)) +
  geom_point(aes(color=Metric)) +
  #geom_text(aes(label=Value)) +
  theme_bw() +
  labs(title="Prediction Model Accuracy Comparison") +
  facet_wrap(~Metric)
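
# A minimal sketch (an assumption, not part of the original analysis): accuracy alone can
# flatter a model on imbalanced data, so also compare Cohen's Kappa, which caret already
# reports in the same confusionMatrix objects.
kappas <- c(DecisionTree       = dt.cm[["overall"]][["Kappa"]],
            RandomForest       = rf.cm[["overall"]][["Kappa"]],
            LogisticRegression = lr.cm[["overall"]][["Kappa"]],
            GradientBoosting   = gbm.cm[["overall"]][["Kappa"]],
            kNN                = knn.cm[["overall"]][["Kappa"]],
            XGBoost            = xgb.cm[["overall"]][["Kappa"]],
            Bagging            = bagg.cm[["overall"]][["Kappa"]])
round(kappas, 3)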


#clean up the environment and free memory
rm(list = ls())
gc()