#Check if packages are already installed; if not, install them
# https://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them
list.of.packages <- c("ggplot2", "MLmetrics", "ellipse", "reshape2", "Matrix", "e1071", "gbm", "rpart", "rattle", "GGally", "randomForest", "caret", "skimr", "pROC", "epiDisplay", "tidyverse", "xgboost", "ROCR", "DescTools")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
#install.packages('e1071', dependencies=TRUE) #This avoids a missing e1071 error when generating prediction models using caret
library(rpart)
library(rattle) #fancyRpartPlot chart
library(GGally)
library(randomForest)
library(caret)
library(skimr)
library(epiDisplay)
library(tidyverse) # Loads ggplot2, dplyr, tidyr, readr, stringr, tibble, etc.
library(pROC) ## For the AUC comparison
library(xgboost) ## classification algorithm
library(Matrix)
library(DescTools) ## Creates descriptive charts
library(gbm)
library(reshape2)
library(ellipse)
library(MLmetrics)
#switch off scientific notation for numbers
options(scipen=999)
set.seed(0)
# configure the format of DescTools so it uses a comma rather than an apostrophe for big numbers
options(fmt.abs=structure(list(digits=0, big.mark=","), class="fmt"))
#import data - change the path to where yours is sitting
phishing_website_data <- read_csv("c:\\Chris\\Ass3\\Dataset_PhishingWebsite_A2.csv")
str(phishing_website_data)
#remove any duplicate rows
phishing_website_data <- distinct(phishing_website_data)
#remove the index column from the predictors
phishing_website_data <- dplyr::select(phishing_website_data, -index)
#Check for missing values and a quick histogram of values
skim(phishing_website_data)
head(phishing_website_data)
#the dependent variable is converted to a factor later, where each model needs it for classification
#remove all columns with low variance, i.e. if they are always 1 or always -1
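#The low-variance removal above is described but never implemented; a minimal
#sketch using caret's nearZeroVar() (left commented out because the step was
#not run in the original, and the default freqCut/uniqueCut thresholds are an assumption):
#nzv <- nearZeroVar(phishing_website_data, saveMetrics = TRUE)
#nzv[nzv$nzv, ] #inspect the near-zero-variance columns before dropping them
#phishing_website_data <- phishing_website_data[ , !nzv$nzv]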
#Check the probabilities of the dependent variable
#results <- tab1(phishing_website_data$Result, sort.group = "decreasing", cum.percent = FALSE, main = " Distribution of Result in the whole dataset")
#knitr::kable(results)
Desc(as.factor(phishing_website_data$Result), plotit = TRUE)
#===============================================================================================
#Create a stratified data split for training and testing:
## Create training and testing data sets to use for all models.
## Randomly choose 70% of the data set as training data.
#===============================================================================================
#Using caret to create a stratified split on the Result column
trainIndex <- createDataPartition(phishing_website_data$Result, p = .7,
                                  list = FALSE,
                                  times = 1)
head(trainIndex)
train <- phishing_website_data[trainIndex, ]
test <- phishing_website_data[-trainIndex, ]
#Show the Result splits in training and test
Desc(as.factor(train$Result), plotit = TRUE)
Desc(as.factor(test$Result), plotit = TRUE)
#=====================================================
#Training data
train$Result <- ifelse(train$Result == -1, 0, 1)
trainY <- as.factor(train$Result)
label_name <- 'Result'
trainX <- train %>% select(-Result)
predictors <- names(train)[names(train) != label_name]
#Testing data
testX <- test %>% select(-Result)
test$Result <- ifelse(test$Result == -1, 0, 1)
testY <- as.factor(test$Result)
#===============================================================================================
#visualise the correlations (makes it easier to see the correlation strength)
ggcorr(phishing_website_data, label = TRUE, label_size = 2, size = 3, hjust = 1, legend.size = 5, legend.position = "none") +
  labs(title="Pearson Correlation matrix (All columns)")
#Find correlations between columns and save them to a new data frame
correlations <- as.data.frame(cor(train, method = "pearson"))
head(correlations)
#we are only interested in correlations relating to the Result column
correlations <- dplyr::select(correlations, Result)
#dplyr's filter omits rownames, so the workaround is to use functions from the tibble library:
#copy the rownames to a new column
correlations <- correlations %>% rownames_to_column('new_column')
#keep only columns whose correlation with Result exceeds 0.25
correlations <- filter_at(correlations, vars(-new_column), any_vars(. > 0.25))
#correlations <- filter_at(correlations, vars(-new_column), any_vars(. != 1)) #removes the Result-to-Result (1.0) correlation
#view the result
correlations$new_column
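#Note: the filter above keeps only positive correlations, so strongly negative
#predictors are dropped. A hedged alternative (not run here, since it would
#change which columns get selected) is to filter on the absolute value:
#correlations <- filter_at(correlations, vars(-new_column), any_vars(abs(.) > 0.25))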
#==============================================================================================
## We can see from the correlation matrix visualisation that
## only a small number of columns have a strong correlation with
## the Result column.
## Attributes with a correlation above the 0.25 filter threshold are:
## URL_of_Anchor
## SSLfinal_State
## Prefix_Suffix
## having_Sub_Domain
## Request_URL
## web_traffic
#Select the strongly correlated columns along with the Result column for the model.
#Another loaded library also defines select(), so dplyr:: ensures the right one is used;
#the correlations data set picks the columns automatically (Result itself passes the filter with a correlation of 1.0)
dt.train <- dplyr::select(train, all_of(correlations$new_column))
dt.test <- dplyr::select(test, all_of(correlations$new_column))
ggcorr(dt.train, label = TRUE, label_size = 5, size = 4, hjust = 0.75, legend.size = 5, legend.position = "none") +
  labs(title="Pearson Correlation matrix")
################################################################################
## BUILD MODELS
# Build a decision tree from the revised phishing_website_data dataset to predict Result
#convert the dependent variable Result into a factor
dt.train$Result <- as.factor(train$Result)
dt.test$Result <- as.factor(test$Result)
#build a decision tree with no tuning parameters (accuracy about 92%)
tree <- rpart(Result ~ ., data = dt.train, method = "class")
# Report the model
print(tree)
## plot the tree structure
fancyRpartPlot(tree, main = "Decision Tree Model of phishing website Data")
## print the tree structure
summary(tree)
################################################################################
## MODEL EVALUATION
## make predictions using the decision tree model
dt.predictions <- predict(tree, dt.test, type = "class")
## Using caret to obtain the model evaluation
ref <- as.factor(dt.test$Result)
dt.cm <- confusionMatrix(dt.predictions, ref)
dt.cm
## look at features whose density has a different shape when the Result changes
diff_features <- phishing_website_data[c(7, 8, 9, 13, 14, 15, 24, 26)]
featurePlot(x = diff_features,
            y = as.factor(phishing_website_data$Result),
            plot = "density",
            scales = list(x = list(relation="free"),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|"
            #layout = c(4, 8),
            #auto.key = list(columns = 3)
            )
################################################################################
## RANDOM FOREST
rf.train <- train
rf.test <- test
# change the Result column to a factor (since there are only 2 values, we need classification rather than regression)
rf.train$Result <- as.factor(rf.train$Result)
rf.test$Result <- as.factor(rf.test$Result)
# Create a Random Forest model with default parameters
model1 <- randomForest(Result ~ ., data = rf.train, importance = TRUE)
model1
# Use a for loop to identify the best mtry (number of variables per split) for the most accurate model.
a <- c()
#create variables to store the best mtry and accuracy across iterations
mtryResults <- as.data.frame(matrix(ncol = 2, nrow = 0))
colnames(mtryResults) <- c("mtry", "accuracy")
mtry.value <- 0
accuracy <- 0
for (i in 3:31) {
  print(i) #output line to see the current iteration
  #create the model
  model3 <- randomForest(Result ~ ., data = rf.train, ntree = 500, mtry = i, importance = TRUE)
  #run a prediction against the current model iteration
  predValid <- predict(model3, rf.test, type = "class")
  #save the accuracy to the vector
  a[i-2] <- mean(predValid == rf.test$Result)
  print(a[i-2])
  #save mtry and accuracy to the data frame; gives a better data set to visualise
  resultTemp <- data.frame(mtry = i, accuracy = mean(predValid == rf.test$Result))
  mtryResults <- rbind(mtryResults, resultTemp)
  rm(resultTemp)
  #check if the current iteration's accuracy is the highest and save its mtry to pass as the tuning parameter
  if(a[i-2] > accuracy)
  {
    mtry.value <- i
    accuracy <- a[i-2]
  }
}
#display the accuracy of each iteration
a
#plot the results using ggplot
mtry.plot <- ggplot(mtryResults, aes(x=mtry, y=accuracy)) +
  geom_point() +
  geom_line() +
  theme_bw() +
  labs(title="Plot of accuracy for each mtry value")
mtry.plot
#display the selected best mtry
print(mtry.value)
# Fine-tune the parameters of the Random Forest model
model2 <- randomForest(Result ~ ., data = rf.train, ntree = 500, mtry = mtry.value, importance = TRUE)
model2
## MODEL EVALUATION
## Predict the test set outcomes, reporting class labels
rf.predictions <- predict(model2, rf.test, type = "class")
rf.cm <- confusionMatrix(rf.predictions, rf.test$Result)
rf.cm
## calculate the confusion matrix (rows = predictions, columns = actuals)
rf.confusion <- table(rf.predictions, rf.test$Result)
print(rf.confusion)
## accuracy
rf.accuracy <- sum(diag(rf.confusion)) / sum(rf.confusion)
print(rf.accuracy)
#Precision per class (exactness %): true positives over everything *predicted* as that class, i.e. a row sum since rows hold the predictions
rf.precision.A <- rf.confusion[1,1] / sum(rf.confusion[1,])
print(rf.precision.A)
rf.precision.B <- rf.confusion[2,2] / sum(rf.confusion[2,])
print(rf.precision.B)
#Overall (macro-averaged) precision
overall.rf.precision <- (rf.precision.A + rf.precision.B)/2
print(overall.rf.precision)
#Recall per class (completeness %): true positives over everything *actually* in that class, i.e. a column sum
rf.recall.A <- rf.confusion[1,1] / sum(rf.confusion[,1])
print(rf.recall.A)
rf.recall.B <- rf.confusion[2,2] / sum(rf.confusion[,2])
print(rf.recall.B)
#Overall (macro-averaged) recall
overall.rf.recall <- (rf.recall.A + rf.recall.B)/2
print(overall.rf.recall)
#F1 score (harmonic mean of precision and recall)
rf.f1 <- 2 * overall.rf.precision * overall.rf.recall / (overall.rf.precision + overall.rf.recall)
print(rf.f1)
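# A cross-check on the hand-rolled metrics above (an addition, not in the
# original script): caret reports precision/recall/F1 directly from the
# confusion matrix object. Note these are for the positive class only,
# not the macro averages computed above.
confusionMatrix(rf.predictions, rf.test$Result, mode = "prec_recall")
rf.cm$byClass[c("Precision", "Recall", "F1")]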
# To check the important variables
#importance(model2)
#varImpPlot(model2)
###############################################################################################
#
# Logistic Regression
#quick look at the dependent variable (now coded 0/1)
train$Result
#
# From the correlation results, keep only the significant predictors
classifier <- glm(formula = Result ~ URL_of_Anchor + SSLfinal_State + having_Sub_Domain + Request_URL + web_traffic,
                  #having_IP_Address + having_Sub_Domain + SSLfinal_State + HTTPS_token + URL_of_Anchor + Links_in_tags + SFH + Redirect + DNSRecord + web_traffic + Google_Index + Links_pointing_to_page,
                  family = binomial,
                  data = train)
#show a summary of the model
summary(classifier)
##This shows the fields which have an impact on the prediction (remove those without any significance stars, as sketched below)
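# A hedged way to list the significant predictors programmatically instead of
# reading the stars by eye (an addition to the original; column 4 of the glm
# coefficient matrix holds the p-values, and lr.coefs is an illustrative name):
lr.coefs <- summary(classifier)$coefficients
rownames(lr.coefs)[lr.coefs[, 4] < 0.05]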
prob_pred <- predict(classifier, type = 'response', newdata = test[,predictors]) # this gives the probabilities
y_pred <- ifelse(prob_pred > 0.5, 1, 0) # use the probabilities to classify each site as phishing (1) or legitimate (0)
y_pred <- as.factor(y_pred)
#str(y_pred)
#y_pred
# Make the confusion matrix
lr.cm <- confusionMatrix(data = y_pred, reference = testY)
lr.cm
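# pROC was loaded "for the AUC comparison" but never applied to this model; a
# small addition using the probabilities already computed above:
lr.roc <- roc(response = testY, predictor = prob_pred)
auc(lr.roc)
plot(lr.roc, main = "ROC curve - Logistic Regression")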
#====================================================================================
#GBM Model
train$Result <- as.factor(train$Result)
test$Result <- as.factor(test$Result)
#caret needs factor outcomes with valid R names when classProbs=TRUE, so recode 0/1 as N/Y
trainY <- as.factor(ifelse(train$Result == 0, "N", "Y"))
head(trainY)
testY <- as.factor(ifelse(test$Result == 0, "N", "Y"))
#create an object to control the cross-validation performed
myControl <- trainControl(method='repeatedcv',
                          number=5,
                          returnResamp='none',
                          summaryFunction=twoClassSummary, # Use AUC to pick the best model
                          classProbs=TRUE,
                          allowParallel = TRUE)
#Create a grid of candidate values; caret selects the best combination
grid <- expand.grid(interaction.depth=c(1,2), # Depth of variable interactions
                    n.trees=c(250, 300), # Num trees to fit
                    shrinkage=c(0.01,0.1), # Try 2 values for the learning rate
                    n.minobsinnode = 20)
gbm.tune <- train(x=train[,predictors], y=trainY,
                  method = "gbm",
                  metric = "ROC",
                  trControl = myControl,
                  tuneGrid=grid,
                  verbose=FALSE)
#The best tuning settings. Could increasing the trees even further improve the scores? It has so far (see the sketch below).
gbm.tune$bestTune
plot(gbm.tune)
res <- gbm.tune$results
res
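# To probe the question above (would even more trees keep improving the ROC?),
# the grid could be extended - a hedged sketch, left commented out since it was
# not run in the original and the candidate values are illustrative:
#grid_more <- expand.grid(interaction.depth = 2, n.trees = c(300, 500, 1000),
#                         shrinkage = 0.1, n.minobsinnode = 20)
#gbm.tune2 <- train(x = train[,predictors], y = trainY, method = "gbm",
#                   metric = "ROC", trControl = myControl, tuneGrid = grid_more, verbose = FALSE)
#plot(gbm.tune2)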
gbm.pred <- predict(gbm.tune, testX)
str(gbm.pred)
gbm.cm <- confusionMatrix(gbm.pred, testY)
gbm.cm
#=======================================================================================
#Train the knn model (reusing myControl from the GBM section)
model_knn <- train(train[,predictors], trainY, method='knn', trControl=myControl, tuneLength=3)
#Predict using the knn model; keep the predictions in their own variable rather than mutating test
knn.pred <- predict(object = model_knn, test[,predictors])
#Check the accuracy of the knn model (predictions first, reference second)
knn.cm <- confusionMatrix(knn.pred, testY)
knn.cm
#========================================================================================
#XGBoost
input_x <- as.matrix(dplyr::select(train, -Result))
input_y <- train$Result
#Default tuning
grid_default <- expand.grid(
  nrounds = 300,
  max_depth = 30,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)
train_control <- caret::trainControl(
  method = "none",
  verboseIter = FALSE, # no training log
  allowParallel = TRUE # FALSE for reproducible results
)
xgb_base <- caret::train(
  x = input_x,
  y = input_y,
  trControl = train_control,
  tuneGrid = grid_default,
  method = "xgbTree",
  verbose = TRUE
)
xgb_base$bestTune
predictions <- predict(xgb_base, test)
cm <- table(predictions = predictions, actual = test$Result)
cm
xgb.accuracy <- sum(diag(cm)) / sum(cm)
print(xgb.accuracy)
# Tuned grid and cross-validation
tune_grid <- expand.grid(
  nrounds = c(30, 400, 500),
  max_depth = 19,
  eta = 0.3,
  gamma = 1,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)
tune_control <- caret::trainControl(
  method = "cv", # cross-validation
  number = 3, # with n folds
  #index = createFolds(tr_treated$Id_clean), # fix the folds
  verboseIter = FALSE, # no training log
  allowParallel = TRUE # FALSE for reproducible results
)
xgb_tune <- caret::train(
  x = input_x,
  y = input_y,
  trControl = tune_control,
  tuneGrid = tune_grid,
  method = "xgbTree",
  verbose = TRUE
)
xgb_tune$bestTune
predictions <- predict(xgb_tune, test)
xgb.cm <- confusionMatrix(predictions, test$Result)
xgb.cm
###############################################################################
## BAGGING
#set up cross-validation
cvcontrol <- trainControl(method="repeatedcv", number = 10,
                          allowParallel=TRUE)
#Using treebag
train.bagg <- train(Result ~ .,
                    data=train,
                    method="treebag",
                    trControl=cvcontrol,
                    importance=TRUE)
# show the bagging results
train.bagg
#plot the variable importance
plot(varImp(train.bagg))
# obtain class predictions
bagg.classTest <- predict(train.bagg,
                          newdata = test,
                          type="raw")
head(bagg.classTest)
# Create the confusion matrix (predictions first, reference second)
bagg.cm <- confusionMatrix(bagg.classTest, test$Result)
bagg.cm
#prepare the data for the AUC
bagg.probs <- predict(train.bagg,
                      newdata=test,
                      type="prob")
head(bagg.probs)
# Plot the ROC curve
rocCurve.bagg <- roc(test$Result, bagg.probs[,1])
plot(rocCurve.bagg, col=c(6))
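# The area under the curve itself is never printed; pROC's auc() reports it
# directly from the roc object built above (a small addition to the original):
auc(rocCurve.bagg)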
###############################################################################
# Model Comparison
# View the confusion matrices
dt.cm
rf.cm
lr.cm
gbm.cm
knn.cm
xgb.cm
bagg.cm
#Create a data frame to store the model results
ModelResults <- as.data.frame(matrix(ncol = 3, nrow = 0))
colnames(ModelResults) <- c("ModelType", "Metric", "Value")
#Save each result metric to the data frame
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Decision Tree", Metric = "Accuracy", Value = dt.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Random Forest", Metric = "Accuracy", Value = rf.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Logistic Regression", Metric = "Accuracy", Value = lr.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Gradient Boosting", Metric = "Accuracy", Value = gbm.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "K-Nearest Neighbour", Metric = "Accuracy", Value = knn.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "XGBoost", Metric = "Accuracy", Value = xgb.cm[["overall"]][["Accuracy"]] * 100))
ModelResults <- rbind(ModelResults, data.frame(ModelType = "Bagging", Metric = "Accuracy", Value = bagg.cm[["overall"]][["Accuracy"]] * 100))
ModelResults
# wrap the model type text so the plot axis labels stay readable
ModelResults$ModelType <- str_wrap(ModelResults$ModelType, width = 10)
#round the Value
ModelResults$Value <- round(ModelResults$Value, 2)
#plot the overall results
ggplot(data=ModelResults, aes(x=ModelType, y=Value, group=Metric)) +
  geom_line(aes(color=Metric)) +
  geom_point(aes(color=Metric)) +
  #geom_text(aes(label=Value)) +
  theme_bw() +
  labs(title="Prediction Model Accuracy Comparison") +
  facet_wrap(~Metric)
#clean up the environment and free the memory
rm(list = ls())
gc()