Untitled

library(dplyr)
library(ggplot2)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(ROCR)
library(Metrics)
library(pROC)
library(rpart)
Accidents=read.csv("Accidents0515.csv",header = T,stringsAsFactors = F)
Casualties = read.csv("Casualties0515.csv",header = T,stringsAsFactors = F)
Vehicles = read.csv("Vehicles0515.csv",header = T,stringsAsFactors = F)


Incidents=inner_join(Accidents,Casualties,"Accident_Index")

Incidents=inner_join(Incidents,Vehicles,"Accident_Index")

str(Incidents)
Incidents=Incidents[,-c(1:3)]


drops= c('Driver_Home_Area_Type',
         'Driver_IMD_Decile',
         'X1st_Road_Class',
         'X1st_Road_Number',
         'X2nd_Road_Class',
         'X2nd_Road_Number',
         'Weather_Conditions',
         'Road_Surface_Conditions',
        ' Special_Conditions_at_Site',
         'Carriageway_Hazards',
         'LSOA_of_Accident_Location',
        "Vehicle_Reference.x",
        "Casualty_Reference",
        "Car_Passenger" ,
        "Bus_or_Coach_Passenger",
        "Pedestrian_Road_Maintenance_Worker" ,
        "Casualty_Home_Area_Type" ,
        "Vehicle_Reference.y",
        "Towing_and_Articulation" ,
        "Vehicle_Location.Restricted_Lane",
        "Skidding_and_Overturning",
        "Hit_Object_in_Carriageway",
        "Vehicle_Leaving_Carriageway" ,
        "Hit_Object_off_Carriageway",
        "X1st_Point_of_Impact",
        "Was_Vehicle_Left_Hand_Drive.",
        "Propulsion_Code" )
Incidents=Incidents[,!(names(Incidents) %in% drops)]

rm(drops)

names(Incidents)
attach(Incidents)
dim(Accidents)
unique(Accidents$Accident_Index)
str(Vehicles)

Incidents[Incidents$Did_Police_Officer_Attend_Scene_of_Accident==3,
          "Did_Police_Officer_Attend_Scene_of_Accident"]=2

table(Incidents$Did_Police_Officer_Attend_Scene_of_Accident)

Incidents= Incidents[!Incidents$Did_Police_Officer_Attend_Scene_of_Accident==-1,]

Incidents$Did_Police_Officer_Attend_Scene_of_Accident=factor(Incidents$Did_Police_Officer_Attend_Scene_of_Accident)


# Target variable
p=ggplot(Incidents)
p+geom_bar(aes(x=Did_Police_Officer_Attend_Scene_of_Accident,fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident))+labs(title="Barplot of police attendence",
                                                                    y="number of accidents",
                                                                    x="Police attendence")+theme(legend.position = "top")+scale_fill_discrete(name="Police Attendence",labels=c("Yes","No"))

###################### Accidents #######################################################################
# police force
unique(Incidents$Police_Force)
p+geom_bar(aes(x=as.factor(Police_Force),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident))+labs(title="Barplot of police attendence",
                                                                    y="number of accidents",
                                                                    x="Police attendence")+theme(legend.position = "top")+scale_fill_discrete(name="Police Attendence",labels=c("Yes","No"))

police=table(Did_Police_Officer_Attend_Scene_of_Accident,Police_Force)
prop.table(police,2)

# Accident severity

str(Incidents$Accident_Severity)

p+geom_bar(aes(x=Accident_Severity,fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident))+labs(title="Barplot of police attendence",
                                                                    y="number of accidents",
                                                                    x="Accident severity") + theme(legend.position = "top")

# number of vehicles involved
p+geom_density(aes(x=log(Number_of_Vehicles,fill=Did_Police_Officer_Attend_Scene_of_Accident)))
hist(Incidents$Number_of_Vehicles,fill=Did_Police_Officer_Attend_Scene_of_Accident,fill = Did_Police_Officer_Attend_Scene_of_Accident)
# number of casualties

p+geom_point(aes(Number_of_Casualties,Number_of_Vehicles,colour=Did_Police_Officer_Attend_Scene_of_Accident))

# Road type & light conditons
p+geom_bar(aes(x=as.factor(Road_Type),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Light_Conditions)


# speed limit & urban or rural ggmap
p+geom_bar(aes(x=as.factor(Speed_limit),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Urban_or_Rural_Area)


###################### Vehicle #######################################################################
# sex of driver

p+geom_bar(aes(x=as.factor(Sex_of_Driver),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
                                                                                                                   y="number of accidents",
                                                                                                                   x="Sex of Driver") + theme(legend.position = "top")+scale_fill_discrete(name="Police Attendence",labels=c("Yes","No"))
sex=table(Did_Police_Officer_Attend_Scene_of_Accident,Sex_of_Driver)
sex
prop.table(sex,2)

# Age of drivers


hist(Incidents$Age_of_Driver[!Incidents$Age_of_Driver==-1],xlim = c(0,100),col = "blue",main = "Histogram of Age of Drivers",xlab = "Age of Driver",ylab = "Number of Accidents")

boxplot(Age_of_Driver~Did_Police_Officer_Attend_Scene_of_Accident,col=c("gold","green"),xlab="Police Attendance",ylab="Age of Driver",
        main="Police Attendance in relation to the Age of the Driver",names=c("Yes","No"))

# Age band of driver
p+geom_bar(aes(x=as.factor(Age_Band_of_Driver),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
                                                                                                              y="number of accidents",
                                                                                                              x="Age band of Drivers") + theme(legend.position = "top")
Age=table(Did_Police_Officer_Attend_Scene_of_Accident,Age_Band_of_Driver)
Age
prop.table(Age,2)

# Vehicle type

p+geom_bar(aes(x=as.factor(Vehicle_Type),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
                                                                                                                   y="number of accidents",
                                                                                                                   x="Vehicles Type") + theme(legend.position = "top")
vtype=table(Did_Police_Officer_Attend_Scene_of_Accident,Vehicle_Type)
vtype
prop.table(vtype,2)

# Age of vehicle

p+geom_bar(aes(x=as.factor(Age_of_Vehicle),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
                                                                                                             y="number of accidents",
                                                                                                             x="Age of Vehicle") + theme(legend.position = "top")

###################### Casualty #######################################################################

#Casualty class
#sex of casualty
p+geom_bar(aes(x=as.factor(Sex_of_Casualty),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Casualty_Class)

#age of casualty

p+geom_boxplot(aes(x=Did_Police_Officer_Attend_Scene_of_Accident,y=Age_of_Casualty,fill=Did_Police_Officer_Attend_Scene_of_Accident))


# age band of casualty & casualty severity
p+geom_bar(aes(x=as.factor(Age_Band_of_Casualty),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Casualty_Severity)+ theme(legend.position = "top")


######## Unsupervised learning ######################################################
set.seed(123)
split=sample.split(Incidents$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.001)
Incidents_us=subset(Incidents,split==TRUE)

dendrogram = hclust(d = dist(Incidents_sample_us[,c("Age_of_Casualty","Age_of_Driver","Age_of_Vehicle","Number_of_Vehicles","Number_of_Casualties")],
                             method = 'euclidean'), method = 'ward.D')

plot(dendrogram)
abline()


############### PCA analysis ########################################################

set.seed(123)
split2=sample.split(Incidents$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.01)
Incidents_sample=subset(Incidents,split2==TRUE)


pca_incidents=prcomp(Incidents_sample[,c("Age_of_Casualty","Age_of_Driver","Age_of_Vehicle","Number_of_Vehicles","Number_of_Casualties")],scale = TRUE,center =T )
plot(pca_incidents)

pca_incidents_var=pca_incidents$sdev^2
pca_incidents_PEV= pca_incidents_var / sum(pca_incidents_var)

plot(pca_incidents_PEV, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

opar <- par()
plot(cumsum(pca_incidents_PEV), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

abline(h = 0.8, col = 'red', lty = 'dashed')
par(opar)

biplot(pca_incidents)


pca = preProcess(x = train_Incidents_ml[-33], method = 'pca', pcaComp = 3)
training_set_pca = predict(pca,train_Incidents_ml )
training_set_pca = training_set_pca[c(2, 3,4, 1)]
test_set_pca = predict(pca, test_Incidents_ml)
test_set_pca = test_set_pca[c(2, 3,4, 1)]
################### Data Preperation ###############################

Incidents_ml=Incidents[,-c(1:2,7,9,11)]

Incidents_ml=Incidents_ml[,c(1:15,17:33,16)]

set.seed(123)
split3=sample.split(Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.01)
Incidents_ml=subset(Incidents_ml,split3==TRUE)


# Test and train split
set.seed(678)
split4=sample.split(Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.75)
train_Incidents_ml=subset(Incidents_ml,split4==TRUE)
test_Incidents_ml=subset(Incidents_ml,split4==FALSE)

# Feature scale
train_Incidents_ml[-33] = scale(train_Incidents_ml[-33])
test_Incidents_ml[-33] = scale(test_Incidents_ml[-33])
##################### Svm ###############################################
install.packages("e1071")
library(e1071)

svm_model_pca=svm(Did_Police_Officer_Attend_Scene_of_Accident~.,data=training_set_pca,
                  type="C-classification",
                  kernel="linear")

pred_svm_pca=predict(svm_model_pca,test_set_pca[-4])
prob_svm_pca=predict(svm_model_pca,test_set_pca[-4],type="prob")
svm_model_pca
#################### Decision Tree #######################################

dt_model=rpart(Did_Police_Officer_Attend_Scene_of_Accident~.,data = train_Incidents_ml)

pred_dt=predict(dt_model,test_Incidents_ml[-33],type = "class")
prob_dt=predict(dt_model,test_Incidents_ml[-33],type="prob")
dt_model

#################### Random forrest #######################################
install.packages("randomForest")
library(randomForest)

rf_model=randomForest(x=train_Incidents_ml[,-33],
                      y=train_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
                      importance = TRUE,
                      ntree = 1000)

pred_rf=predict(rf_model,test_Incidents_ml[-33])
prob_rf=predict(rf_model,test_Incidents_ml[-33],type = "prob")

###### Preformance measure ########################################

# Confusion matrix
confusionMatrix(data = pred_dt,
                reference = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident)

confusionMatrix(data = pred_rf,
                reference = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident)


# ROC & AUC plots
roc_dt=roc(test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,prob_dt[,2])
roc_rf=roc(test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,prob_rf[,2])

par(mfrow=c(1,2))
plot(roc_dt,main="Decision Tree")
plot(roc_rf,main="RandomForest")


auc(roc_dt)
auc(roc_rf)

tree_predict <-  cbind(
  actual = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
  predicted = predict(dt_model,test_Incidents_ml[-33] , type = 'class'),
  predict(dt_model,test_Incidents_ml[-33], type = 'prob')
)

rf_predict <-  cbind(
  actual = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
  predicted = predict(rf_model, test_Incidents_ml[-33], type = 'class'),
  True=predict(rf_model, test_Incidents_ml[-33], type = "prob")
)


models_prob <- data.frame(
  dt = tree_predict$True,
  rf = rf_predict$True
)

models_label <- data.frame(
  dt = tree_predict$actual,
  rf =rf_predict$actual

)


svm_ROC_pred = prediction(tchurn_models_prob, tchurn_label)
svm_ROC_perf = performance(tchurn_ROC_pred, "tpr", "fpr")


rf_predict <-  cbind(
  actual = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
  predicted = predict(rf_model, test_Incidents_ml[-33], type = 'raw'),
  predict(rf_model, test_Incidents_ml[-33], type = 'prob')
)

attributes(svm_predict)

# Model Tuning
plot

res <- tuneRF(x = subset(train_Incidents_ml, select = -Did_Police_Officer_Attend_Scene_of_Accident),
              y = train_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
              ntreeTry = 1000)


# Random Search
control <- trainControl(method="repeatedcv", number=10, repeats=3, search="random")
set.seed(123)
mtry <- sqrt(ncol(x))
rf_random <- train(Class~., data=train, method="rf", metric=metric, tuneLength=15, trControl=control)
print(rf_random)
plot(rf_random)

plot(rf_model)
train_Incidents_ml=train_Incidents_ml[,-5]
test_Incidents_ml=test_Incidents_ml[,-5]
rf_model2=randomForest(x=train_Incidents_ml[,-32],
                      y=train_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
                      importance = TRUE,
                      ntree = 1000,mtry = 5)

pred_rf2=predict(rf_model2,test_Incidents_ml[-32])
prob_rf2=predict(rf_model2,test_Incidents_ml[-32],type = "prob")

confusionMatrix(data = pred_rf2,
                reference = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident)