Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(dplyr)
- library(ggplot2)
- library(caTools)
- library(e1071)
- library(randomForest)
- library(caret)
- library(ROCR)
- library(Metrics)
- library(pROC)
- library(rpart)
- Accidents=read.csv("Accidents0515.csv",header = T,stringsAsFactors = F)
- Casualties = read.csv("Casualties0515.csv",header = T,stringsAsFactors = F)
- Vehicles = read.csv("Vehicles0515.csv",header = T,stringsAsFactors = F)
- Incidents=inner_join(Accidents,Casualties,"Accident_Index")
- Incidents=inner_join(Incidents,Vehicles,"Accident_Index")
- str(Incidents)
- Incidents=Incidents[,-c(1:3)]
- drops= c('Driver_Home_Area_Type',
- 'Driver_IMD_Decile',
- 'X1st_Road_Class',
- 'X1st_Road_Number',
- 'X2nd_Road_Class',
- 'X2nd_Road_Number',
- 'Weather_Conditions',
- 'Road_Surface_Conditions',
- ' Special_Conditions_at_Site',
- 'Carriageway_Hazards',
- 'LSOA_of_Accident_Location',
- "Vehicle_Reference.x",
- "Casualty_Reference",
- "Car_Passenger" ,
- "Bus_or_Coach_Passenger",
- "Pedestrian_Road_Maintenance_Worker" ,
- "Casualty_Home_Area_Type" ,
- "Vehicle_Reference.y",
- "Towing_and_Articulation" ,
- "Vehicle_Location.Restricted_Lane",
- "Skidding_and_Overturning",
- "Hit_Object_in_Carriageway",
- "Vehicle_Leaving_Carriageway" ,
- "Hit_Object_off_Carriageway",
- "X1st_Point_of_Impact",
- "Was_Vehicle_Left_Hand_Drive.",
- "Propulsion_Code" )
- Incidents=Incidents[,!(names(Incidents) %in% drops)]
- rm(drops)
- names(Incidents)
- attach(Incidents)
- dim(Accidents)
- unique(Accidents$Accident_Index)
- str(Vehicles)
- Incidents[Incidents$Did_Police_Officer_Attend_Scene_of_Accident==3,
- "Did_Police_Officer_Attend_Scene_of_Accident"]=2
- table(Incidents$Did_Police_Officer_Attend_Scene_of_Accident)
- Incidents= Incidents[!Incidents$Did_Police_Officer_Attend_Scene_of_Accident==-1,]
- Incidents$Did_Police_Officer_Attend_Scene_of_Accident=factor(Incidents$Did_Police_Officer_Attend_Scene_of_Accident)
- # Target variable
- p=ggplot(Incidents)
- p+geom_bar(aes(x=Did_Police_Officer_Attend_Scene_of_Accident,fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident))+labs(title="Barplot of police attendence",
- y="number of accidents",
- x="Police attendence")+theme(legend.position = "top")+scale_fill_discrete(name="Police Attendence",labels=c("Yes","No"))
- ###################### Accidents #######################################################################
- # police force
- unique(Incidents$Police_Force)
- p+geom_bar(aes(x=as.factor(Police_Force),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident))+labs(title="Barplot of police attendence",
- y="number of accidents",
- x="Police attendence")+theme(legend.position = "top")+scale_fill_discrete(name="Police Attendence",labels=c("Yes","No"))
- police=table(Did_Police_Officer_Attend_Scene_of_Accident,Police_Force)
- prop.table(police,2)
- # Accident severity
- str(Incidents$Accident_Severity)
- p+geom_bar(aes(x=Accident_Severity,fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident))+labs(title="Barplot of police attendence",
- y="number of accidents",
- x="Accident severity") + theme(legend.position = "top")
- # number of vehicles involved
- p+geom_density(aes(x=log(Number_of_Vehicles,fill=Did_Police_Officer_Attend_Scene_of_Accident)))
- hist(Incidents$Number_of_Vehicles,fill=Did_Police_Officer_Attend_Scene_of_Accident,fill = Did_Police_Officer_Attend_Scene_of_Accident)
- # number of casualties
- p+geom_point(aes(Number_of_Casualties,Number_of_Vehicles,colour=Did_Police_Officer_Attend_Scene_of_Accident))
- # Road type & light conditons
- p+geom_bar(aes(x=as.factor(Road_Type),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Light_Conditions)
- # speed limit & urban or rural ggmap
- p+geom_bar(aes(x=as.factor(Speed_limit),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Urban_or_Rural_Area)
- ###################### Vehicle #######################################################################
- # sex of driver
- p+geom_bar(aes(x=as.factor(Sex_of_Driver),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
- y="number of accidents",
- x="Sex of Driver") + theme(legend.position = "top")+scale_fill_discrete(name="Police Attendence",labels=c("Yes","No"))
- sex=table(Did_Police_Officer_Attend_Scene_of_Accident,Sex_of_Driver)
- sex
- prop.table(sex,2)
- # Age of drivers
- hist(Incidents$Age_of_Driver[!Incidents$Age_of_Driver==-1],xlim = c(0,100),col = "blue",main = "Histogram of Age of Drivers",xlab = "Age of Driver",ylab = "Number of Accidents")
- boxplot(Age_of_Driver~Did_Police_Officer_Attend_Scene_of_Accident,col=c("gold","green"),xlab="Police Attendance",ylab="Age of Driver",
- main="Police Attendance in relation to the Age of the Driver",names=c("Yes","No"))
- # Age band of driver
- p+geom_bar(aes(x=as.factor(Age_Band_of_Driver),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
- y="number of accidents",
- x="Age band of Drivers") + theme(legend.position = "top")
- Age=table(Did_Police_Officer_Attend_Scene_of_Accident,Age_Band_of_Driver)
- Age
- prop.table(Age,2)
- # Vehicle type
- p+geom_bar(aes(x=as.factor(Vehicle_Type),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
- y="number of accidents",
- x="Vehicles Type") + theme(legend.position = "top")
- vtype=table(Did_Police_Officer_Attend_Scene_of_Accident,Vehicle_Type)
- vtype
- prop.table(vtype,2)
- # Age of vehicle
- p+geom_bar(aes(x=as.factor(Age_of_Vehicle),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
- y="number of accidents",
- x="Age of Vehicle") + theme(legend.position = "top")
- ###################### Casualty #######################################################################
- #Casualty class
- #sex of casualty
- p+geom_bar(aes(x=as.factor(Sex_of_Casualty),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Casualty_Class)
- #age of casualty
- p+geom_boxplot(aes(x=Did_Police_Officer_Attend_Scene_of_Accident,y=Age_of_Casualty,fill=Did_Police_Officer_Attend_Scene_of_Accident))
- # age band of casualty & casualty severity
- p+geom_bar(aes(x=as.factor(Age_Band_of_Casualty),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Casualty_Severity)+ theme(legend.position = "top")
- ######## Unsupervised learning ######################################################
- set.seed(123)
- split=sample.split(Incidents$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.001)
- Incidents_us=subset(Incidents,split==TRUE)
- dendrogram = hclust(d = dist(Incidents_sample_us[,c("Age_of_Casualty","Age_of_Driver","Age_of_Vehicle","Number_of_Vehicles","Number_of_Casualties")],
- method = 'euclidean'), method = 'ward.D')
- plot(dendrogram)
- abline()
- ############### PCA analysis ########################################################
- set.seed(123)
- split2=sample.split(Incidents$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.01)
- Incidents_sample=subset(Incidents,split2==TRUE)
- pca_incidents=prcomp(Incidents_sample[,c("Age_of_Casualty","Age_of_Driver","Age_of_Vehicle","Number_of_Vehicles","Number_of_Casualties")],scale = TRUE,center =T )
- plot(pca_incidents)
- pca_incidents_var=pca_incidents$sdev^2
- pca_incidents_PEV= pca_incidents_var / sum(pca_incidents_var)
- plot(pca_incidents_PEV, xlab = "Principal Component",
- ylab = "Proportion of Variance Explained",
- ylim = c(0, 1), type = "b")
- opar <- par()
- plot(cumsum(pca_incidents_PEV), xlab = "Principal Component",
- ylab = "Cumulative Proportion of Variance Explained",
- ylim = c(0, 1), type = "b")
- abline(h = 0.8, col = 'red', lty = 'dashed')
- par(opar)
- biplot(pca_incidents)
- pca = preProcess(x = train_Incidents_ml[-33], method = 'pca', pcaComp = 3)
- training_set_pca = predict(pca,train_Incidents_ml )
- training_set_pca = training_set_pca[c(2, 3,4, 1)]
- test_set_pca = predict(pca, test_Incidents_ml)
- test_set_pca = test_set_pca[c(2, 3,4, 1)]
- ################### Data Preperation ###############################
- Incidents_ml=Incidents[,-c(1:2,7,9,11)]
- Incidents_ml=Incidents_ml[,c(1:15,17:33,16)]
- set.seed(123)
- split3=sample.split(Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.01)
- Incidents_ml=subset(Incidents_ml,split3==TRUE)
- # Test and train split
- set.seed(678)
- split4=sample.split(Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.75)
- train_Incidents_ml=subset(Incidents_ml,split4==TRUE)
- test_Incidents_ml=subset(Incidents_ml,split4==FALSE)
- # Feature scale
- train_Incidents_ml[-33] = scale(train_Incidents_ml[-33])
- test_Incidents_ml[-33] = scale(test_Incidents_ml[-33])
- ##################### Svm ###############################################
- install.packages("e1071")
- library(e1071)
- svm_model_pca=svm(Did_Police_Officer_Attend_Scene_of_Accident~.,data=training_set_pca,
- type="C-classification",
- kernel="linear")
- pred_svm_pca=predict(svm_model_pca,test_set_pca[-4])
- prob_svm_pca=predict(svm_model_pca,test_set_pca[-4],type="prob")
- svm_model_pca
- #################### Decision Tree #######################################
- dt_model=rpart(Did_Police_Officer_Attend_Scene_of_Accident~.,data = train_Incidents_ml)
- pred_dt=predict(dt_model,test_Incidents_ml[-33],type = "class")
- prob_dt=predict(dt_model,test_Incidents_ml[-33],type="prob")
- dt_model
- #################### Random forrest #######################################
- install.packages("randomForest")
- library(randomForest)
- rf_model=randomForest(x=train_Incidents_ml[,-33],
- y=train_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
- importance = TRUE,
- ntree = 1000)
- pred_rf=predict(rf_model,test_Incidents_ml[-33])
- prob_rf=predict(rf_model,test_Incidents_ml[-33],type = "prob")
- ###### Preformance measure ########################################
- # Confusion matrix
- confusionMatrix(data = pred_dt,
- reference = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident)
- confusionMatrix(data = pred_rf,
- reference = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident)
- # ROC & AUC plots
- roc_dt=roc(test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,prob_dt[,2])
- roc_rf=roc(test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,prob_rf[,2])
- par(mfrow=c(1,2))
- plot(roc_dt,main="Decision Tree")
- plot(roc_rf,main="RandomForest")
- auc(roc_dt)
- auc(roc_rf)
- tree_predict <- cbind(
- actual = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
- predicted = predict(dt_model,test_Incidents_ml[-33] , type = 'class'),
- predict(dt_model,test_Incidents_ml[-33], type = 'prob')
- )
- rf_predict <- cbind(
- actual = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
- predicted = predict(rf_model, test_Incidents_ml[-33], type = 'class'),
- True=predict(rf_model, test_Incidents_ml[-33], type = "prob")
- )
- models_prob <- data.frame(
- dt = tree_predict$True,
- rf = rf_predict$True
- )
- models_label <- data.frame(
- dt = tree_predict$actual,
- rf =rf_predict$actual
- )
- svm_ROC_pred = prediction(tchurn_models_prob, tchurn_label)
- svm_ROC_perf = performance(tchurn_ROC_pred, "tpr", "fpr")
- rf_predict <- cbind(
- actual = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
- predicted = predict(rf_model, test_Incidents_ml[-33], type = 'raw'),
- predict(rf_model, test_Incidents_ml[-33], type = 'prob')
- )
- attributes(svm_predict)
- # Model Tuning
- plot
- res <- tuneRF(x = subset(train_Incidents_ml, select = -Did_Police_Officer_Attend_Scene_of_Accident),
- y = train_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
- ntreeTry = 1000)
- # Random Search
- control <- trainControl(method="repeatedcv", number=10, repeats=3, search="random")
- set.seed(123)
- mtry <- sqrt(ncol(x))
- rf_random <- train(Class~., data=train, method="rf", metric=metric, tuneLength=15, trControl=control)
- print(rf_random)
- plot(rf_random)
- plot(rf_model)
- train_Incidents_ml=train_Incidents_ml[,-5]
- test_Incidents_ml=test_Incidents_ml[,-5]
- rf_model2=randomForest(x=train_Incidents_ml[,-32],
- y=train_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
- importance = TRUE,
- ntree = 1000,mtry = 5)
- pred_rf2=predict(rf_model2,test_Incidents_ml[-32])
- prob_rf2=predict(rf_model2,test_Incidents_ml[-32],type = "prob")
- confusionMatrix(data = pred_rf2,
- reference = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident)
Add Comment
Please, Sign In to add comment