Guest User

Untitled

a guest
Apr 24th, 2018
118
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 13.82 KB | None | 0 0
  1. library(dplyr)
  2. library(ggplot2)
  3. library(caTools)
  4. library(e1071)
  5. library(randomForest)
  6. library(caret)
  7. library(ROCR)
  8. library(Metrics)
  9. library(pROC)
  10. library(rpart)
  11. Accidents=read.csv("Accidents0515.csv",header = T,stringsAsFactors = F)
  12. Casualties = read.csv("Casualties0515.csv",header = T,stringsAsFactors = F)
  13. Vehicles = read.csv("Vehicles0515.csv",header = T,stringsAsFactors = F)
  14.  
  15.  
  16. Incidents=inner_join(Accidents,Casualties,"Accident_Index")
  17.  
  18. Incidents=inner_join(Incidents,Vehicles,"Accident_Index")
  19.  
  20. str(Incidents)
  21. Incidents=Incidents[,-c(1:3)]
  22.  
  23.  
  24. drops= c('Driver_Home_Area_Type',
  25. 'Driver_IMD_Decile',
  26. 'X1st_Road_Class',
  27. 'X1st_Road_Number',
  28. 'X2nd_Road_Class',
  29. 'X2nd_Road_Number',
  30. 'Weather_Conditions',
  31. 'Road_Surface_Conditions',
  32. ' Special_Conditions_at_Site',
  33. 'Carriageway_Hazards',
  34. 'LSOA_of_Accident_Location',
  35. "Vehicle_Reference.x",
  36. "Casualty_Reference",
  37. "Car_Passenger" ,
  38. "Bus_or_Coach_Passenger",
  39. "Pedestrian_Road_Maintenance_Worker" ,
  40. "Casualty_Home_Area_Type" ,
  41. "Vehicle_Reference.y",
  42. "Towing_and_Articulation" ,
  43. "Vehicle_Location.Restricted_Lane",
  44. "Skidding_and_Overturning",
  45. "Hit_Object_in_Carriageway",
  46. "Vehicle_Leaving_Carriageway" ,
  47. "Hit_Object_off_Carriageway",
  48. "X1st_Point_of_Impact",
  49. "Was_Vehicle_Left_Hand_Drive.",
  50. "Propulsion_Code" )
  51. Incidents=Incidents[,!(names(Incidents) %in% drops)]
  52.  
  53. rm(drops)
  54.  
  55. names(Incidents)
  56. attach(Incidents)
  57. dim(Accidents)
  58. unique(Accidents$Accident_Index)
  59. str(Vehicles)
  60.  
  61. Incidents[Incidents$Did_Police_Officer_Attend_Scene_of_Accident==3,
  62. "Did_Police_Officer_Attend_Scene_of_Accident"]=2
  63.  
  64. table(Incidents$Did_Police_Officer_Attend_Scene_of_Accident)
  65.  
  66. Incidents= Incidents[!Incidents$Did_Police_Officer_Attend_Scene_of_Accident==-1,]
  67.  
  68. Incidents$Did_Police_Officer_Attend_Scene_of_Accident=factor(Incidents$Did_Police_Officer_Attend_Scene_of_Accident)
  69.  
  70.  
  71.  
  72. # Target variable
  73. p=ggplot(Incidents)
  74. p+geom_bar(aes(x=Did_Police_Officer_Attend_Scene_of_Accident,fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident))+labs(title="Barplot of police attendence",
  75. y="number of accidents",
  76. x="Police attendence")+theme(legend.position = "top")+scale_fill_discrete(name="Police Attendence",labels=c("Yes","No"))
  77.  
  78. ###################### Accidents #######################################################################
  79. # police force
  80. unique(Incidents$Police_Force)
  81. p+geom_bar(aes(x=as.factor(Police_Force),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident))+labs(title="Barplot of police attendence",
  82. y="number of accidents",
  83. x="Police attendence")+theme(legend.position = "top")+scale_fill_discrete(name="Police Attendence",labels=c("Yes","No"))
  84.  
  85. police=table(Did_Police_Officer_Attend_Scene_of_Accident,Police_Force)
  86. prop.table(police,2)
  87.  
  88. # Accident severity
  89.  
  90. str(Incidents$Accident_Severity)
  91.  
  92. p+geom_bar(aes(x=Accident_Severity,fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident))+labs(title="Barplot of police attendence",
  93. y="number of accidents",
  94. x="Accident severity") + theme(legend.position = "top")
  95.  
  96. # number of vehicles involved
  97. p+geom_density(aes(x=log(Number_of_Vehicles,fill=Did_Police_Officer_Attend_Scene_of_Accident)))
  98. hist(Incidents$Number_of_Vehicles,fill=Did_Police_Officer_Attend_Scene_of_Accident,fill = Did_Police_Officer_Attend_Scene_of_Accident)
  99. # number of casualties
  100.  
  101. p+geom_point(aes(Number_of_Casualties,Number_of_Vehicles,colour=Did_Police_Officer_Attend_Scene_of_Accident))
  102.  
  103. # Road type & light conditons
  104. p+geom_bar(aes(x=as.factor(Road_Type),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Light_Conditions)
  105.  
  106.  
  107.  
  108.  
  109. # speed limit & urban or rural ggmap
  110. p+geom_bar(aes(x=as.factor(Speed_limit),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Urban_or_Rural_Area)
  111.  
  112.  
  113.  
  114. ###################### Vehicle #######################################################################
  115. # sex of driver
  116.  
  117. p+geom_bar(aes(x=as.factor(Sex_of_Driver),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
  118. y="number of accidents",
  119. x="Sex of Driver") + theme(legend.position = "top")+scale_fill_discrete(name="Police Attendence",labels=c("Yes","No"))
  120. sex=table(Did_Police_Officer_Attend_Scene_of_Accident,Sex_of_Driver)
  121. sex
  122. prop.table(sex,2)
  123.  
  124. # Age of drivers
  125.  
  126.  
  127. hist(Incidents$Age_of_Driver[!Incidents$Age_of_Driver==-1],xlim = c(0,100),col = "blue",main = "Histogram of Age of Drivers",xlab = "Age of Driver",ylab = "Number of Accidents")
  128.  
  129. boxplot(Age_of_Driver~Did_Police_Officer_Attend_Scene_of_Accident,col=c("gold","green"),xlab="Police Attendance",ylab="Age of Driver",
  130. main="Police Attendance in relation to the Age of the Driver",names=c("Yes","No"))
  131.  
  132. # Age band of driver
  133. p+geom_bar(aes(x=as.factor(Age_Band_of_Driver),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
  134. y="number of accidents",
  135. x="Age band of Drivers") + theme(legend.position = "top")
  136. Age=table(Did_Police_Officer_Attend_Scene_of_Accident,Age_Band_of_Driver)
  137. Age
  138. prop.table(Age,2)
  139.  
  140. # Vehicle type
  141.  
  142. p+geom_bar(aes(x=as.factor(Vehicle_Type),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
  143. y="number of accidents",
  144. x="Vehicles Type") + theme(legend.position = "top")
  145. vtype=table(Did_Police_Officer_Attend_Scene_of_Accident,Vehicle_Type)
  146. vtype
  147. prop.table(vtype,2)
  148.  
  149. # Age of vehicle
  150.  
  151. p+geom_bar(aes(x=as.factor(Age_of_Vehicle),fill=Incidents$Did_Police_Officer_Attend_Scene_of_Accident)) + labs(title="Barplot of police attendence",
  152. y="number of accidents",
  153. x="Age of Vehicle") + theme(legend.position = "top")
  154.  
  155. ###################### Casualty #######################################################################
  156.  
  157. #Casualty class
  158. #sex of casualty
  159. p+geom_bar(aes(x=as.factor(Sex_of_Casualty),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Casualty_Class)
  160.  
  161. #age of casualty
  162.  
  163. p+geom_boxplot(aes(x=Did_Police_Officer_Attend_Scene_of_Accident,y=Age_of_Casualty,fill=Did_Police_Officer_Attend_Scene_of_Accident))
  164.  
  165.  
  166. # age band of casualty & casualty severity
  167. p+geom_bar(aes(x=as.factor(Age_Band_of_Casualty),fill=Did_Police_Officer_Attend_Scene_of_Accident))+facet_grid(.~Casualty_Severity)+ theme(legend.position = "top")
  168.  
  169.  
  170. ######## Unsupervised learning ######################################################
  171. set.seed(123)
  172. split=sample.split(Incidents$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.001)
  173. Incidents_us=subset(Incidents,split==TRUE)
  174.  
  175. dendrogram = hclust(d = dist(Incidents_sample_us[,c("Age_of_Casualty","Age_of_Driver","Age_of_Vehicle","Number_of_Vehicles","Number_of_Casualties")],
  176. method = 'euclidean'), method = 'ward.D')
  177.  
  178. plot(dendrogram)
  179. abline()
  180.  
  181.  
  182. ############### PCA analysis ########################################################
  183.  
  184. set.seed(123)
  185. split2=sample.split(Incidents$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.01)
  186. Incidents_sample=subset(Incidents,split2==TRUE)
  187.  
  188.  
  189. pca_incidents=prcomp(Incidents_sample[,c("Age_of_Casualty","Age_of_Driver","Age_of_Vehicle","Number_of_Vehicles","Number_of_Casualties")],scale = TRUE,center =T )
  190. plot(pca_incidents)
  191.  
  192. pca_incidents_var=pca_incidents$sdev^2
  193. pca_incidents_PEV= pca_incidents_var / sum(pca_incidents_var)
  194.  
  195. plot(pca_incidents_PEV, xlab = "Principal Component",
  196. ylab = "Proportion of Variance Explained",
  197. ylim = c(0, 1), type = "b")
  198.  
  199. opar <- par()
  200. plot(cumsum(pca_incidents_PEV), xlab = "Principal Component",
  201. ylab = "Cumulative Proportion of Variance Explained",
  202. ylim = c(0, 1), type = "b")
  203.  
  204. abline(h = 0.8, col = 'red', lty = 'dashed')
  205. par(opar)
  206.  
  207. biplot(pca_incidents)
  208.  
  209.  
  210. pca = preProcess(x = train_Incidents_ml[-33], method = 'pca', pcaComp = 3)
  211. training_set_pca = predict(pca,train_Incidents_ml )
  212. training_set_pca = training_set_pca[c(2, 3,4, 1)]
  213. test_set_pca = predict(pca, test_Incidents_ml)
  214. test_set_pca = test_set_pca[c(2, 3,4, 1)]
  215. ################### Data Preperation ###############################
  216.  
  217. Incidents_ml=Incidents[,-c(1:2,7,9,11)]
  218.  
  219. Incidents_ml=Incidents_ml[,c(1:15,17:33,16)]
  220.  
  221. set.seed(123)
  222. split3=sample.split(Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.01)
  223. Incidents_ml=subset(Incidents_ml,split3==TRUE)
  224.  
  225.  
  226. # Test and train split
  227. set.seed(678)
  228. split4=sample.split(Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,SplitRatio = 0.75)
  229. train_Incidents_ml=subset(Incidents_ml,split4==TRUE)
  230. test_Incidents_ml=subset(Incidents_ml,split4==FALSE)
  231.  
  232. # Feature scale
  233. train_Incidents_ml[-33] = scale(train_Incidents_ml[-33])
  234. test_Incidents_ml[-33] = scale(test_Incidents_ml[-33])
  235. ##################### Svm ###############################################
  236. install.packages("e1071")
  237. library(e1071)
  238.  
  239. svm_model_pca=svm(Did_Police_Officer_Attend_Scene_of_Accident~.,data=training_set_pca,
  240. type="C-classification",
  241. kernel="linear")
  242.  
  243. pred_svm_pca=predict(svm_model_pca,test_set_pca[-4])
  244. prob_svm_pca=predict(svm_model_pca,test_set_pca[-4],type="prob")
  245. svm_model_pca
  246. #################### Decision Tree #######################################
  247.  
  248. dt_model=rpart(Did_Police_Officer_Attend_Scene_of_Accident~.,data = train_Incidents_ml)
  249.  
  250. pred_dt=predict(dt_model,test_Incidents_ml[-33],type = "class")
  251. prob_dt=predict(dt_model,test_Incidents_ml[-33],type="prob")
  252. dt_model
  253.  
  254. #################### Random forrest #######################################
  255. install.packages("randomForest")
  256. library(randomForest)
  257.  
  258. rf_model=randomForest(x=train_Incidents_ml[,-33],
  259. y=train_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
  260. importance = TRUE,
  261. ntree = 1000)
  262.  
  263. pred_rf=predict(rf_model,test_Incidents_ml[-33])
  264. prob_rf=predict(rf_model,test_Incidents_ml[-33],type = "prob")
  265.  
  266. ###### Preformance measure ########################################
  267.  
  268. # Confusion matrix
  269. confusionMatrix(data = pred_dt,
  270. reference = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident)
  271.  
  272. confusionMatrix(data = pred_rf,
  273. reference = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident)
  274.  
  275.  
  276. # ROC & AUC plots
  277. roc_dt=roc(test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,prob_dt[,2])
  278. roc_rf=roc(test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,prob_rf[,2])
  279.  
  280. par(mfrow=c(1,2))
  281. plot(roc_dt,main="Decision Tree")
  282. plot(roc_rf,main="RandomForest")
  283.  
  284.  
  285. auc(roc_dt)
  286. auc(roc_rf)
  287.  
  288. tree_predict <- cbind(
  289. actual = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
  290. predicted = predict(dt_model,test_Incidents_ml[-33] , type = 'class'),
  291. predict(dt_model,test_Incidents_ml[-33], type = 'prob')
  292. )
  293.  
  294. rf_predict <- cbind(
  295. actual = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
  296. predicted = predict(rf_model, test_Incidents_ml[-33], type = 'class'),
  297. True=predict(rf_model, test_Incidents_ml[-33], type = "prob")
  298. )
  299.  
  300.  
  301. models_prob <- data.frame(
  302. dt = tree_predict$True,
  303. rf = rf_predict$True
  304. )
  305.  
  306. models_label <- data.frame(
  307. dt = tree_predict$actual,
  308. rf =rf_predict$actual
  309.  
  310. )
  311.  
  312.  
  313. svm_ROC_pred = prediction(tchurn_models_prob, tchurn_label)
  314. svm_ROC_perf = performance(tchurn_ROC_pred, "tpr", "fpr")
  315.  
  316.  
  317.  
  318. rf_predict <- cbind(
  319. actual = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
  320. predicted = predict(rf_model, test_Incidents_ml[-33], type = 'raw'),
  321. predict(rf_model, test_Incidents_ml[-33], type = 'prob')
  322. )
  323.  
  324. attributes(svm_predict)
  325.  
  326. # Model Tuning
  327. plot
  328.  
  329. res <- tuneRF(x = subset(train_Incidents_ml, select = -Did_Police_Officer_Attend_Scene_of_Accident),
  330. y = train_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
  331. ntreeTry = 1000)
  332.  
  333.  
  334. # Random Search
  335. control <- trainControl(method="repeatedcv", number=10, repeats=3, search="random")
  336. set.seed(123)
  337. mtry <- sqrt(ncol(x))
  338. rf_random <- train(Class~., data=train, method="rf", metric=metric, tuneLength=15, trControl=control)
  339. print(rf_random)
  340. plot(rf_random)
  341.  
  342. plot(rf_model)
  343. train_Incidents_ml=train_Incidents_ml[,-5]
  344. test_Incidents_ml=test_Incidents_ml[,-5]
  345. rf_model2=randomForest(x=train_Incidents_ml[,-32],
  346. y=train_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident,
  347. importance = TRUE,
  348. ntree = 1000,mtry = 5)
  349.  
  350. pred_rf2=predict(rf_model2,test_Incidents_ml[-32])
  351. prob_rf2=predict(rf_model2,test_Incidents_ml[-32],type = "prob")
  352.  
  353. confusionMatrix(data = pred_rf2,
  354. reference = test_Incidents_ml$Did_Police_Officer_Attend_Scene_of_Accident)
Add Comment
Please, Sign In to add comment