set.seed(212)

#Get/check working directory
getwd()
#Set working directory
setwd("/Users/ganeshharugeri/Documents/R_workspace/input/PracticeTest2017/")

#Import data set
#Build file paths from the working directory
filename1 <- paste(c(getwd(), "/Validation.csv"), collapse = "")
filename2 <- paste(c(getwd(), "/Training.csv"), collapse = "")
#Read the files (semicolon-separated, with a header row)
Validation <- read.csv2(filename1, sep = ";", header = TRUE, stringsAsFactors = TRUE)
Training   <- read.csv2(filename2, sep = ";", header = TRUE, stringsAsFactors = TRUE)


#Structure of both datasets
str(Training)
str(Validation) #classlabel is absent

#Check summary
summary(Validation)
summary(Training)

#Check sample data
head(Validation)
head(Training)
#Check structure
str(Validation)
str(Training)

dim(Validation)
dim(Training)
# Binding training and validation data (Backup keeps an untouched copy)
Backup <- rbind(Training, Validation)

All_Data <- rbind(Training, Validation)
str(All_Data)

#Check the number of unique values in each column
sapply(All_Data, function(x) length(unique(x)))

# Missing value entries
nrowCount = nrow(All_Data)
ncol(All_Data)
#Complete rows, i.e. rows without missing values
completeRows = sum(complete.cases(All_Data))

#Proportion of rows without missing entries
prop = sum(complete.cases(All_Data)) / nrow(All_Data)
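# Minor illustrative addition: print the share of complete rows explicitly,
# so it is visible when the script is sourced rather than run interactively
print(prop)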

#Number of missing values overall
table(is.na(All_Data))

#Check for missing values by column
colSums(is.na(All_Data))
colSums(is.na(Training))
colSums(is.na(Validation))
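# Illustrative addition: the per-column *share* of missing values is often more
# useful than raw counts when deciding which columns to drop (see v95 below)
round(colMeans(is.na(All_Data)) * 100, 1)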

#Analyse the variable dependency
library(ggplot2)
head(All_Data)

#Arrange the data by column names
temp = NULL
temp = All_Data[ , order(names(All_Data[1:21]))]
temp$v7 = NULL
temp$v9 = NULL
str(temp)
head(temp)

#Reorder as required: v7 and v9 first, classlabel last
temp <- cbind(v7 = All_Data$v7, v9 = All_Data$v9, temp, classlabel = All_Data$classlabel)

dim(temp)
#Assign back to All_Data
All_Data = NULL
All_Data <- temp
temp <- NULL

head(All_Data)
str(All_Data)

#Univariate analysis plots
df = data.frame(All_Data$v7)

#Quick plots
qplot(All_Data$v7, All_Data$classlabel)
qplot(All_Data$v9, All_Data$classlabel)
qplot(All_Data$v42, All_Data$classlabel)

#Advanced plots
ggplot(All_Data) + geom_bar(aes(classlabel))

#Remove v95, as more than 55% of its values are NA
All_Data$v95 = NULL
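# Quick check of the claim above (Backup still holds the raw combined data, including v95):
mean(is.na(Backup$v95))   # expected to be above 0.55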
#dplyr and tidyr are required for the pipe (%>%) and fill() functions
library(dplyr)
library(tidyr)


All_Data <- All_Data %>% fill(v7, v12, v32, v33, v99, v85)
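# Note: tidyr::fill() defaults to .direction = "down" (last observation carried forward),
# so an NA in the very first row of any of these columns would remain NA.
# A quick verification that the imputation worked (illustrative):
colSums(is.na(All_Data[, c("v7", "v12", "v32", "v33", "v99", "v85")]))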

All_Data$v9 <- factor(All_Data$v9)

table(All_Data$v24)

qplot(All_Data$v24)


# Standardisation (min-max scaling) function
range01 <- function(x){(x - min(x)) / (max(x) - min(x))}
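# Illustrative sanity check: range01 maps any numeric vector onto [0, 1]
range01(c(2, 4, 6))   # 0.0 0.5 1.0
# Note that range01 returns NA throughout if the input contains any NA,
# so missing values must be imputed first (as done for v42 and v55 below)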
#Keep an unscaled copy of a test column
TestCol = All_Data$v20
#Apply the scaling function
All_Data$v20 = range01(All_Data$v20)
All_Data$v24 = range01(All_Data$v24)
#All_Data$v42 = Backup$v42
#Impute missing values with the column median before scaling
All_Data$v42[is.na(All_Data$v42)] <- median(All_Data$v42, na.rm = T)
All_Data$v55[is.na(All_Data$v55)] <- median(All_Data$v55, na.rm = T)

All_Data$v42 = range01(All_Data$v42)
All_Data$v55 = range01(All_Data$v55)


summary(All_Data)
sapply(All_Data, function(x) length(unique(x)))

#Rename label values (note: loading plyr after dplyr masks some dplyr verbs; only mapvalues is used here)
library(plyr)
All_Data$classlabel <- mapvalues(All_Data$classlabel, from = c("yes.", "no."), to = c("yes", "no"))
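# Quick check that the relabelling took effect (illustrative):
table(All_Data$classlabel)   # should now show only "yes" and "no"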

#See the output
# col = NULL
# chk = NULL
# df = NULL


#Building models (load the modelling and evaluation libraries up front)
library(randomForest)
library(e1071)   # svm()
library(caret)   # confusionMatrix()

#Splitting data: the first 3700 rows are the training portion, the last 200 the validation portion
train = All_Data[1:3700,]
test = NULL
test = All_Data[3701:3900, 1:20]
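# The hard-coded 3700/200 split assumes the row counts match the original files;
# a defensive check (illustrative):
stopifnot(nrow(Training) == 3700, nrow(Validation) == 200)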

#Random forest
rfmodel <- randomForest(classlabel ~ v68 + v99 + v50 + v20 + v53, data = train, importance = TRUE, ntree = 69)
predicted <- predict(rfmodel, test)

#Confusion matrix for the random forest
confusionMatrix(predicted, All_Data[3701:3900,]$classlabel)

#Variable importance
varImpPlot(rfmodel, sort = T, n.var = 10, main = "Variable importance")


#SVM model
svmmodel <- svm(classlabel ~ ., data = train)
pred <- predict(svmmodel, test)

#Accuracy test [Evaluation]
#caret and e1071 were loaded above, before the models were built



#Confusion matrix for SVM
confusionMatrix(pred, All_Data[3701:3900,]$classlabel)

# cbind(predicted, All_Data[3701:3900,]$classlabel)
compare = NULL
compare <- cbind(Actual = All_Data[3701:3900,]$classlabel, rfPrediction = predicted, SVMPrediction = pred)
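# Note: cbind() coerces the factors to their integer level codes (1/2). An alternative that
# keeps the "yes"/"no" labels (illustrative; compare_df is a new name, not in the original):
compare_df <- data.frame(Actual = All_Data[3701:3900,]$classlabel,
                         rfPrediction = predicted,
                         SVMPrediction = pred)
head(compare_df)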