Advertisement
khaiwen1111

rcode

Mar 21st, 2020
150
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.67 KB | None | 0 0
  1. setwd("D:/Users")
  2. data<-read.delim("winequality-white.csv",header=TRUE,sep=";")
  3. names(data)
  4. # Removing the Duplicate Rows
  5. data <- data[!duplicated(data), ]
  6. dim(data)
  7.  
  8. # Checking if there are any missing (NA) values
  9. sum(is.na(data))
  10.  
  11. # Count Response Variable
  12. table(data$quality)
  13. round(cor(data, method = "pearson"), 2)
  14.  
  15. library(reshape)
  16. library(ggplot2)
  17. melt_wine2 <- melt(data, "quality")
  18. ggplot(melt_wine2, aes(value, quality, color = variable)) + geom_point() + geom_smooth(aes(value,quality, colour=variable), method=lm, se=FALSE)+facet_wrap(.~variable, scales = "free")
  19.  
  20.  
  21. model<-lm(quality~.,data=data)
  22. # fitting regression
  23. summary(model)
  24. ######################################################################################################################################
  25.  
  26. # leverage and influential point
  27. # outliers
  28. wine_reg<-lm(quality~.,data=data)
  29. sum.fit<-summary(wine_reg)
  30. # fitted value
  31. yhat<-round(wine_reg$fitted.values,3)
  32. # residual
  33. ei<-round(wine_reg$residuals,3)
  34. # standardized residual
  35. di<-round(wine_reg$residuals/sum.fit$sigma,3)
  36. # studentized residual
  37. ri<-round(rstandard(wine_reg),3)
  38. #PRESS
  39. hii<-round(hatvalues(wine_reg),3)
  40. press<-round(ei/(1-hii),3)
  41. # R student residual
  42. ti<-round(rstudent(wine_reg),3)
  43. table <-cbind(yhat,ei,di,hii,ri,press,ti)
  44.  
  45.  
  46. options(max.print=999999)
  47.  
  48. dffits.i<-round(dffits(model=wine_reg),3)
  49. cook.i<-round(cooks.distance(model=wine_reg),3)
  50. dfbeta.all<-round(dfbetas(model= wine_reg),3)
  51. table2<-cbind(dffits.i,cook.i,dfbeta.all)
  52.  
  53. #bonferroni
  54. #H0: Not an outlier
  55. #H1: OUtlier
  56.  
  57. n<-length(wine_reg$fitted.values)
  58. qt(p=1-0.05/(2*n),df=wine_reg$df.residual-1)
  59.  
  60. #observation 254,446,2782,3308,4746 failed bonferroni test
  61. #therefrore they are outliers
  62.  
  63. # plot cook's distance
  64. plot (cook.i, pch="*", cex=2, main = "Influential Observation by Cooks distance")
  65. abline(h=4*mean(cook.i, na.rm=T), col="red")
  66. text(x=1:length(cook.i)+3, y=cook.i, labels=ifelse(cook.i>4*mean(cook.i,na.rm=T),names(cook.i),""),col="red")
  67.  
  68. influential <- as.numeric(names(cook.i)[(cook.i>4*mean(cook.i, na.rm=T))])
  69. head(data[influential,])
  70. data<-data-c()
  71.  
  72. qf(0.2,11,4886, lower.tail = FALSE)
  73. qf(0.5,11,4886, lower.tail = FALSE)
  74.  
  75. wine<-read.delim("winequality-white.csv",header=TRUE,sep=";")
  76. clean_data<-wine[-c(254,446,2782,3308,4746),]
  77. #clean_data
  78. new_reg<-lm(quality~.,data=clean_data)
  79. summary(new_reg)
  80. ######################################################################################################################################
  81. #Model Adequacy Checking
  82. # before outlier
  83. data<-read.delim("winequality-white.csv",header=TRUE,sep=";")
  84. names(data)
  85. old_reg<-lm(quality~.,data=data)
  86. plot(x=old_reg$fitted.values,y=old_reg$residuals,
  87. xlab="Predicted Values",ylab="Residuals",main="old Residuals vs. Predicted values",
  88. col="blue",pch=19,cex=1.5,panel.first=grid(col="gray",lty="dotted"))
  89. abline(h=0,col="red")
  90.  
  91. qq.plot<-qqnorm(old_reg$residuals,main = " old Normal Probability Plot",
  92. xlab = "Theoretical Quantiles", ylab = "Sample Quantiles",
  93. plot.it = TRUE,col="blue", pch = 19, cex=1.5,
  94. panel.first = grid(col = "gray", lty = "dotted"))
  95. abline(lm(qq.plot$y~qq.plot$x),col="red")
  96. # after outlier
  97. plot(x=new_reg$fitted.values,y=new_reg$residuals,
  98. xlab="Predicted Values",ylab="Residuals",main="Residuals vs. Predicted values",
  99. col="blue",pch=19,cex=1.5,panel.first=grid(col="gray",lty="dotted"))
  100. abline(h=0,col="red")
  101.  
  102. qq.plot<-qqnorm(new_reg$residuals,main = "Normal Probability Plot",
  103. xlab = "Theoretical Quantiles", ylab = "Sample Quantiles",
  104. plot.it = TRUE,col="blue", pch = 19, cex=1.5,
  105. panel.first = grid(col = "gray", lty = "dotted"))
  106. abline(lm(qq.plot$y~qq.plot$x),col="red")
  107.  
  108. # constant variance violated, no violation on normality
  109. ######################################################################################################################################
  110.  
  111. # citric acid, chlorides and total sulfur have high P value
  112. # build a new model without above attributes
  113.  
  114. model_reduced1<- lm(quality~fixed.acidity+volatile.acidity+residual.sugar+free.sulfur.dioxide+density+pH+sulphates+alcohol,data=clean_data)
  115. summary(model_reduced1)
  116.  
  117.  
  118. anvF<-anova(model)
  119. anvR1<-anova(model_reduced1)
  120.  
  121. # workflow will be finding (SSER1-SSEF)/(DF)/(MSEF)
  122.  
  123. # Reduced model 1
  124. # H0: B1=B3=B5=B7=0
  125. # H1: At least one Beta are non zero
  126. SSER1<-anvR1$`Sum Sq`[8]
  127. SSEF<-anvF$`Sum Sq`[12]
  128. NDF<-sum(anvF$Df[1:11])-sum(anvR1$Df[1:7])
  129. DDF<-anvF$Df[12]
  130. FstatR1<- (SSER1-SSEF)/NDF/(SSEF/DDF)
  131. qf(0.05,4,anvF$"Df"[12], lower.tail = FALSE)
  132. FstatR1
  133. # since f value of R2 is less than F at 0.05, do not reject H0
  134. # we determined that B3(citric acid),B5(Chlorides), B7(Total Sulfur Dioxide) are insignificant
  135.  
  136.  
  137. ######################################################################################################################################
  138.  
  139. # Finding Interaction Point
  140.  
  141. wine.Reg<-lm(quality~fixed.acidity+volatile.acidity+residual.sugar+free.sulfur.dioxide+density+pH+sulphates+alcohol,data=clean_data)
  142. summary(wine.Reg)
  143. x1<-clean_data$fixed.acidity
  144. x2<-clean_data$volatile.acidity
  145. x3<-clean_data$residual.sugar
  146. x4<-clean_data$free.sulfur.dioxide
  147. x5<-clean_data$density
  148. x6<-clean_data$pH
  149. x7<-clean_data$sulphates
  150. x8<-clean_data$alcohol
  151. Y<-clean_data$quality
  152.  
  153. # we interact all the chemical components in the features
  154. #sugar, sulfur.dioxide, suplhates and alcohol is chemical components
  155. cor(x3,x4)
  156. cor(x4,x7)
  157. cor(x4,x8)
  158. cor(x7,x8)
  159.  
  160. #only sugar, sulfur dioxide and suplhates possitively related
  161. inter<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+I(x3*x4*x7))
  162. summary(inter)
  163. inter2<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+I(x3*x4))
  164. summary(inter2)
  165. inter3<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+I(x3*x7))
  166. summary(inter3)
  167. inter4<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+I(x4*x7))
  168. summary(inter4)
  169.  
  170. # we decided to use model with 3 interaction since it has the higehst adjR
  171. ######################################################################################################################################
  172. # transform response variable x1 x3 x5
  173. x9<-x3*x4*x7
  174. quad_model<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+x9)
  175. sx1<-x1-mean(x1)
  176. px1_reg<-lm(Y~sx1+I(exp(-sx1))+x2+x3+x4+x5+x6+x7+x8+x9)
  177. summary(px1_reg)
  178.  
  179. sx3<-x3-mean(x3)
  180. px3_reg<-lm(Y~x1+x2+sx3+I(exp(-sx3))+x4+x5+x6+x7+x8+x9)
  181. summary(px3_reg)
  182.  
  183. sx5<-x5-mean(x5)
  184. px5_reg<-lm(Y~x1+x2+x3+x4+sx5+I(exp(-sx5))+x6+x7+x8+x9)
  185. summary(px5_reg)
  186.  
  187. # how about x9?
  188. plot(x9,Y,xlab ="Fixed Acidity",ylab="Quality",main="Quality vs. Fixed Acidity", col="red", pch =19,cex=1.5,panel.first=grid(col="gray",lty="dotted"))
  189. abline(lm(Y~x9))
  190.  
  191. sx9<-x9-mean(x9)
  192. px9_reg<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+sx9+I(exp(-sx9)))
  193. summary(px9_reg)
  194.  
  195. #box cox for Y
  196. library(MASS)
  197. save.bc<-boxcox(object = quad_model, lambda = seq(from = -2,to = 2, by = 0.01))
  198. title(main = "Box-Cox transformation plot")
  199. lambda.hat<-save.bc$x[save.bc$y == max(save.bc$y)]
  200. lambda.hat
  201. #concluded model with all increased adjR
  202. Yt<-Y^0.68
  203. final_quad_model<-lm(Yt~sx1+I(exp(-sx1))+x2+sx3+I(exp(-sx3))+x4+sx5+I(exp(-sx5))+x6+x7+x8+sx9+I(exp(-sx9)))
  204. summary(final_quad_model)
  205.  
  206. #final model model adequcy checking
  207. plot(x=final_quad_model$fitted.values,y=final_quad_model$residuals,
  208. xlab="Predicted Values",ylab="Residuals",main="Final Model Residuals vs. Predicted values",
  209. col="blue",pch=19,cex=1.5,panel.first=grid(col="gray",lty="dotted"))
  210. abline(h=0,col="red")
  211.  
  212. qq.plot<-qqnorm(final_quad_model$residuals,main = "Normal Probability Plot",
  213. xlab = "Theoretical Quantiles", ylab = "Sample Quantiles",
  214. plot.it = TRUE,col="blue", pch = 19, cex=1.5,
  215. panel.first = grid(col = "gray", lty = "dotted"))
  216. abline(lm(qq.plot$y~qq.plot$x),col="red")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement