Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- setwd("D:/Users")
- data<-read.delim("winequality-white.csv",header=TRUE,sep=";")
- names(data)
- # Removing the Duplicate Rows
- data <- data[!duplicated(data), ]
- dim(data)
- # Checking if there are any missing (NA) values
- sum(is.na(data))
- # Count Response Variable
- table(data$quality)
- round(cor(data, method = "pearson"), 2)
- library(reshape)
- library(ggplot2)
- melt_wine2 <- melt(data, "quality")
- ggplot(melt_wine2, aes(value, quality, color = variable)) + geom_point() + geom_smooth(aes(value,quality, colour=variable), method=lm, se=FALSE)+facet_wrap(.~variable, scales = "free")
- model<-lm(quality~.,data=data)
- # fitting regression
- summary(model)
- ######################################################################################################################################
- # leverage and influential point
- # outliers
- wine_reg<-lm(quality~.,data=data)
- sum.fit<-summary(wine_reg)
- # fitted value
- yhat<-round(wine_reg$fitted.values,3)
- # residual
- ei<-round(wine_reg$residuals,3)
- # standardized residual
- di<-round(wine_reg$residuals/sum.fit$sigma,3)
- # studentized residual
- ri<-round(rstandard(wine_reg),3)
- #PRESS
- hii<-round(hatvalues(wine_reg),3)
- press<-round(ei/(1-hii),3)
- # R student residual
- ti<-round(rstudent(wine_reg),3)
- table <-cbind(yhat,ei,di,hii,ri,press,ti)
- options(max.print=999999)
- dffits.i<-round(dffits(model=wine_reg),3)
- cook.i<-round(cooks.distance(model=wine_reg),3)
- dfbeta.all<-round(dfbetas(model= wine_reg),3)
- table2<-cbind(dffits.i,cook.i,dfbeta.all)
- #bonferroni
- #H0: Not an outlier
- #H1: OUtlier
- n<-length(wine_reg$fitted.values)
- qt(p=1-0.05/(2*n),df=wine_reg$df.residual-1)
- #observation 254,446,2782,3308,4746 failed bonferroni test
- #therefrore they are outliers
- # plot cook's distance
- plot (cook.i, pch="*", cex=2, main = "Influential Observation by Cooks distance")
- abline(h=4*mean(cook.i, na.rm=T), col="red")
- text(x=1:length(cook.i)+3, y=cook.i, labels=ifelse(cook.i>4*mean(cook.i,na.rm=T),names(cook.i),""),col="red")
- influential <- as.numeric(names(cook.i)[(cook.i>4*mean(cook.i, na.rm=T))])
- head(data[influential,])
- data<-data-c()
- qf(0.2,11,4886, lower.tail = FALSE)
- qf(0.5,11,4886, lower.tail = FALSE)
- wine<-read.delim("winequality-white.csv",header=TRUE,sep=";")
- clean_data<-wine[-c(254,446,2782,3308,4746),]
- #clean_data
- new_reg<-lm(quality~.,data=clean_data)
- summary(new_reg)
- ######################################################################################################################################
- #Model Adequacy Checking
- # before outlier
- data<-read.delim("winequality-white.csv",header=TRUE,sep=";")
- names(data)
- old_reg<-lm(quality~.,data=data)
- plot(x=old_reg$fitted.values,y=old_reg$residuals,
- xlab="Predicted Values",ylab="Residuals",main="old Residuals vs. Predicted values",
- col="blue",pch=19,cex=1.5,panel.first=grid(col="gray",lty="dotted"))
- abline(h=0,col="red")
- qq.plot<-qqnorm(old_reg$residuals,main = " old Normal Probability Plot",
- xlab = "Theoretical Quantiles", ylab = "Sample Quantiles",
- plot.it = TRUE,col="blue", pch = 19, cex=1.5,
- panel.first = grid(col = "gray", lty = "dotted"))
- abline(lm(qq.plot$y~qq.plot$x),col="red")
- # after outlier
- plot(x=new_reg$fitted.values,y=new_reg$residuals,
- xlab="Predicted Values",ylab="Residuals",main="Residuals vs. Predicted values",
- col="blue",pch=19,cex=1.5,panel.first=grid(col="gray",lty="dotted"))
- abline(h=0,col="red")
- qq.plot<-qqnorm(new_reg$residuals,main = "Normal Probability Plot",
- xlab = "Theoretical Quantiles", ylab = "Sample Quantiles",
- plot.it = TRUE,col="blue", pch = 19, cex=1.5,
- panel.first = grid(col = "gray", lty = "dotted"))
- abline(lm(qq.plot$y~qq.plot$x),col="red")
- # constant variance violated, no violation on normality
- ######################################################################################################################################
- # citric acid, chlorides and total sulfur have high P value
- # build a new model without above attributes
- model_reduced1<- lm(quality~fixed.acidity+volatile.acidity+residual.sugar+free.sulfur.dioxide+density+pH+sulphates+alcohol,data=clean_data)
- summary(model_reduced1)
- anvF<-anova(model)
- anvR1<-anova(model_reduced1)
- # workflow will be finding (SSER1-SSEF)/(DF)/(MSEF)
- # Reduced model 1
- # H0: B1=B3=B5=B7=0
- # H1: At least one Beta are non zero
- SSER1<-anvR1$`Sum Sq`[8]
- SSEF<-anvF$`Sum Sq`[12]
- NDF<-sum(anvF$Df[1:11])-sum(anvR1$Df[1:7])
- DDF<-anvF$Df[12]
- FstatR1<- (SSER1-SSEF)/NDF/(SSEF/DDF)
- qf(0.05,4,anvF$"Df"[12], lower.tail = FALSE)
- FstatR1
- # since f value of R2 is less than F at 0.05, do not reject H0
- # we determined that B3(citric acid),B5(Chlorides), B7(Total Sulfur Dioxide) are insignificant
- ######################################################################################################################################
- # Finding Interaction Point
- wine.Reg<-lm(quality~fixed.acidity+volatile.acidity+residual.sugar+free.sulfur.dioxide+density+pH+sulphates+alcohol,data=clean_data)
- summary(wine.Reg)
- x1<-clean_data$fixed.acidity
- x2<-clean_data$volatile.acidity
- x3<-clean_data$residual.sugar
- x4<-clean_data$free.sulfur.dioxide
- x5<-clean_data$density
- x6<-clean_data$pH
- x7<-clean_data$sulphates
- x8<-clean_data$alcohol
- Y<-clean_data$quality
- # we interact all the chemical components in the features
- #sugar, sulfur.dioxide, suplhates and alcohol is chemical components
- cor(x3,x4)
- cor(x4,x7)
- cor(x4,x8)
- cor(x7,x8)
- #only sugar, sulfur dioxide and suplhates possitively related
- inter<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+I(x3*x4*x7))
- summary(inter)
- inter2<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+I(x3*x4))
- summary(inter2)
- inter3<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+I(x3*x7))
- summary(inter3)
- inter4<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+I(x4*x7))
- summary(inter4)
- # we decided to use model with 3 interaction since it has the higehst adjR
- ######################################################################################################################################
- # transform response variable x1 x3 x5
- x9<-x3*x4*x7
- quad_model<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+x9)
- sx1<-x1-mean(x1)
- px1_reg<-lm(Y~sx1+I(exp(-sx1))+x2+x3+x4+x5+x6+x7+x8+x9)
- summary(px1_reg)
- sx3<-x3-mean(x3)
- px3_reg<-lm(Y~x1+x2+sx3+I(exp(-sx3))+x4+x5+x6+x7+x8+x9)
- summary(px3_reg)
- sx5<-x5-mean(x5)
- px5_reg<-lm(Y~x1+x2+x3+x4+sx5+I(exp(-sx5))+x6+x7+x8+x9)
- summary(px5_reg)
- # how about x9?
- plot(x9,Y,xlab ="Fixed Acidity",ylab="Quality",main="Quality vs. Fixed Acidity", col="red", pch =19,cex=1.5,panel.first=grid(col="gray",lty="dotted"))
- abline(lm(Y~x9))
- sx9<-x9-mean(x9)
- px9_reg<-lm(Y~x1+x2+x3+x4+x5+x6+x7+x8+sx9+I(exp(-sx9)))
- summary(px9_reg)
- #box cox for Y
- library(MASS)
- save.bc<-boxcox(object = quad_model, lambda = seq(from = -2,to = 2, by = 0.01))
- title(main = "Box-Cox transformation plot")
- lambda.hat<-save.bc$x[save.bc$y == max(save.bc$y)]
- lambda.hat
- #concluded model with all increased adjR
- Yt<-Y^0.68
- final_quad_model<-lm(Yt~sx1+I(exp(-sx1))+x2+sx3+I(exp(-sx3))+x4+sx5+I(exp(-sx5))+x6+x7+x8+sx9+I(exp(-sx9)))
- summary(final_quad_model)
- #final model model adequcy checking
- plot(x=final_quad_model$fitted.values,y=final_quad_model$residuals,
- xlab="Predicted Values",ylab="Residuals",main="Final Model Residuals vs. Predicted values",
- col="blue",pch=19,cex=1.5,panel.first=grid(col="gray",lty="dotted"))
- abline(h=0,col="red")
- qq.plot<-qqnorm(final_quad_model$residuals,main = "Normal Probability Plot",
- xlab = "Theoretical Quantiles", ylab = "Sample Quantiles",
- plot.it = TRUE,col="blue", pch = 19, cex=1.5,
- panel.first = grid(col = "gray", lty = "dotted"))
- abline(lm(qq.plot$y~qq.plot$x),col="red")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement