Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Linear Regression Model
#
# Packages used throughout this script; the original never attached them even
# though it calls sqldf(), ggplot(), separate(), stepAIC() and vif().
library(sqldf)    # DISTINCT-level checks via SQL
library(ggplot2)  # histograms for outlier inspection
library(tidyr)    # separate() for splitting CarName
library(MASS)     # stepAIC()
library(car)      # vif() -- the 'car' data frame below shadows only the name, not the package functions

# Load the raw data; character columns kept as characters (converted manually below)
car <- read.csv("CarPrice_Assignment.csv", stringsAsFactors = F)
View(car)
str(car)
summary(car)
# Data dictionary:
# CarPrice_Assignment data.frame contains 205 obs with 26 variables
# Variables give technical and performance specifications of car models and price
# Variables are of class 'num', 'int' & 'char'
# Business objective is to focus on independent variables that have a tight correlation with price of the car
############################################## Data Preparation ##################################################################################
# Checking for duplicate obs in car data.frame
sum(duplicated(car))
sum(duplicated(car$car_ID))
# NIL duplicates found, looking for NAs
sum(is.na(car))
# NIL NAs found
# Categorical variables of 'char' type will be converted to binaries for ease of analysis.
# fueltype: levels sort alphabetically, so the relabelling maps the first level
# to 1 and the second to 0 (presumably diesel -> 1, gas -> 0 -- confirm against data).
car$fueltype <- as.factor(car$fueltype)
levels(car$fueltype) <- c(1, 0)
car$fueltype <- as.numeric(levels(car$fueltype))[car$fueltype]
summary(car$fueltype)
# Checking the number of distinct 'levels' or 'factors' before conversion to binary.
# When only two levels are present, variable values can be set to (1, 0).
distinct_vals <- sqldf("select DISTINCT aspiration from car")
View(distinct_vals)
car$aspiration <- as.factor(car$aspiration)
levels(car$aspiration) <- c(1, 0)  # first (alphabetical) level -> 1, second -> 0
car$aspiration <- as.numeric(levels(car$aspiration))[car$aspiration]

distinct_vals <- sqldf("select DISTINCT doornumber from car")
View(distinct_vals)
car$doornumber <- as.factor(car$doornumber)
levels(car$doornumber) <- c(1, 0)  # first (alphabetical) level -> 1, second -> 0
car$doornumber <- as.numeric(levels(car$doornumber))[car$doornumber]
summary(car$doornumber)
distinct_vals <- sqldf("select DISTINCT carbody from car")
View(distinct_vals)
# No. of levels more than two, need to assign dummy values:
# create a matrix of dummy variables, convert to data.frame and cbind with the main data.frame.
# This helps in significance analysis of all possible independent variables.
dummy <- data.frame(model.matrix(~ factor(carbody), data = car))
View(dummy)
dummy <- dummy[, -1]  # drop the intercept column; the k-1 level dummies remain
View(dummy)
car1 <- cbind(car[, -7], dummy)  # replace carbody (assumed column 7 -- TODO confirm) with its dummies
summary(car1)
distinct_vals <- sqldf("select DISTINCT drivewheel from car")
View(distinct_vals)
# drivewheel: more than two levels -> dummy columns, intercept dropped
dummy1 <- data.frame(model.matrix(~ factor(drivewheel), data = car1))
dummy1 <- dummy1[, -1]
car2 <- cbind(car1[, -7], dummy1)  # drop original drivewheel (assumed column 7 -- TODO confirm)
distinct_vals <- sqldf("select DISTINCT enginelocation from car")
View(distinct_vals)
# enginelocation: two levels; first (alphabetical) level -> 1, second -> 0
car2$enginelocation <- as.factor(car2$enginelocation)
levels(car2$enginelocation) <- c(1, 0)
car2$enginelocation <- as.numeric(levels(car2$enginelocation))[car2$enginelocation]
distinct_vals <- sqldf("select DISTINCT enginetype from car2")
View(distinct_vals)
# enginetype: multi-level -> dummies, intercept dropped
dummy2 <- data.frame(model.matrix(~ factor(enginetype), data = car2))
dummy2 <- dummy2[, -1]
car3 <- cbind(car2[, -13], dummy2)  # drop enginetype (assumed column 13 -- TODO confirm)
distinct_vals <- sqldf("select DISTINCT fuelsystem from car3")
View(distinct_vals)
# fuelsystem: multi-level -> dummies, intercept dropped
dummy3 <- data.frame(model.matrix(~ factor(fuelsystem), data = car3))
View(dummy3)
dummy3 <- dummy3[, -1]
car4 <- cbind(car3[, -15], dummy3)  # drop fuelsystem (assumed column 15 -- TODO confirm)
# As per the problem statement, only the name of the car company is to be retained.
# CarName ("brand model") is split on the first space; the model part is discarded.
car5 <- separate(car4, CarName, c("brand", "model"), sep = " ")
car5 <- car5[, -4]  # drop the 'model' column (assumed column 4 -- TODO confirm)
# Spelling errors / aliases in brand names are rectified
car5$brand <- replace(car5$brand, car5$brand == "toyouta", "toyota")
car5$brand <- replace(car5$brand, car5$brand == "vokswagen", "volkswagen")
car5$brand <- replace(car5$brand, car5$brand == "vw", "volkswagen")
car5$brand <- replace(car5$brand, car5$brand == "Nissan", "nissan")
car5$brand <- replace(car5$brand, car5$brand == "porchsce", "porsche")
car5$brand <- replace(car5$brand, car5$brand == "maxda", "mazda")
# Dummy variables created for brand names to analyse significance
dummy4 <- data.frame(model.matrix(~ factor(brand), data = car5))
dummy4 <- dummy4[, -1]
car6 <- cbind(car5[, -3], dummy4)  # drop brand (assumed column 3 -- TODO confirm)
# Dummy variables for 'symboling' (treated as categorical)
distinct_vals <- sqldf("select DISTINCT symboling from car6")
View(distinct_vals)
dummy5 <- data.frame(model.matrix(~ factor(symboling), data = car6))
dummy5 <- dummy5[, -1]
car7 <- cbind(car6[, -2], dummy5)  # drop symboling (assumed column 2 -- TODO confirm)
# Dummy variables for number of cylinders
distinct_vals <- sqldf("select DISTINCT cylindernumber from car7")
View(distinct_vals)
# BUG FIX: the original built these dummies from factor(symboling) a second time.
# The comment, the DISTINCT check above, and the later references to
# factor.cylindernumber.* columns all show cylindernumber was intended.
dummy6 <- data.frame(model.matrix(~ factor(cylindernumber), data = car7))
dummy6 <- dummy6[, -1]
car8 <- cbind(car7[, -2], dummy6)  # drop cylindernumber (assumed column 2 -- TODO confirm)
# Derived metric: average of city and highway mpg taken as a single independent variable
car8$avg.mpg <- (car8$citympg + car8$highwaympg) / 2
View(car8)
car9 <- car8[, -c(17, 18)]  # drop citympg and highwaympg (assumed columns 17-18 -- TODO confirm)
# Carlength classified as "smallcar" (141-157), "midsize" (158-197) & "luxury" (above 197).
# BUG FIX: the original re-labelled factor levels by position
# (levels(...)[1:8] <- ..., with overlapping index ranges), which is fragile,
# data-dependent and does not implement the documented thresholds. cut() applies
# the stated breakpoints directly.
car9$carlength <- cut(car9$carlength,
                      breaks = c(-Inf, 157, 197, Inf),
                      labels = c("smallcar", "midsize", "luxury"))
unique(car9$carlength)
dummy7 <- data.frame(model.matrix(~ carlength, data = car9))
dummy7 <- dummy7[, -1]  # intercept dropped; remaining dummies: carlengthmidsize, carlengthluxury
car10 <- cbind(car9[, -7], dummy7)  # drop original carlength (assumed column 7 -- TODO confirm)
# Variables with just '1' positive obs are identified (each sum below is expected to be 1)
sum(car9$factor.cylindernumber.twelve)
sum(car9$factor.cylindernumber.three)
sum(car9$factor.fuelsystem.spfi)
sum(car9$factor.fuelsystem.mfi)
sum(car9$factor.enginetype.dohcv)
sum(car9$factor.brand.renault)
sum(car9$factor.brand.mercury)
# Variables with just 1 obs discarded as they would be insignificant
car10 <- car10[, -c(22, 31, 34, 43, 45, 51)]
# BUG FIX: the original did 'car10 <- car9[,-1]' here, which silently threw away
# the carlength dummies and the single-obs column removal above. Drop car_ID
# from car10 itself; it does not add to price significance.
car10 <- car10[, -1]
#### Checking for outliers in numerical variables ##############
ggplot(data = car10, aes(car10$wheelbase)) + geom_histogram(col = "red", fill = "green", alpha = .2)
quantile(car10$wheelbase, probs = c(0.05, 0.95))
quantile(car10$wheelbase, seq(0, 1, 0.01))
# Capping the outliers at 93.02 and 110.00, beyond which there are sharp changes
car10$wheelbase[which(car10$wheelbase > 110.00)] <- 110.00
car10$wheelbase[which(car10$wheelbase < 93.02)] <- 93.02
# Outliers for enginesize
# BUG FIX: the original inspected and capped 'car' (the raw frame) here, so the
# cap never reached the modelling frame; apply it to car10.
outlier <- boxplot.stats(car10$enginesize)$out
car10$enginesize[which(car10$enginesize > 209.00)] <- 209.00
# Outliers for curbweight
quantile(car10$curbweight, seq(0, 1, 0.01))
# Sharp changes at 1% and 98% in curbweight capped
# BUG FIX: the original referenced a non-existent 'carprice' object here.
car10$curbweight[which(car10$curbweight < 1819.72)] <- 1819.72
car10$curbweight[which(car10$curbweight > 3768.40)] <- 3768.40
# Outlier check for price (inspected only; no cap applied)
outlier <- boxplot.stats(car10$price)$out
ggplot(data = car10, aes(car10$price)) + geom_histogram(col = "red", fill = "green", alpha = .2)
# Outlier check for horsepower
ggplot(data = car10, aes(car10$horsepower)) + geom_histogram(col = "red", fill = "green", alpha = .2)
outlier <- boxplot.stats(car10$horsepower)$out
car10$horsepower[which(car10$horsepower > 184.00)] <- 184.00
######################################### Linear Regression Modeling ####################################
# Set the seed to 100 so the train/test split is reproducible
set.seed(100)
# Randomly generate row indices for the train dataset (70/30 split)
trainindices <- sample(1:nrow(car10), 0.7 * nrow(car10))
# Generate the train data set
train <- car10[trainindices, ]
# Store the rest of the observations in "test"
test <- car10[-trainindices, ]
# Run the first model on the training dataset
# BUG FIX: the original fit on the full car10 despite the comment; fitting on
# train keeps the held-out test set unseen by the model.
model.1 <- lm(price ~ ., data = train)
summary(model.1)
# Check correlation among all variables
corrs <- cor(car10)
View(corrs)
# Run stepAIC to examine which variables are insignificant
step <- stepAIC(model.1, direction = "both")
# Based on inputs from the correlation matrix and stepAIC, insignificant variables removed.
# Model.2 cleaned of carheight, doornumber, compressionratio, drivewheel, cylindernumber...
# Checked for p-value more than 0.35, VIF in double digits
model.2 <- lm(formula = price ~ fueltype + aspiration + enginelocation + carwidth + curbweight +
                enginesize + boreratio + stroke + horsepower + peakrpm +
                factor.carbody.hardtop + factor.carbody.hatchback + factor.carbody.sedan + factor.carbody.wagon +
                factor.enginetype.rotor + factor.brand.audi + factor.brand.bmw + factor.brand.buick + factor.brand.chevrolet +
                factor.brand.jaguar + factor.brand.peugeot + factor.brand.plymouth + factor.brand.porsche +
                factor.brand.saab + factor.brand.toyota + factor.symboling..1 +
                factor.cylindernumber.five + factor.cylindernumber.twelve +
                avg.mpg + carlengthmidsize + carlengthluxury, data = train)
summary(model.2)
# Model.2 has high adjusted R-squared but insignificant variables with p-value > 0.35.
# (Renamed model_3 -> model.3 for consistency with the dotted naming used elsewhere.)
model.3 <- lm(formula = price ~ aspiration + enginelocation + carwidth + curbweight + enginesize + boreratio + stroke +
                horsepower + peakrpm + factor.carbody.hardtop + factor.carbody.hatchback + factor.carbody.sedan +
                factor.carbody.wagon + factor.enginetype.rotor + factor.brand.audi + factor.brand.bmw + factor.brand.buick +
                factor.brand.chevrolet + factor.brand.jaguar + factor.brand.peugeot + factor.brand.porsche + factor.brand.saab +
                factor.symboling..1 + factor.cylindernumber.five + avg.mpg + carlengthmidsize + carlengthluxury, data = train)
summary(model.3)
vif(model.3)
summary(model.3)
# Model.3 has variables with VIF > 6 and high p-value.
# Correlation check between technically related variables curbweight and enginesize:
cor(car10$curbweight, car10$enginesize)
# At 0.85 corr, curbweight (higher VIF and p-value) has to go
model.4 <- lm(formula = price ~ aspiration + enginelocation + carwidth + enginesize + boreratio + stroke + peakrpm +
                factor.carbody.hardtop + factor.carbody.hatchback + factor.carbody.sedan + factor.carbody.wagon +
                factor.enginetype.rotor + factor.brand.audi + factor.brand.bmw + factor.brand.buick + factor.brand.chevrolet +
                factor.brand.jaguar + factor.brand.peugeot + factor.brand.porsche + factor.brand.saab + factor.symboling..1 +
                factor.cylindernumber.five + avg.mpg + carlengthmidsize + carlengthluxury, data = train)
summary(model.4)
vif(model.4)
summary(model.4)
# carwidth, enginesize, cylindernumber removed.
# avg.mpg, though seeming insignificant, domain knowledge prompts retention.
model.5 <- lm(formula = price ~ aspiration + enginelocation + boreratio +
                stroke + peakrpm + factor.carbody.hardtop + factor.carbody.hatchback +
                factor.carbody.sedan + factor.enginetype.rotor + factor.brand.audi +
                factor.brand.bmw + factor.brand.buick + factor.brand.chevrolet +
                factor.brand.jaguar + factor.brand.porsche + factor.brand.saab +
                factor.symboling..1 + avg.mpg + carlengthmidsize, data = train)
summary(model.5)
# BUG FIX: the original called vif(model-5), i.e. arithmetic 'model' minus 5.
vif(model.5)
summary(model.5)
# Car brands, aspiration, cylindernumber, stroke, bore, rpm, carbody types, carwidth,
# symboling removed for high p-value. R-squared decreased; enginesize re-introduced.
model.10 <- lm(formula = price ~ enginesize + enginelocation + factor.brand.bmw + factor.brand.buick + factor.brand.jaguar +
                 avg.mpg + carlengthluxury, data = train)
summary(model.10)
vif(model.10)
summary(model.10)
cor(car10$factor.brand.jaguar, car10$carlengthluxury)
# High correlation: carlengthluxury removed, carwidth re-introduced to check R-squared
model.11 <- lm(formula = price ~ carwidth + enginesize + enginelocation + factor.brand.bmw + factor.brand.buick +
                 factor.brand.jaguar + avg.mpg, data = train)
summary(model.11)
vif(model.11)
summary(model.11)
cor(car10$carwidth, car10$enginesize)
cor(car10$factor.brand.bmw, car10$factor.brand.buick)
# R-squared improved; correlation checked between brands and carwidth/enginesize
model.12 <- lm(formula = price ~ carwidth + enginelocation + factor.brand.bmw + factor.brand.buick +
                 factor.brand.jaguar + avg.mpg, data = train)
summary(model.12)
vif(model.12)
summary(model.12)
# Model.12 with adj. R-squared 0.9055 and all p-values *** seems good.
# Model.11 with adj. R-squared 0.9243 and good p-values is the FINAL model.
# Prediction with model.11 on the test data.
# NOTE(review): test[,-1] drops the test frame's first column -- presumably to
# exclude a column not used as a predictor; confirm it is not 'price' itself.
Predict_1 <- predict(model.11, test[, -1])
test$test_price <- Predict_1
r <- cor(test$price, test$test_price)
# BUG FIX: the original stored both test R-squareds in one 'rsquared' variable,
# so model.11's value (the one the final comment cites) was overwritten.
rsquared_model11 <- cor(test$price, test$test_price)^2
rsquared_model11
# Checking test data with model.12
Predict_4 <- predict(model.12, test[, -1])
test$test_price <- Predict_4
r <- cor(test$price, test$test_price)
rsquared_model12 <- cor(test$price, test$test_price)^2
rsquared_model12
# Test R-squared for model.11 is a decent 0.8555.
# Geely can model their car price on significant specifications avg.mpg, carwidth, enginesize.
# Geely may look closely at car makers BMW, Buick and Jaguar.
Add Comment
Please, Sign In to add comment