# Linear Regression Model

library(sqldf)      # sqldf() for DISTINCT-value checks
library(tidyr)      # separate() for splitting CarName
library(ggplot2)    # histograms for outlier checks
library(MASS)       # stepAIC()
library(car)        # vif()

car <- read.csv("CarPrice_Assignment.csv", stringsAsFactors = F)

View(car)
str(car)
summary(car)

# Data dictionary
# CarPrice_Assignment data.frame contains 205 obs. of 26 variables
# Variables give technical and performance specifications of car models and price
# Variables are of class 'num', 'int' & 'chr'
# Business objective is to focus on the independent variables that have a tight correlation with the price of the car
############################################## Data Preparation ##################################################################################

# Checking for duplicate obs in car data.frame

sum(duplicated(car))
sum(duplicated(car$car_ID))

# NIL duplicates found, looking for NAs

sum(is.na(car))

# NIL NAs found
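
# Optional sketch (not part of the original checks): a per-column NA count
# would localise missing values if a refreshed extract ever contained any.
colSums(is.na(car))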

# Categorical variables of 'chr' type will be converted to binaries for ease of analysis

car$fueltype <- as.factor(car$fueltype)
levels(car$fueltype) <- c(1,0)
car$fueltype <- as.numeric(levels(car$fueltype))[car$fueltype]
summary(car$fueltype)
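
# An equivalent one-liner (sketch only, not run here since fueltype is already
# converted above). It assumes the raw values are "diesel"/"gas", so that the
# alphabetically first level ("diesel") receives 1, matching the factor approach.
# car$fueltype <- ifelse(car$fueltype == "diesel", 1, 0)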

# Checking the number of distinct 'levels' or 'factors' before conversion to binary
# When only two levels are present, variable values can be set to (1,0)

array_name = sqldf("select DISTINCT aspiration from car")
View(array_name)
car$aspiration <- as.factor(car$aspiration)
levels(car$aspiration) <- c(1,0)
car$aspiration <- as.numeric(levels(car$aspiration))[car$aspiration]


array_name = sqldf("select DISTINCT doornumber from car")
View(array_name)
car$doornumber <- as.factor(car$doornumber)
levels(car$doornumber) <- c(1,0)
car$doornumber <- as.numeric(levels(car$doornumber))[car$doornumber]
summary(car$doornumber)

array_name = sqldf("select DISTINCT carbody from car")
View(array_name)

# Number of levels is more than two, so dummy variables need to be assigned
# Create a matrix of dummy variables, convert to data.frame and cbind with the main data.frame
# This helps in significance analysis of all possible independent variables

dummy <- data.frame(model.matrix(~ factor(carbody), data=car))
View(dummy)
dummy <- dummy[,-1]
View(dummy)
car1 <- cbind(car[,-7], dummy)
summary(car1)
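
# Sanity check (sketch, not in the original script): model.matrix() with an
# intercept drops one reference level, so after removing the intercept column
# the number of dummy columns should be one less than the number of carbody levels.
length(unique(car$carbody)) - 1 == ncol(dummy)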

array_name = sqldf("select DISTINCT drivewheel from car")
View(array_name)
dummy1 <- data.frame(model.matrix(~ factor(drivewheel), data=car1))
dummy1 <- dummy1[,-1]
car2 <- cbind(car1[,-7], dummy1)

array_name = sqldf("select DISTINCT enginelocation from car")
View(array_name)
car2$enginelocation <- as.factor(car2$enginelocation)
levels(car2$enginelocation) <- c(1,0)
car2$enginelocation <- as.numeric(levels(car2$enginelocation))[car2$enginelocation]

array_name = sqldf("select DISTINCT enginetype from car2")
View(array_name)
dummy2 <- data.frame(model.matrix(~ factor(enginetype), data=car2))
dummy2 <- dummy2[,-1]
car3 <- cbind(car2[,-13], dummy2)

array_name = sqldf("select DISTINCT fuelsystem from car3")
View(array_name)
dummy3 <- data.frame(model.matrix(~ factor(fuelsystem), data=car3))
View(dummy3)
dummy3 <- dummy3[,-1]
car4 <- cbind(car3[,-15], dummy3)

# As per the problem statement, only the name of the car company is to be retained
# The car company name is separated out as 'brand'

car5 <- separate(car4, CarName, c("brand", "model"), sep = " ")
car5 <- car5[,-4]

# Spelling errors in brand names are rectified

car5$brand <- replace(car5$brand, car5$brand=="toyouta", "toyota")
car5$brand <- replace(car5$brand, car5$brand=="vokswagen", "volkswagen")
car5$brand <- replace(car5$brand, car5$brand=="vw", "volkswagen")
car5$brand <- replace(car5$brand, car5$brand=="Nissan", "nissan")
car5$brand <- replace(car5$brand, car5$brand=="porchsce", "porsche")
car5$brand <- replace(car5$brand, car5$brand=="maxda", "mazda")
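
# Quick check (sketch): confirm the brand names are now consistent before
# creating dummies.
sort(unique(car5$brand))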

# Dummy variables created for brand names to analyse significance

dummy4 <- data.frame(model.matrix(~ factor(brand), data=car5))
dummy4 <- dummy4[,-1]
car6 <- cbind(car5[,-3], dummy4)

# Dummy variables for 'symboling'

array_name = sqldf("select DISTINCT symboling from car6")
View(array_name)
dummy5 <- data.frame(model.matrix(~ factor(symboling), data=car6))
dummy5 <- dummy5[,-1]
car7 <- cbind(car6[,-2], dummy5)

# Dummy variables for cylindernumber

array_name = sqldf("select DISTINCT cylindernumber from car7")
View(array_name)
dummy6 <- data.frame(model.matrix(~ factor(cylindernumber), data=car7))
dummy6 <- dummy6[,-1]
car8 <- cbind(car7[, names(car7) != "cylindernumber"], dummy6)

# Derived metric: the average of city and highway mpg is taken as an independent variable

car8$avg.mpg <- (car8$citympg + car8$highwaympg)/2
View(car8)
car9 <- car8[, !(names(car8) %in% c("citympg", "highwaympg"))]

# carlength has been classified as "smallcar" (141-157), "midsize" (158-197) & "luxury" (above 197)
# Dummy variables set for the three segments

car9$carlength <- cut(car9$carlength, breaks = c(-Inf, 157, 197, Inf),
                      labels = c("smallcar", "midsize", "luxury"))
summary(car9$carlength)
dummy7 <- data.frame(model.matrix(~ carlength, data=car9))
dummy7 <- dummy7[,-1]
car10 <- cbind(car9[, names(car9) != "carlength"], dummy7)

# Variables with just '1' obs have been identified

sum(car9$factor.cylindernumber.twelve)
sum(car9$factor.cylindernumber.three)
sum(car9$factor.fuelsystem.spfi)
sum(car9$factor.fuelsystem.mfi)
sum(car9$factor.enginetype.dohcv)
sum(car9$factor.brand.renault)
sum(car9$factor.brand.mercury)

# Variables with just 1 obs have been discarded as they would be insignificant

car10 <- car10[,-c(22,31,34,43,45,51)]

# car_ID discarded as it does not add to price significance

car10 <- car10[,-1]

#### Checking for outliers in numerical variables ##############

ggplot(data=car10, aes(x = wheelbase)) + geom_histogram(col="red", fill="green", alpha=.2)

quantile(car10$wheelbase, probs = c(0.05,0.95))

quantile(car10$wheelbase, seq(0,1,0.01))

# Capping the outliers at 93.02 and 110.00, beyond which there are sharp changes

car10$wheelbase[which(car10$wheelbase>110.00)] <- 110.00

car10$wheelbase[which(car10$wheelbase<93.02)] <- 93.02
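
# A reusable helper could avoid repeating the capping logic for each variable
# below. Minimal sketch only: cap_at_quantiles() is not part of the original
# script and the default cut-offs are illustrative.
cap_at_quantiles <- function(x, lower = 0.01, upper = 0.99) {
  q <- quantile(x, probs = c(lower, upper))
  x[x < q[1]] <- q[1]   # floor values below the lower quantile
  x[x > q[2]] <- q[2]   # cap values above the upper quantile
  x
}
# Example: car10$wheelbase <- cap_at_quantiles(car10$wheelbase)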

# Outliers for enginesize

outlier <- boxplot.stats(car10$enginesize)$out

car10$enginesize[which(car10$enginesize>209.00)] <- 209.00

# Outliers for curbweight

quantile(car10$curbweight, seq(0,1,0.01))

# Sharp changes below the 1st and above the 98th percentile of curbweight are capped

car10$curbweight[which(car10$curbweight<1819.72)] <- 1819.72
car10$curbweight[which(car10$curbweight>3768.40)] <- 3768.40

# Outlier check for price

outlier <- boxplot.stats(car10$price)$out

ggplot(data=car10, aes(x = price)) + geom_histogram(col="red", fill="green", alpha=.2)


# Outlier check for horsepower

ggplot(data=car10, aes(x = horsepower)) + geom_histogram(col="red", fill="green", alpha=.2)

outlier <- boxplot.stats(car10$horsepower)$out

car10$horsepower[which(car10$horsepower>184.00)] <- 184.00

######################################### Linear Regression Modeling ####################################

# Set the seed to 100 for reproducibility
set.seed(100)

# Randomly generate row indices for the train dataset
trainindices = sample(1:nrow(car10), 0.7*nrow(car10))

# Generate the train dataset
train = car10[trainindices,]

# Store the rest of the observations in an object "test"
test = car10[-trainindices,]
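
# Sanity check (sketch): confirm the 70/30 split accounts for every row.
nrow(train)
nrow(test)
nrow(train) + nrow(test) == nrow(car10)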

# Run the first model with all variables on the training dataset

model.1 <- lm(price ~ ., data = train)
summary(model.1)

# Check correlation among all variables

corrs = cor(car10)
View(corrs)

# Run stepAIC (both directions) to examine which variables are insignificant and can be dropped

step <- stepAIC(model.1, direction="both")
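
# The stepAIC() result is itself a refitted lm object; its call and anova
# component (sketch) show the retained terms and the add/drop sequence.
step$call
step$anova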

# Based on inputs from the correlation matrix and stepAIC, insignificant variables removed
# model.2 cleaned of carheight, doornumber, compressionratio, drivewheel, cylindernumber...
# Checked for p-values above 0.35 and VIFs in double digits

model.2 <- lm(formula = price ~ fueltype + aspiration + enginelocation + carwidth + curbweight +
              enginesize + boreratio + stroke + horsepower + peakrpm +
              factor.carbody.hardtop + factor.carbody.hatchback + factor.carbody.sedan + factor.carbody.wagon +
              factor.enginetype.rotor + factor.brand.audi + factor.brand.bmw + factor.brand.buick + factor.brand.chevrolet +
              factor.brand.jaguar + factor.brand.peugeot + factor.brand.plymouth + factor.brand.porsche +
              factor.brand.saab + factor.brand.toyota + factor.symboling..1 +
              factor.cylindernumber.five + factor.cylindernumber.twelve +
              avg.mpg + carlengthmidsize + carlengthluxury, data = train)
summary(model.2)

# model.2 has a high adjusted R-squared but still contains insignificant variables with p-value > 0.35

model_3 <- lm(formula = price ~ aspiration + enginelocation + carwidth + curbweight + enginesize + boreratio + stroke +
              horsepower + peakrpm + factor.carbody.hardtop + factor.carbody.hatchback + factor.carbody.sedan +
              factor.carbody.wagon + factor.enginetype.rotor + factor.brand.audi + factor.brand.bmw + factor.brand.buick +
              factor.brand.chevrolet + factor.brand.jaguar + factor.brand.peugeot + factor.brand.porsche + factor.brand.saab +
              factor.symboling..1 + factor.cylindernumber.five + avg.mpg + carlengthmidsize + carlengthluxury, data = train)
summary(model_3)
vif(model_3)

# model_3 has variables with VIF > 6 and high p-values
# Correlation check between the technically related variables curbweight and enginesize

cor(car10$curbweight, car10$enginesize)

# At 0.85 correlation, curbweight, which has the higher VIF and p-value, has to go

model.4 <- lm(formula = price ~ aspiration + enginelocation + carwidth + enginesize + boreratio + stroke + peakrpm +
              factor.carbody.hardtop + factor.carbody.hatchback + factor.carbody.sedan + factor.carbody.wagon +
              factor.enginetype.rotor + factor.brand.audi + factor.brand.bmw + factor.brand.buick + factor.brand.chevrolet +
              factor.brand.jaguar + factor.brand.peugeot + factor.brand.porsche + factor.brand.saab + factor.symboling..1 +
              factor.cylindernumber.five + avg.mpg + carlengthmidsize + carlengthluxury, data = train)
summary(model.4)
vif(model.4)

# carwidth, enginesize and cylindernumber removed
# avg.mpg, though seemingly insignificant, is retained based on domain knowledge

model.5 <- lm(formula = price ~ aspiration + enginelocation + boreratio +
              stroke + peakrpm + factor.carbody.hardtop + factor.carbody.hatchback +
              factor.carbody.sedan + factor.enginetype.rotor + factor.brand.audi +
              factor.brand.bmw + factor.brand.buick + factor.brand.chevrolet +
              factor.brand.jaguar + factor.brand.porsche + factor.brand.saab +
              factor.symboling..1 + avg.mpg + carlengthmidsize, data = train)

summary(model.5)
vif(model.5)

# Car brands, aspiration, cylindernumber, stroke, bore, rpm, carbody types, carwidth and symboling removed for high p-values
# R-squared decreased
# enginesize re-introduced

model.10 <- lm(formula = price ~ enginesize + enginelocation + factor.brand.bmw + factor.brand.buick + factor.brand.jaguar +
               avg.mpg + carlengthluxury, data = train)
summary(model.10)
vif(model.10)
cor(car10$factor.brand.jaguar, car10$carlengthluxury)

# Due to the correlation, carlengthluxury removed; carwidth re-introduced to check R-squared

model.11 <- lm(formula = price ~ carwidth + enginesize + enginelocation + factor.brand.bmw + factor.brand.buick +
               factor.brand.jaguar + avg.mpg, data = train)
summary(model.11)
vif(model.11)

cor(car10$carwidth, car10$enginesize)
cor(car10$factor.brand.bmw, car10$factor.brand.buick)

# R-squared improved; correlation checked between brands and between carwidth/enginesize

model.12 <- lm(formula = price ~ carwidth + enginelocation + factor.brand.bmw + factor.brand.buick +
               factor.brand.jaguar + avg.mpg, data = train)
summary(model.12)
vif(model.12)

# model.12 with adjusted R-squared 0.9055 and all p-values *** seems good
# model.11 with adjusted R-squared 0.9243 and good p-values is the FINAL model

# Prediction with model.11 on the test data

Predict_1 <- predict(model.11, newdata = test)
test$test_price <- Predict_1
r <- cor(test$price, test$test_price)
rsquared <- cor(test$price, test$test_price)^2
rsquared

# Checking test data with model.12

Predict_4 <- predict(model.12, newdata = test)
test$test_price <- Predict_4
r <- cor(test$price, test$test_price)
rsquared <- cor(test$price, test$test_price)^2
rsquared
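
# Additional check (sketch, not part of the original evaluation): RMSE on the
# test set gives an error measure in price units for both candidate models.
rmse_11 <- sqrt(mean((test$price - predict(model.11, newdata = test))^2))
rmse_12 <- sqrt(mean((test$price - predict(model.12, newdata = test))^2))
rmse_11
rmse_12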

# rsquared for model.11 on the test data is a decent 0.8555
# Geely can base their car pricing on the significant specifications avg.mpg, carwidth and enginesize
# Geely may look closely at the car makers BMW, Buick and Jaguar