Advertisement
Guest User

Untitled

a guest
Apr 29th, 2017
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 11.98 KB | None | 0 0
  # Data visualization ----
  # Load the car catalogue (semicolon-separated, with a header row) and look at
  # a few relationships between its columns.
  cars <- read.table("Cars.csv", header = TRUE, sep = ";")
  plot(cars$cena ~ cars$rok_vyroby)
  plot(cars$objem ~ cars$vykon)
  boxplot(cars$cena ~ cars$vyrobce, las = 2)
  table(cars$vyrobce)

  # `mdt` and `mdt2` (cars merged with one user's train / test ratings) are
  # only created inside the homework loop further down. On a fresh session
  # they do not exist yet, so skip these plots instead of aborting the script.
  if (exists("mdt") && exists("mdt2")) {
    vyrobcee <- c(mdt2$vyrobce)

    plot(mdt$rating ~ mdt$vyrobce, las = 2)
    plot(mdt$rating ~ mdt$model, las = 2)
    plot(mdt$rating ~ mdt$rok_vyroby)
    plot(mdt$rating ~ mdt$najete_km)
    plot(mdt$rating ~ mdt$objem)

    plot(mdt$rating ~ mdt$vykon)
    # abline() draws on the *current* plot, so the rating ~ vykon regression
    # line has to be added right after the rating ~ vykon scatter plot.
    abline(lm(mdt$rating ~ mdt$vykon))

    plot(mdt$rating ~ mdt$spotreba)
    plot(mdt$rating ~ mdt$cena)
  } else {
    message("`mdt` / `mdt2` not available yet; run the homework loop first.")
  }
  23.  
  # Toy data set: dt2 rises with dt1 up to a peak and then falls off again.
  dt1 <- c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
  dt2 <- c(0, 1, 2, 3, 3.4, 3.9, 5, 4.5, 3, 2)
  df <- data.frame(dt1, dt2)

  # Data discretization ----
  library(arules)
  discretize(dt1)
  # Four equal-frequency bins. `categories` is deprecated in current arules
  # releases; `breaks` is the supported name for the number of bins.
  discretize(dt1, method = "frequency", breaks = 4)
  33.  
  # Various regression methods on the toy data ----
  library(earth)

  # MARS fit (earth) versus an ordinary least-squares line.
  mod <- earth(dt2 ~ dt1, data = df)
  mod2 <- lm(dt2 ~ dt1, data = df)

  summary(mod)
  summary(mod2)

  # Opens an empty plotting region spanning x in [0, 9] and y in [0, 5].
  empty_canvas <- function() {
    plot(c(0, 9), c(0, 5), type = "n")
  }

  # MARS model, with the raw observations and their polyline on top.
  empty_canvas()
  plotmo(mod)
  points(dt1, dt2, type = "p")
  lines(dt1, dt2, col = "green")

  # Straight-line model, with the same overlay.
  empty_canvas()
  abline(mod2)
  points(dt1, dt2, type = "p")
  lines(dt1, dt2, col = "green")

  # Two separate lines: one fitted on the rows up to the maximum of dt2, one
  # on the rows from the maximum onwards (the peak row is part of both fits).
  peak <- which.max(df$dt2)
  lm1 <- lm(dt2 ~ dt1, data = df[seq_len(peak), ])
  lm2 <- lm(dt2 ~ dt1, data = df[seq(peak, nrow(df)), ])
  empty_canvas()
  abline(lm1)
  abline(lm2)
  points(dt1, dt2, type = "p")

  # Simplest variant: connect first point, peak and last point directly.
  peak <- which.max(df$dt2)
  knots <- c(1, peak, nrow(df))
  empty_canvas()
  lines(df$dt1[knots], df$dt2[knots])
  points(dt1, dt2, type = "p")
  66.  
  # Error on the train set ----
  # mae() and rmse() come from Metrics; attach it here because this section
  # runs before the library() call in the homework part of the script.
  library(Metrics)

  mae(predict(mod, newdata = df), dt2)
  mae(predict(mod2, newdata = df), dt2)

  mod2 <- lm(dt2 ~ dt1, data = df)

  predict(mod2, data.frame(dt1 = c(1.5, 5, 8)))

  model <- lm(dt2 ~ dt1, data = df)
  new.df <- data.frame(dt1 = c(7, 8, 4))
  wtk <- predict(model, new.df)
  # Pair each new dt1 value with its prediction (the original referenced an
  # undefined `wtf` here instead of `wtk`).
  dfd <- data.frame(new.df, wtk)

  # Predict a single value with the two-segment model. `peak` is a row index,
  # so the split has to be made on the dt1 value stored in that row. `<=`
  # sends the peak itself to lm1, matching how dt3 is assembled below.
  myVar <- 6
  if (myVar <= df$dt1[peak]) {
    predict(lm1, newdata = data.frame(dt1 = myVar))
  } else {
    predict(lm2, newdata = data.frame(dt1 = myVar))
  }

  # Two-segment predictions for every training row: lm1 up to and including
  # the peak row, lm2 for the rows after it. Negative indexing stays correct
  # when the peak is the last row, where (peak + 1):nrow(df) would count
  # backwards.
  dt3 <- c(
    predict(lm1, newdata = df[seq_len(peak), ]),
    predict(lm2, newdata = df[-seq_len(peak), ])
  )
  mae(dt3, dt2)

  rmse(dt3, dt2)

  dtt1 <- c(7, 4, 6, 9)
  dtt2 <- c(1, 3, 5, 7)
  dft <- data.frame(dtt1, dtt2)

  # mod2 was fitted on a column called dt1, so the new inputs must be passed
  # under that name. Otherwise predict() silently falls back to the global
  # dt1 vector and returns ten predictions instead of four.
  m <- cbind(predict(mod2, newdata = data.frame(dt1 = dft$dtt1)), dtt2)
  cor(m, method = "kendall", use = "pairwise")
  102.  
  103.  
  104.  
  105.  
  106. # error on the test set (your task) is more important due to overfitting
  107.  
  108. ###############################   HOMEWORK   #############################################
  109. library(Metrics)
  110.  
  111.  
  # Homework: per-user rating prediction ----
  # For every user, a rating is predicted from each car attribute on its own
  # and the per-attribute predictions are blended with weights 1 / RMSE, where
  # the RMSE is measured on that user's training ratings. The blended
  # predictions for all users are written to Results.csv.
  #
  # Numeric attributes use a two-segment linear model split at the attribute
  # value of the best-rated training car; nominal attributes use the mean
  # training rating of each level.
  numeric_attrs <- c(
    "vykon", "rok_vyroby", "najete_km", "objem", "spotreba", "cena"
  )
  nominal_attrs <- c("vyrobce", "model")

  # Fit the two-segment model of rating `y` against numeric attribute `x`.
  # Returns a list with the split value `peak_x` and the `left` / `right` fits
  # (rows with x <= peak_x and x >= peak_x respectively).
  fit_peak_model <- function(x, y) {
    sorted <- data.frame(x = x, y = y)[order(x), ]
    peak_x <- sorted$x[which.max(sorted$y)]
    left <- lm(y ~ x, data = sorted[which(sorted$x <= peak_x), ])
    right <- lm(y ~ x, data = sorted[which(sorted$x >= peak_x), ])
    # A segment whose x values are all identical has no slope (NA); borrow
    # the other segment's fit in that case.
    if (is.na(coef(right)[2])) {
      right <- left
    }
    if (is.na(coef(left)[2])) {
      left <- right
    }
    list(peak_x = peak_x, left = left, right = right)
  }

  # Predict ratings for attribute values `x` with a model from
  # fit_peak_model(). The result has one element per element of `x`, in the
  # same order, so it stays aligned with the rows the values came from.
  predict_peak_model <- function(model, x) {
    new_data <- data.frame(x = x)
    left <- predict(model$left, newdata = new_data)
    right <- predict(model$right, newdata = new_data)
    unname(ifelse(x < model$peak_x, left, right))
  }

  # Scatter plot of the training data with both segment lines drawn on top.
  plot_peak_model <- function(model, x, y, attr_name) {
    plot(x, y, xlab = attr_name, ylab = "rating")
    for (segment in list(model$left, model$right)) {
      # abline() cannot draw a fit with a missing coefficient.
      if (!anyNA(coef(segment))) {
        abline(segment)
      }
    }
    invisible(NULL)
  }

  # Mean training rating per level of a nominal attribute (named vector).
  level_means <- function(keys, y) {
    vapply(split(y, as.character(keys)), mean, numeric(1))
  }

  # Look up the level mean for each key; levels not seen in training give NA.
  lookup_level_means <- function(means, keys) {
    unname(means[as.character(keys)])
  }

  # Blending weight of one attribute: the inverse of its training RMSE. The
  # RMSE is floored so that a perfect training fit (e.g. every level seen
  # only once) yields a large finite weight instead of Inf, which would turn
  # the weighted average into NaN.
  inverse_rmse_weight <- function(actual, predicted) {
    1 / max(rmse(actual, predicted), sqrt(.Machine$double.eps))
  }

  # The rating files do not change between users, so read them only once.
  train_all <- read.table("Train.csv", header = TRUE, sep = ";")
  test_all <- read.table("Test.csv", header = TRUE, sep = ";")

  user_ids <- 1:100
  all_attrs <- c(numeric_attrs, nominal_attrs)
  per_user <- vector("list", length(user_ids))

  for (i in user_ids) {
    train <- train_all[which(train_all$uid == i), ]
    test <- test_all[which(test_all$uid == i), ]

    mdt <- merge(cars, train, by.x = "id", by.y = "oid")
    mdt2 <- merge(cars, test, by.x = "id", by.y = "oid")

    if (nrow(mdt) == 0 || nrow(mdt2) == 0) {
      message("Skipping user ", i, ": no training or no test rows.")
      next
    }

    # One column of test-set predictions per attribute; rows follow mdt2, so
    # every column is aligned with mdt2$id.
    predictions <- matrix(
      NA_real_,
      nrow = nrow(mdt2),
      ncol = length(all_attrs),
      dimnames = list(NULL, all_attrs)
    )
    weights <- setNames(numeric(length(all_attrs)), all_attrs)

    for (attr_name in numeric_attrs) {
      model <- fit_peak_model(mdt[[attr_name]], mdt$rating)
      plot_peak_model(model, mdt[[attr_name]], mdt$rating, attr_name)
      # Training predictions are produced in mdt row order, i.e. the same
      # order as mdt$rating, so the error is computed on matching pairs.
      weights[attr_name] <- inverse_rmse_weight(
        mdt$rating,
        predict_peak_model(model, mdt[[attr_name]])
      )
      predictions[, attr_name] <- predict_peak_model(model, mdt2[[attr_name]])
    }

    for (attr_name in nominal_attrs) {
      means <- level_means(mdt[[attr_name]], mdt$rating)
      weights[attr_name] <- inverse_rmse_weight(
        mdt$rating,
        lookup_level_means(means, mdt[[attr_name]])
      )
      estimate <- lookup_level_means(means, mdt2[[attr_name]])
      # Levels unseen in training get the average of the known estimates; if
      # no test level was seen at all, fall back to the mean training rating.
      fallback <- if (all(is.na(estimate))) {
        mean(mdt$rating)
      } else {
        mean(estimate, na.rm = TRUE)
      }
      estimate[is.na(estimate)] <- fallback
      predictions[, attr_name] <- estimate
    }

    # Unweighted baseline, kept for interactive comparison.
    rsNoWeights <- rowMeans(predictions)

    # Weighted average of the per-attribute predictions.
    results <- as.vector(predictions %*% weights) / sum(weights)

    per_user[[i]] <- data.frame(
      uid = i,
      oid = mdt2$id,
      estimated_rating = results
    )
  }

  # Combine all users in one step; skipped users are NULL and are ignored.
  empty_results <- data.frame(
    uid = integer(0),
    oid = integer(0),
    estimated_rating = numeric(0)
  )
  FINAL_RESULTS <- do.call(rbind, c(list(empty_results), per_user))

  write.table(FINAL_RESULTS, file = "Results.csv", row.names = FALSE)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement