Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #Data visualization
- cars <- read.table("Cars.csv", header = TRUE, sep=";")
- plot(cars$cena~cars$rok_vyroby)
- plot(cars$objem~cars$vykon)
- boxplot(cars$cena~cars$vyrobce, las = 2)
- table(cars$vyrobce)
- vyrobcee <- c(mdt2$vyrobce)
- plot(mdt$rating~mdt$vyrobce, las = 2)
- plot(mdt$rating~mdt$model, las = 2)
- plot(mdt$rating~mdt$rok_vyroby)
- plot(mdt$rating~mdt$najete_km)
- plot(mdt$rating~mdt$objem)
- plot(mdt$rating~mdt$vykon)
- plot(mdt$rating~mdt$spotreba)
- plot(mdt$rating~mdt$cena)
- abline(lm(mdt$rating~mdt$vykon))
- #
- dt1 <- c(0,1,2,3,4,5,6,7,8,9)
- dt2 <- c(0,1,2,3,3.4,3.9,5,4.5,3,2)
- df <- data.frame(dt1, dt2)
- #data discretization
- library(arules)
- discretize(dt1)
- discretize(dt1, method="frequency", categories=4)
- #various regression methods
- library(earth)
- mod <- earth(dt2~dt1, df)
- mod2 <- lm(dt2~dt1, df)
- summary(mod)
- summary(mod2)
- plot(c(0,9), c(0,5), type = "n")
- plotmo(mod)
- points(dt1, dt2, type = "p")
- lines(dt1,dt2 ,col="green")
- plot(c(0,9), c(0,5), type = "n")
- abline(mod2)
- points(dt1, dt2, type = "p")
- lines(dt1,dt2 ,col="green")
- peak <- which.max(df$dt2)
- lm1 <- lm(dt2~dt1, df[0:peak,])
- lm2 <- lm(dt2~dt1, df[peak:nrow(df),])
- plot(c(0,9), c(0,5), type = "n")
- abline(lm1)
- abline(lm2)
- points(dt1, dt2, type = "p")
- peak <- which.max(df$dt2)
- plot(c(0,9), c(0,5), type = "n")
- points(c(df$dt1[1], df$dt1[peak], df$dt1[nrow(df)]) , c(df$dt2[1], df$dt2[peak], df$dt2[nrow(df)]), type = "l")
- points(dt1, dt2, type = "p")
- # error on the train set
- mae(predict(mod, newdata = df), dt2)
- mae(predict(mod2, newdata = df), dt2)
- mod2 <- lm(dt2~dt1, df)
- predict(mod2, data.frame(dt1 = c(1.5, 5, 8)))
- model <- lm(dt2 ~ dt1, data=df)
- new.df <- data.frame(dt1=c(7, 8, 4))
- wtk <- predict(model, new.df)
- dfd <- data.frame(new.df, wtf)
- myVar = 6;
- if(myVar<peak){
- predict(lm1, newdata = data.frame(dt1=c(myVar)))
- } else {
- predict(lm2, newdata = data.frame(dt1=c(myVar)))
- }
- dt3 <- c(
- predict(lm1, newdata = df[0:peak,]),
- predict(lm2, newdata = df[(peak+1):nrow(df),])
- )
- mae(dt3, dt2)
- rmse(dt3, dt2)
- dtt1 <- c(7,4,6,9)
- dtt2 <- c(1,3,5,7)
- dft <- data.frame(dtt1, dtt2)
- m <- cbind(predict(mod2, newdata = dft), dtt2)
- cor(m, method="kendall", use="pairwise")
- # error on the test set (your task) is more important due to overfitting
- ############################### HOMEWORK #############################################
- library(Metrics)
- FINAL_RESULTS <- data.frame(uid = integer(0), oid= integer(0), estimated_rating = integer(0))
- for (i in 1:100) {
- train <- read.table("Train.csv", header = TRUE, sep=";")
- train <- train[which(train$uid==i),]
- test <- read.table("Test.csv", header = TRUE, sep=";")
- test <- test[which(test$uid==i),]
- mdt <- merge(cars, train, by.x="id", by.y="oid")
- mdt2 <- merge(cars, test, by.x="id", by.y="oid")
- ######## 1) REGRESSIONS FOR NUMERICAL :
- #1 Vykon
- vdt1 <- c(mdt$vykon)
- vdt2 <- c(mdt$rating)
- vdf <- data.frame(vdt1, vdt2)
- vdf <- vdf[order(vdt1),]
- vpeak <- which.max(vdf$vdt2)
- vlm1 <- lm(vdt2~vdt1, vdf[which(vdf$vdt1 <= vdf$vdt1[vpeak]),])
- vlm2 <- lm(vdt2~vdt1, vdf[which(vdf$vdt1 >= vdf$vdt1[vpeak]),])
- plot(mdt$rating~mdt$vykon)
- if(is.na(vlm2$coefficients[2])){ #HERE!!!
- vlm2=vlm1
- }
- if(is.na(vlm1$coefficients[2])){
- vlm1=vlm2
- }
- abline(vlm1)
- abline(vlm2)
- points(vdt1, vdt2, type = "p")
- # vpeak <- which.max(vdf$vdt2)
- # plot(mdt$rating~mdt$vykon)
- # points(c(vdf$vdt1[1], vdf$vdt1[vpeak], vdf$vdt1[nrow(vdf)]) , c(vdf$vdt2[1], vdf$vdt2[vpeak], vdf$vdt2[nrow(vdf)]), type = "l")
- # points(vdt1, vdt2, type = "p")
- #2 Rok wyroby
- rwdt1 <- c(mdt$rok_vyroby)
- rwdt2 <- c(mdt$rating)
- rwdf <- data.frame(rwdt1, rwdt2)
- rwdf <- rwdf[order(rwdt1),]
- rwpeak <- which.max(rwdf$rwdt2)
- rwlm1 <- lm(rwdt2~rwdt1, rwdf[which(rwdf$rwdt1 <= rwdf$rwdt1[rwpeak]),])
- rwlm2 <- lm(rwdt2~rwdt1, rwdf[which(rwdf$rwdt1 >= rwdf$rwdt1[rwpeak]),])
- plot(mdt$rating~mdt$rok_vyroby)
- if(is.na(rwlm2$coefficients[2])){
- rwlm2=rwlm1
- }
- if(is.na(rwlm1$coefficients[2])){
- rwlm1=rwlm2
- }
- abline(rwlm1)
- abline(rwlm2)
- points(rwdt1, rwdt2, type = "p")
- # rwpeak <- which.max(rwdf$rwdt2)
- # plot(mdt$rating~mdt$rok_vyroby)
- # points(c(rwdf$rwdt1[1], rwdf$rwdt1[rwpeak], rwdf$rwdt1[nrow(rwdf)]) , c(rwdf$rwdt2[1], rwdf$rwdt2[rwpeak], rwdf$rwdt2[nrow(rwdf)]), type = "l")
- # points(rwdt1, rwdt2, type = "p")
- #3 Najete km
- ndt1 <- c(mdt$najete_km)
- ndt2 <- c(mdt$rating)
- ndf <- data.frame(ndt1, ndt2)
- ndf <- ndf[order(ndt1),]
- npeak <- which.max(ndf$ndt2)
- nlm1 <- lm(ndt2~ndt1, ndf[which(ndf$ndt1 <= ndf$ndt1[npeak]),])
- nlm2 <- lm(ndt2~ndt1, ndf[which(ndf$ndt1 >= ndf$ndt1[npeak]),])
- plot(mdt$rating~mdt$najete_km)
- if(is.na(nlm2$coefficients[2])){
- nlm2=nlm1
- }
- if(is.na(nlm1$coefficients[2])){
- nlm1=nlm2
- }
- abline(nlm1)
- abline(nlm2)
- points(ndt1, ndt2, type = "p")
- # npeak <- which.max(ndf$ndt2)
- # plot(mdt$rating~mdt$najete_km)
- # points(c(ndf$ndt1[1], ndf$ndt1[npeak], ndf$ndt1[nrow(ndf)]) , c(ndf$ndt2[1], ndf$ndt2[npeak], ndf$ndt2[nrow(ndf)]), type = "l")
- # points(ndt1, ndt2, type = "p")
- #4 Objem
- odt1 <- c(mdt$objem)
- odt2 <- c(mdt$rating)
- odf <- data.frame(odt1, odt2)
- odf <- odf[order(odt1),]
- opeak <- which.max(odf$odt2)
- olm1 <- lm(odt2~odt1, odf[which(odf$odt1 <= odf$odt1[opeak]),])
- olm2 <- lm(odt2~odt1, odf[which(odf$odt1 >= odf$odt1[opeak]),])
- plot(mdt$rating~mdt$objem)
- if(is.na(olm2$coefficients[2])){
- olm2=olm1
- }
- if(is.na(olm1$coefficients[2])){
- olm1=olm2
- }
- abline(olm1)
- abline(olm2)
- points(odt1, odt2, type = "p")
- # opeak <- which.max(odf$odt2)
- # plot(mdt$rating~mdt$objem)
- # points(c(odf$odt1[1], odf$odt1[opeak], odf$odt1[nrow(odf)]) , c(odf$odt2[1], odf$odt2[opeak], odf$odt2[nrow(odf)]), type = "l")
- # points(odt1, odt2, type = "p")
- #5 Spotreba
- sdt1 <- c(mdt$spotreba)
- sdt2 <- c(mdt$rating)
- sdf <- data.frame(sdt1, sdt2)
- sdf <- sdf[order(sdt1),]
- speak <- which.max(sdf$sdt2)
- slm1 <- lm(sdt2~sdt1, sdf[which(sdf$sdt1 <= sdf$sdt1[speak]),])
- slm2 <- lm(sdt2~sdt1, sdf[which(sdf$sdt1 >= sdf$sdt1[speak]),])
- plot(mdt$rating~mdt$spotreba)
- if(is.na(slm2$coefficients[2])){
- slm2=slm1
- }
- if(is.na(slm1$coefficients[2])){
- slm1=slm2
- }
- abline(slm1)
- abline(slm2)
- points(sdt1, sdt2, type = "p")
- # speak <- which.max(sdf$sdt2)
- # plot(mdt$rating~mdt$spotreba)
- # points(c(sdf$sdt1[1], sdf$sdt1[speak], sdf$sdt1[nrow(sdf)]) , c(sdf$sdt2[1], sdf$sdt2[speak], sdf$sdt2[nrow(sdf)]), type = "l")
- # points(sdt1, sdt2, type = "p")
- #6 Cena
- cdt1 <- c(mdt$cena)
- cdt2 <- c(mdt$rating)
- cdf <- data.frame(cdt1, cdt2)
- cdf <- cdf[order(cdt1),]
- cpeak <- which.max(cdf$cdt2)
- clm1 <- lm(cdt2~cdt1, cdf[which(cdf$cdt1 <= cdf$cdt1[cpeak]),])
- clm2 <- lm(cdt2~cdt1, cdf[which(cdf$cdt1 >= cdf$cdt1[cpeak]),])
- plot(mdt$rating~mdt$cena)
- if(is.na(clm2$coefficients[2])){
- clm2=clm1
- }
- if(is.na(clm1$coefficients[2])){
- clm1=clm2
- }
- abline(clm1)
- abline(clm2)
- points(cdt1, cdt2, type = "p")
- # cpeak <- which.max(cdf$cdt2)
- # plot(mdt$rating~mdt$cena)
- # points(c(cdf$cdt1[1], cdf$cdt1[cpeak], cdf$cdt1[nrow(cdf)]) , c(cdf$cdt2[1], cdf$cdt2[cpeak], cdf$cdt2[nrow(cdf)]), type = "l")
- # points(cdt1, cdt2, type = "p")
- ########### 2) AVERAGES FOR NOMINAL:
- vqdt2 <- c(mdt$rating)
- vqdt3 <- c(mdt$model)
- vqdt4 <- c(mdt$vyrobce)
- vqdf2 <- data.frame(vqdt3, vqdt2)
- vqdf3 <- data.frame(vqdt4, vqdt2)
- #1 Model
- agrmod <- aggregate(.~mdt$model, data=vqdf2, mean)
- #2 Vyrobce
- agrvyr <- aggregate(.~mdt$vyrobce, data=vqdf3, mean)
- ######### 3) ERRORS + WEIGHTS ON NUMERICAL
- #Vykon peak:
- vdt3 <- c(
- predict(vlm1, newdata = vdf[which(vdf$vdt1 < vdf$vdt1[vpeak]),]),
- predict(vlm2, newdata = vdf[which(vdf$vdt1 >= vdf$vdt1[vpeak]),])
- )
- mae(vdt3, vdt2)
- vweight <- 1/rmse(vdt3, vdt2)
- #Rok wyroby peak:
- rwdt3 <- c(
- predict(rwlm1, newdata = rwdf[which(rwdf$rwdt1 < rwdf$rwdt1[rwpeak]),]),
- predict(rwlm2, newdata = rwdf[which(rwdf$rwdt1 >= rwdf$rwdt1[rwpeak]),])
- )
- mae(rwdt3, rwdt2)
- rwweight <- 1/rmse(rwdt3, rwdt2)
- #objem peak:
- odt3 <- c(
- predict(olm1, newdata = odf[which(odf$odt1 < odf$odt1[opeak]),]),
- predict(olm2, newdata = odf[which(odf$odt1 >= odf$odt1[opeak]),])
- )
- mae(odt3, odt2)
- oweight <- 1/rmse(odt3, odt2)
- #Cena peak:
- cdt3 <- c(
- predict(clm1, newdata = cdf[which(cdf$cdt1 < cdf$cdt1[cpeak]),]),
- predict(clm2, newdata = cdf[which(cdf$cdt1 >= cdf$cdt1[cpeak]),])
- )
- mae(cdt3, cdt2)
- cweight <- 1/rmse(cdt3, cdt2)
- #Spotreba peak
- sdt3 <- c(
- predict(slm1, newdata = sdf[which(sdf$sdt1 < sdf$sdt1[speak]),]),
- predict(slm2, newdata = sdf[which(sdf$sdt1 >= sdf$sdt1[speak]),])
- )
- mae(sdt3, sdt2)
- sweight <- 1/rmse(sdt3, sdt2)
- #najete km
- ndt3 <- c(
- predict(nlm1, newdata = ndf[which(ndf$ndt1 < ndf$ndt1[npeak]),]),
- predict(nlm2, newdata = ndf[which(ndf$ndt1 >= ndf$ndt1[npeak]),])
- )
- mae(ndt3, ndt2)
- nweight <- 1/rmse(ndt3, ndt2)
- ##### 4) ERRORS + WEIGHTS ON NOMINAL
- #weight of vyrobice:
- barbar <- merge(mdt, agrvyr, by.x="vyrobce", by.y="mdt$vyrobce")
- vyweight <- 1/rmse(barbar$rating,barbar$vqdt2)
- #weight of model:
- birbir <- merge(mdt, agrmod, by.x="model", by.y="mdt$model")
- mweight <- 1/rmse(birbir$rating,birbir$vqdt2)
- ###### 5) PREDICTING NUMERICAL ATRIBUTES VALUES
- tvykon <- c(mdt2$vykon)
- tvdf <- data.frame(tvykon,c(1:49))
- colnames(tvdf) <- colnames(vdf)
- tvdt3 <- c(
- predict(vlm1, newdata = tvdf[which(tvdf$vdt1 < vdf$vdt1[vpeak]),]),
- predict(vlm2, newdata = tvdf[which(tvdf$vdt1 >= vdf$vdt1[vpeak]),])
- )
- tcena <- c(mdt2$cena)
- tcdf <- data.frame(tcena,c(1:49))
- colnames(tcdf) <- colnames(cdf)
- tcdt3 <- c(
- predict(clm1, newdata = tcdf[which(tcdf$cdt1 < cdf$cdt1[cpeak]),]),
- predict(clm2, newdata = tcdf[which(tcdf$cdt1 >= cdf$cdt1[cpeak]),])
- )
- tobjem <- c(mdt2$objem)
- todf <- data.frame(tobjem, c(1:49))
- colnames(todf) <- colnames(odf)
- todt3 <- c(
- predict(olm1, newdata = todf[which(todf$odt1 < odf$odt1[opeak]),]),
- predict(olm2, newdata = todf[which(todf$odt1 >= odf$odt1[opeak]),])
- )
- trokvyroby <- c(mdt2$rok_vyroby)
- trwdf <- data.frame(trokvyroby, c(1:49))
- colnames(trwdf) <- colnames(rwdf)
- trwdt3 <- c(
- predict(rwlm1, newdata = trwdf[which(trwdf$rwdt1 < rwdf$rwdt1[rwpeak]),]),
- predict(rwlm2, newdata = trwdf[which(trwdf$rwdt1 >= rwdf$rwdt1[rwpeak]),])
- )
- tnajeteKm <- c(mdt2$najete_km)
- tndf <- data.frame(tnajeteKm, c(1:49))
- colnames(tndf) <- colnames(ndf)
- tndt3 <- c(
- predict(nlm1, newdata = tndf[which(tndf$ndt1 < ndf$ndt1[npeak]),]),
- predict(nlm2, newdata = tndf[which(tndf$ndt1 >= ndf$ndt1[npeak]),])
- )
- tspotreba <- c(mdt2$spotreba)
- tsdf <- data.frame(tspotreba, c(1:49))
- colnames(tsdf) <- colnames(sdf)
- tsdt3 <- c(
- predict(slm1, newdata = tsdf[which(tsdf$sdt1 < sdf$sdt1[speak]),]),
- predict(slm2, newdata = tsdf[which(tsdf$sdt1 >= sdf$sdt1[speak]),])
- )
- ####### 6) PREDICTING NOMINAL ATRIBUTES VALUES
- #(replacing NA's with averages)
- tvyrobce <- mdt2$vyrobce
- byrbyr <- merge(mdt2, agrvyr, by.x="vyrobce", by.y="mdt$vyrobce")
- tvydf <- c(byrbyr$vqdt2)
- meanTvydf <- mean(tvydf)
- byrbyr <- merge(mdt2, agrvyr, by.x="vyrobce", by.y="mdt$vyrobce", all.x = TRUE)
- tvydf <- c(byrbyr$vqdt2)
- tvydf[is.na(tvydf)] <- meanTvydf
- tmodel <- mdt2$model
- byrbyr2 <- merge(mdt2, agrmod, by.x="model", by.y="mdt$model")
- tmdf <- c(byrbyr2$vqdt2)
- meanTmf <- mean(tmdf)
- byrbyr2 <- merge(mdt2, agrmod, by.x="model", by.y="mdt$model", all.x = TRUE)
- tmdf <- c(byrbyr2$vqdt2)
- tmdf[is.na(tmdf)] <- meanTmf
- ####### 7) AGGREGATING WEIGHTED VALUES TO PREDICT RATINGS ON TEST SET
- #without weights:
- rsNoWeights <- (tvdt3 + tcdt3 + todt3+ tsdt3 + tndt3 + trwdt3 + tvydf +tmdf)/8
- weightsum <- vweight+cweight+oweight+sweight+nweight+rwweight+vyweight+mweight
- results <- (tvdt3 *vweight + tcdt3 *cweight + todt3*oweight + tsdt3*sweight + tndt3*nweight + trwdt3*rwweight + tvydf*vyweight +tmdf*mweight)/weightsum
- RESULT_SET <- data.frame(i,mdt2$id,results)
- FINAL_RESULTS <- rbind(FINAL_RESULTS,RESULT_SET)
- }
- colnames(FINAL_RESULTS)[1] <- "uid"
- colnames(FINAL_RESULTS)[2] <- "oid"
- colnames(FINAL_RESULTS)[3] <- "estimated_rating"
- write.table(FINAL_RESULTS, file = "Results.csv", row.names = FALSE)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement