Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Packages. randomForest provides randomForest() and its predict() method,
# both used further down. MASS is attached as in the original script; no live
# statement in the visible code calls it -- NOTE(review): confirm whether it
# is still needed.
library(randomForest)
library(MASS)

# Fix the RNG seed so that repeated runs draw the same random numbers.
set.seed(123)
#' Most frequent non-missing value of a vector.
#'
#' Missing values are ignored when counting. If several values share the
#' highest count, the one that appears first in `x` is returned.
#'
#' @param x An atomic vector or factor.
#' @return A length-one vector of the same type as `x` (a factor keeps its
#'   levels). If `x` is empty or contains only `NA`, a single `NA` of that
#'   type is returned.
Mode <- function(x) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0L) {
    # Indexing with NA yields one missing value of the same type as `x`.
    return(x[NA_integer_])
  }
  ux <- unique(observed)
  # match() maps each value to its position in `ux`; tabulate() counts those
  # positions, and which.max() picks the first position with the top count.
  ux[which.max(tabulate(match(observed, ux)))]
}
# Read the training set, the test set and the submission template from the
# working directory.
#
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv() keeps
# strings as character by default, and the cleaning loops below only impute
# columns for which is.factor() is TRUE.
df <- read.csv("train.csv", stringsAsFactors = TRUE)
# Working copy that receives the cleaned training columns.
df1 <- df
df2 <- read.csv("test.csv", stringsAsFactors = TRUE)
# Submission template; its SalePrice column is overwritten with predictions.
tt <- read.csv("sample_submission.csv")
# Clean every column of the raw training frame `df` and store the result in
# the matching column of `df1`.
#
# Every numeric column is log-transformed, which includes SalePrice when it is
# numeric; the predictions are mapped back with exp(x) - 1 further down.
for (i in seq_along(df)) {
  col <- df[[i]]
  if (is.numeric(col)) {
    # Numeric: fill NAs with the mean of the observed values, then log(x + 1).
    col[is.na(col)] <- mean(col, na.rm = TRUE)
    col <- log(col + 1)
  } else if (is.factor(col)) {
    # Factor: fill NAs with the first value reported by Mode().
    col[is.na(col)] <- Mode(col)[1]
  }
  df1[[i]] <- col
}
# Number of distinct values in each cleaned training column. vapply() visits
# the columns one by one; apply() would first coerce the whole data frame to
# a single-type matrix.
vapply(df1, function(x) length(unique(x)), integer(1))
#df1 <- ifelse(length(unique(df1)) < 8, as.factor(df1), df1)
# The data viewer is only opened when someone is there to look at it.
if (interactive()) {
  View(df1)
}
summary(df1)
#apply(df, 2, function(x){sum(is.na(x))})
#df$PoolQC <- ifelse(is.na(df$PoolQC)==TRUE,mode(df$PoolQC), df$PoolQC)
#fit <- lm(SalePrice ~ ., df1, family = "lm")
#ifit <- step(fit, direction = "backward")
#df1$y <- predict(ifit, df1)
# Clean every column of the test frame `df2` in place, using the same rules
# as for the training data. The fill values (mean / most frequent level) are
# computed from the test column itself.
for (i in seq_along(df2)) {
  col <- df2[[i]]
  if (is.numeric(col)) {
    # Numeric: fill NAs with the mean of the observed values, then log(x + 1).
    col[is.na(col)] <- mean(col, na.rm = TRUE)
    col <- log(col + 1)
  } else if (is.factor(col)) {
    # Factor: fill NAs with the first value reported by Mode().
    col[is.na(col)] <- Mode(col)[1]
  }
  df2[[i]] <- col
}
#model3 <- randomForest(SalePrice~., df1)
#install.packages("ROSE")
#library(ROSE)
#roc.curve(df1$SalePric e, predict(model3, df2))

# Fit a random forest of SalePrice on all other columns of the cleaned
# training frame.
rf <- randomForest(SalePrice ~ ., df1, mtry = 9, ntree = 500)

# levels() of a data frame is NULL, so length(levels(df1)) always printed 0.
# Presumably the aim was to look at the factor levels; report the number of
# levels of each factor column instead.
vapply(Filter(is.factor, df1), nlevels, integer(1))

# Remember the sizes before stacking so the test rows can be located without
# hard-coded row numbers.
n_train <- nrow(df1)
n_test <- nrow(df2)

# Give the test frame a placeholder response so both frames have the same
# columns, then stack them. NOTE(review): presumably done so that the test
# factors carry the training factor levels before predict() -- confirm.
df2$SalePrice <- 0
df1 <- rbind(df1, df2)

test_rows <- n_train + seq_len(n_test)
predictor_cols <- setdiff(names(df1), "SalePrice")
tt$SalePrice <- predict(rf, df1[test_rows, predictor_cols])
#tt$SalePrice <- predict(ifit, df2)
#df2$SalePrice <- predict(ifit, df2)
#tt$SalePrice <- df2$SalePrice

# Undo the log(x + 1) transform applied to numeric columns during cleaning.
tt$SalePrice <- exp(tt$SalePrice) - 1

# Write the submission file, then read it back and print it as a check.
write.csv(tt, file = "sm.csv", row.names = FALSE)
q <- read.csv("sm.csv")
q
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement