Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(xgboost)
- library(BayesTree)
- library(mice)
- clean_data_impute = function(df) {
- preds_remove <- c("sale", "author", "price", "authorstyle",
- "count", "Surface_Rect", "Surface_Rnd",
- "diff_origin", "singlefig", "lot")
- preds_num <- c("position", "year", "logprice", "Height_in",
- "Width_in", "Diam_in", "Surface", "nfigures")
- preds_keep <- setdiff(names(df), preds_remove)
- preds_factors <- setdiff(preds_keep, preds_num)
- cleaned_df <- df %>%
- mutate(nfigures = nfigures + singlefig,
- Shape = dplyr::recode(Shape, "ovale" = "oval",
- "ronde" = "round"),
- type_intermed = replace(type_intermed,
- Interm == 0, "N"),
- material = replace(material, material %in%
- c("huile", "huile sur papier", "pastel", "rond"), "other"),
- material = replace(material, material %in%
- c("octogone", "tableau", "tableaux pendants"), "canvas"),
- Surface = replace(Surface, Surface == 0, NA),
- logSurface = log(Surface),
- Width_in = pmax(Width_in, Diam_in, na.rm = T),
- Height_in = pmax(Height_in, Diam_in, na.rm = T)) %>%
- mutate_all(funs(replace(., . == "n/a", NA))) %>%
- mutate_all(funs(replace(., . == "", NA))) %>%
- mutate_at(preds_factors, funs(replace(., is.na(.), "NA"))) %>%
- mutate_at(preds_factors, factor) %>%
- select(preds_keep, logSurface)
- impute_cols = c("Height_in", "Width_in", "Surface")
- imputed = mice(cleaned_df[,impute_cols], method = "pmm", maxit = 50, seed = 1)
- imputed = complete(imputed)
- cleaned_df[,impute_cols] = imputed
- cleaned_df$logSurface = log(cleaned_df$Surface)
- return(cleaned_df)
- }
- #### xgboost ######
- train = paint_train %>% select(-logprice) %>% data.matrix()
- train.y = paint_train %>% select(logprice) %>% pull()
- data.new <- paint_test %>% select(-logprice) %>% data.matrix()
- xgb.fit <- xgboost(data = dtrain,
- label = train.y,
- objective = "reg:linear",
- eval_metric = "rmse",
- max.depth = 10,
- eta = 0.3,
- nround = 100,
- verbose =0)
- xgb.pred <- exp(predict(xgb.fit, newdata = data.new))
- ###### BART ####
- paint_train <- clean_data_impute(paintings_train) %>% distinct()
- paint_test <- clean_data_impute(paintings_test)
- train <- paint_train %>% select(dealer, year, Interm, origin_cat, endbuyer,
- engraved, prevcoll, finished, lrgfont, discauth, logSurface,
- winningbiddertype, position)
- train.y <- paint_train$logprice
- test <- paint_test %>% select(dealer, year, Interm, origin_cat, endbuyer,
- engraved, prevcoll, finished, lrgfont, discauth, logSurface,
- winningbiddertype, position)
- lmbart <- bart(x.train=train,
- y.train=train.y,
- x.test=test)
- bart.pred <- colMeans(lmbart$yhat.test)
Add Comment
Please, Sign In to add comment