Guest User

Untitled

a guest
Dec 17th, 2017
117
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.05 KB | None | 0 0
  1. library(xgboost)
  2. library(BayesTree)
  3. library(mice)
  4.  
  5. clean_data_impute = function(df) {
  6.  
  7. preds_remove <- c("sale", "author", "price", "authorstyle",
  8. "count", "Surface_Rect", "Surface_Rnd",
  9. "diff_origin", "singlefig", "lot")
  10. preds_num <- c("position", "year", "logprice", "Height_in",
  11. "Width_in", "Diam_in", "Surface", "nfigures")
  12. preds_keep <- setdiff(names(df), preds_remove)
  13. preds_factors <- setdiff(preds_keep, preds_num)
  14.  
  15. cleaned_df <- df %>%
  16. mutate(nfigures = nfigures + singlefig,
  17. Shape = dplyr::recode(Shape, "ovale" = "oval",
  18. "ronde" = "round"),
  19. type_intermed = replace(type_intermed,
  20. Interm == 0, "N"),
  21. material = replace(material, material %in%
  22. c("huile", "huile sur papier", "pastel", "rond"), "other"),
  23. material = replace(material, material %in%
  24. c("octogone", "tableau", "tableaux pendants"), "canvas"),
  25. Surface = replace(Surface, Surface == 0, NA),
  26. logSurface = log(Surface),
  27. Width_in = pmax(Width_in, Diam_in, na.rm = T),
  28. Height_in = pmax(Height_in, Diam_in, na.rm = T)) %>%
  29. mutate_all(funs(replace(., . == "n/a", NA))) %>%
  30. mutate_all(funs(replace(., . == "", NA))) %>%
  31. mutate_at(preds_factors, funs(replace(., is.na(.), "NA"))) %>%
  32. mutate_at(preds_factors, factor) %>%
  33. select(preds_keep, logSurface)
  34.  
  35. impute_cols = c("Height_in", "Width_in", "Surface")
  36. imputed = mice(cleaned_df[,impute_cols], method = "pmm", maxit = 50, seed = 1)
  37. imputed = complete(imputed)
  38. cleaned_df[,impute_cols] = imputed
  39.  
  40. cleaned_df$logSurface = log(cleaned_df$Surface)
  41.  
  42. return(cleaned_df)
  43. }
  44.  
  45.  
  46.  
  47. #### xgboost ######
  48. train = paint_train %>% select(-logprice) %>% data.matrix()
  49. train.y = paint_train %>% select(logprice) %>% pull()
  50. data.new <- paint_test %>% select(-logprice) %>% data.matrix()
  51. xgb.fit <- xgboost(data = dtrain,
  52. label = train.y,
  53. objective = "reg:linear",
  54. eval_metric = "rmse",
  55. max.depth = 10,
  56. eta = 0.3,
  57. nround = 100,
  58. verbose =0)
  59.  
  60. xgb.pred <- exp(predict(xgb.fit, newdata = data.new))
  61.  
  62.  
  63. ###### BART ####
  64.  
  65. paint_train <- clean_data_impute(paintings_train) %>% distinct()
  66. paint_test <- clean_data_impute(paintings_test)
  67. train <- paint_train %>% select(dealer, year, Interm, origin_cat, endbuyer,
  68. engraved, prevcoll, finished, lrgfont, discauth, logSurface,
  69. winningbiddertype, position)
  70. train.y <- paint_train$logprice
  71. test <- paint_test %>% select(dealer, year, Interm, origin_cat, endbuyer,
  72. engraved, prevcoll, finished, lrgfont, discauth, logSurface,
  73. winningbiddertype, position)
  74.  
  75. lmbart <- bart(x.train=train,
  76. y.train=train.y,
  77. x.test=test)
  78.  
  79. bart.pred <- colMeans(lmbart$yhat.test)
Add Comment
Please, Sign In to add comment