suppressPackageStartupMessages({
  library(rsample)
  library(dplyr)
  library(tidyr)  # needed for gather() below
  library(rboot)  # devtools::install_github("DavisVaughan/rboot")
})
#> Warning: package 'tidyr' was built under R version 3.5.2

dat <- bootstraps(as_tibble(iris), times = 500, apparent = TRUE)

## /////////////////////////////////////////////////////////////////////////////
# High level interface for rset objects

# we need a function that returns a list of 2 elements:
# - a data frame with two columns, `.statistic` and `.estimate`
# - a catchall "extras" slot for other things you may want to compute
# boot_result() helps with that
f <- function(df) {
  res <- df %>%
    summarise(
      mean = mean(Sepal.Width),
      mean2 = mean(Sepal.Length)
    ) %>%
    gather(".statistic", ".estimate")

  boot_result(.result = res, .extra = NULL)
}

# this is called for each analysis() set of the splits
f(analysis(dat$splits[[1]]))
#> $.result
#> # A tibble: 2 x 2
#>   .statistic .estimate
#>   <chr>          <dbl>
#> 1 mean            3.05
#> 2 mean2           5.85
#>
#> $.extra
#> NULL

# do it!
mapped_dat <- boot_map(dat, f)

head(mapped_dat)
#> # A tibble: 6 x 4
#>   splits           id           .result          .extra
#> * <list>           <chr>        <list>           <list>
#> 1 <split [150/57]> Bootstrap001 <tibble [2 × 2]> <NULL>
#> 2 <split [150/49]> Bootstrap002 <tibble [2 × 2]> <NULL>
#> 3 <split [150/57]> Bootstrap003 <tibble [2 × 2]> <NULL>
#> 4 <split [150/54]> Bootstrap004 <tibble [2 × 2]> <NULL>
#> 5 <split [150/56]> Bootstrap005 <tibble [2 × 2]> <NULL>
#> 6 <split [150/53]> Bootstrap006 <tibble [2 × 2]> <NULL>

# with this we can compute the CIs
boot_pctl_rset(mapped_dat)
#> # A tibble: 2 x 6
#>   .statistic lower estimate upper alpha .method
#>   <chr>      <dbl>    <dbl> <dbl> <dbl> <chr>
#> 1 mean        2.99     3.06  3.13  0.05 percentile
#> 2 mean2       5.71     5.84  5.96  0.05 percentile

boot_bca_rset(mapped_dat)
#> # A tibble: 2 x 6
#>   .statistic lower estimate upper alpha .method
#>   <chr>      <dbl>    <dbl> <dbl> <dbl> <chr>
#> 1 mean        2.99     3.06  3.13  0.05 BCa
#> 2 mean2       5.72     5.84  5.97  0.05 BCa

# for boot_bca_rset() we need:
# - the original data (that's why the splits are returned)
# - the .f() used to compute the statistic (.f is an attribute of mapped_dat)
attr(mapped_dat, ".f_callr")
#> function(x) {
#>   x_data <- rsample::analysis(x)
#>   res <- .f(x_data, ...)
#>
#>   # TODO validation
#>
#>   res
#> }
#> <bytecode: 0x7fdc1554f290>
#> <environment: 0x7fdc15474ff8>
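
# --- Aside (not in the original paste) ---------------------------------------
# `.extra` has been NULL so far. Assuming it accepts arbitrary R objects (it is
# described above as a catchall for "other things you may want to compute"), a
# variant of f() could stash extra per-resample objects, which boot_map() would
# then presumably carry along in the `.extra` list column shown above.
f_with_extra <- function(df) {
  res <- df %>%
    summarise(
      mean = mean(Sepal.Width),
      mean2 = mean(Sepal.Length)
    ) %>%
    gather(".statistic", ".estimate")

  # hypothetical use of the catchall slot: keep the resample size around
  boot_result(.result = res, .extra = list(n = nrow(df)))
}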

# This column structure may seem awkward, but it works well for
# broom::tidy() results!
# - A `.std_error` column is also required for boot_t_rset()
# - we could also use jackknife nested resampling (a rough sketch follows this example)
.f_with_var <- function(df) {
  coefs <- broom::tidy(lm(Sepal.Length ~ Sepal.Width, data = df))
  res <- dplyr::select(coefs, .statistic = term, .estimate = estimate, .std_error = std.error)
  boot_result(res)
}

# So for each analysis() set...
.f_with_var(analysis(dat$splits[[1]]))
#> $.result
#> # A tibble: 2 x 3
#>   .statistic  .estimate .std_error
#>   <chr>           <dbl>      <dbl>
#> 1 (Intercept)     6.36       0.475
#> 2 Sepal.Width    -0.167      0.154
#>
#> $.extra
#> NULL

mapped_dat_w_var <- boot_map(dat, .f_with_var)

head(mapped_dat_w_var)
#> # A tibble: 6 x 4
#>   splits           id           .result          .extra
#> * <list>           <chr>        <list>           <list>
#> 1 <split [150/57]> Bootstrap001 <tibble [2 × 3]> <NULL>
#> 2 <split [150/49]> Bootstrap002 <tibble [2 × 3]> <NULL>
#> 3 <split [150/57]> Bootstrap003 <tibble [2 × 3]> <NULL>
#> 4 <split [150/54]> Bootstrap004 <tibble [2 × 3]> <NULL>
#> 5 <split [150/56]> Bootstrap005 <tibble [2 × 3]> <NULL>
#> 6 <split [150/53]> Bootstrap006 <tibble [2 × 3]> <NULL>

boot_pctl_rset(mapped_dat_w_var)
#> # A tibble: 2 x 6
#>   .statistic   lower estimate  upper alpha .method
#>   <chr>        <dbl>    <dbl>  <dbl> <dbl> <chr>
#> 1 (Intercept)  5.65     6.55  7.37    0.05 percentile
#> 2 Sepal.Width -0.500   -0.231 0.0409  0.05 percentile

# this uses that `.std_error` column
# the other two methods ignore it
boot_t_rset(mapped_dat_w_var)
#> # A tibble: 2 x 6
#>   .statistic   lower estimate  upper alpha .method
#>   <chr>        <dbl>    <dbl>  <dbl> <dbl> <chr>
#> 1 (Intercept)  5.70     6.53  7.39    0.05 student-t
#> 2 Sepal.Width -0.488   -0.223 0.0579  0.05 student-t

boot_bca_rset(mapped_dat_w_var)
#> # A tibble: 2 x 6
#>   .statistic   lower estimate  upper alpha .method
#>   <chr>        <dbl>    <dbl>  <dbl> <dbl> <chr>
#> 1 (Intercept)  5.62     6.53  7.35    0.05 BCa
#> 2 Sepal.Width -0.473   -0.223 0.0886  0.05 BCa
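
# --- Aside (not in the original paste) ---------------------------------------
# A rough sketch of the "jackknife nested resampling" idea mentioned above,
# using plain rsample tools (loo_cv() + analysis()) to fill the `.std_error`
# column. Only the boot_result() contract shown above is assumed; rboot's
# actual nested-resampling support (if any) is not confirmed here.
.f_jackknife <- function(df) {
  jack <- rsample::loo_cv(df)
  jack_means <- purrr::map_dbl(jack$splits, ~ mean(rsample::analysis(.x)$Sepal.Width))

  # classic jackknife standard error of the mean
  n <- length(jack_means)
  jack_se <- sqrt((n - 1) / n * sum((jack_means - mean(jack_means))^2))

  res <- tibble::tibble(
    .statistic = "mean",
    .estimate = mean(df$Sepal.Width),
    .std_error = jack_se
  )

  boot_result(res)
}
# mapped via boot_map(dat, .f_jackknife), then e.g. boot_t_rset() as above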

## /////////////////////////////////////////////////////////////////////////////
# High level interface for data frames

iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_pctl_df(times = 1000)
#> # A tibble: 2 x 6
#>   .statistic lower estimate upper alpha .method
#>   <chr>      <dbl>    <dbl> <dbl> <dbl> <chr>
#> 1 mn         5.71     5.84  5.97   0.05 percentile
#> 2 vr         0.557    0.680 0.817  0.05 percentile

iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_pctl_df(times = 1000, alpha = .2)
#> # A tibble: 2 x 6
#>   .statistic lower estimate upper alpha .method
#>   <chr>      <dbl>    <dbl> <dbl> <dbl> <chr>
#> 1 mn         5.76     5.84  5.93    0.2 percentile
#> 2 vr         0.596    0.683 0.770   0.2 percentile

# limit the returned columns
# this is useful if you need to `boot_compute()` intermediate
# things just to get to the statistic you actually want
iris %>%
  boot_compute(dummy = mean(Sepal.Length), important = mean(dummy)) %>%
  boot_pctl_df(important, times = 1000)
#> # A tibble: 1 x 6
#>   .statistic lower estimate upper alpha .method
#>   <chr>      <dbl>    <dbl> <dbl> <dbl> <chr>
#> 1 important  5.71     5.84  5.98   0.05 percentile

# bca works too
iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_bca_df(times = 1000)
#> # A tibble: 2 x 6
#>   .statistic lower estimate upper alpha .method
#>   <chr>      <dbl>    <dbl> <dbl> <dbl> <chr>
#> 1 mn         5.72     5.84  5.98   0.05 BCa
#> 2 vr         0.575    0.686 0.835  0.05 BCa

# for the boot_t method, we compute the std error columns
# and then tell boot_t_df()'s `std_error_vars` argument
# about it with the syntax `vars(<estimate> = <std-error>)`
iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_t_df(times = 1000, std_error_vars = vars(mn = vr))
#> # A tibble: 1 x 6
#>   .statistic lower estimate upper alpha .method
#>   <chr>      <dbl>    <dbl> <dbl> <dbl> <chr>
#> 1 mn         5.72     5.84  5.98   0.05 student-t

# we can group by columns
iris %>%
  group_by(Species) %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_pctl_df(times = 1000)
#> # A tibble: 6 x 7
#>   Species    .statistic  lower estimate upper alpha .method
#>   <fct>      <chr>       <dbl>    <dbl> <dbl> <dbl> <chr>
#> 1 setosa     mn         4.9       5.01  5.11   0.05 percentile
#> 2 setosa     vr         0.0779    0.120 0.166  0.05 percentile
#> 3 versicolor mn         5.79      5.94  6.07   0.05 percentile
#> 4 versicolor vr         0.173     0.261 0.347  0.05 percentile
#> 5 virginica  mn         6.42      6.59  6.77   0.05 percentile
#> 6 virginica  vr         0.262     0.397 0.557  0.05 percentile

iris %>%
  group_by(Species) %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_bca_df(times = 10000)
#> # A tibble: 6 x 7
#>   Species    .statistic  lower estimate upper alpha .method
#>   <fct>      <chr>       <dbl>    <dbl> <dbl> <dbl> <chr>
#> 1 setosa     mn         4.91      5.01  5.11   0.05 BCa
#> 2 setosa     vr         0.0889    0.124 0.181  0.05 BCa
#> 3 versicolor mn         5.80      5.94  6.08   0.05 BCa
#> 4 versicolor vr         0.193     0.266 0.373  0.05 BCa
#> 5 virginica  mn         6.42      6.59  6.76   0.05 BCa
#> 6 virginica  vr         0.286     0.404 0.617  0.05 BCa

iris %>%
  group_by(Species) %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_t_df(std_error_vars = vars(mn = vr), times = 1000)
#> # A tibble: 3 x 7
#>   Species    .statistic lower estimate upper alpha .method
#>   <fct>      <chr>      <dbl>    <dbl> <dbl> <dbl> <chr>
#> 1 setosa     mn          4.92     5.01  5.10  0.05 student-t
#> 2 versicolor mn          5.79     5.94  6.09  0.05 student-t
#> 3 virginica  mn          6.41     6.59  6.78  0.05 student-t

# the model example is possible this way too, but it's really ugly,
# so you'd want to use the rsample approach
iris %>%
  boot_compute(
    model = list(lm(Sepal.Length ~ Sepal.Width)),
    coefs = list(broom::tidy(model)),
    intercept = coefs$estimate[1],
    sepalwidth = coefs$estimate[2]
  ) %>%
  boot_pctl_df(intercept, sepalwidth)
#> # A tibble: 2 x 6
#>   .statistic  lower estimate  upper alpha .method
#>   <chr>       <dbl>    <dbl>  <dbl> <dbl> <chr>
#> 1 intercept   5.59     6.53  7.38    0.05 percentile
#> 2 sepalwidth -0.515   -0.224 0.0826  0.05 percentile

## /////////////////////////////////////////////////////////////////////////////
# Low level interface

# if you already have all of the required info you can use these lower level
# functions:

args(calc_boot_pctl)
#> function (estimate, alpha = 0.05)
#> NULL

args(calc_boot_t)
#> function (estimate, variance, apparent_estimate, apparent_variance,
#>     alpha = 0.05)
#> NULL

args(calc_boot_bca)
#> function (estimate, apparent_estimate, a, alpha = 0.05)
#> NULL
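
# A minimal usage sketch (not from the original paste), assuming `estimate` is
# simply the vector of bootstrapped statistics. It reuses the resamples built
# at the top; note that `dat` also contains the apparent sample.
boot_means <- purrr::map_dbl(dat$splits, ~ mean(analysis(.x)$Sepal.Width))
calc_boot_pctl(estimate = boot_means, alpha = 0.05)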