Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
suppressPackageStartupMessages({
  library(dplyr)
  library(rboot) # devtools::install_github("DavisVaughan/rboot")
  library(rsample)
  library(tidyr)
})
#> Warning: package 'tidyr' was built under R version 3.5.2
# 500 bootstrap resamples of iris. `apparent = TRUE` also keeps the
# original (un-resampled) sample — presumably needed later by the BCa and
# student-t intervals, which reference the apparent estimate; see the
# `apparent_estimate` arguments in the low-level section below.
dat <- bootstraps(as_tibble(iris), times = 500, apparent = TRUE)
## /////////////////////////////////////////////////////////////////////////////
# High level interface for rset objects
# We need a function that returns a list of 2 elements:
# - a df with two columns `.statistic` and `.estimate`
# - a catchall "extras" for other things you may want to compute
# boot_result() helps with that.
# Statistic function passed to boot_map(): given one analysis() set, it
# must return a boot_result(), i.e. a list with
#   $.result — a df with `.statistic` (chr) and `.estimate` (dbl) columns
#   $.extra  — a catchall for anything else you computed (NULL here)
f <- function(df) {
  res <- df %>%
    summarise(
      mean = mean(Sepal.Width),
      mean2 = mean(Sepal.Length)
    ) %>%
    # pivot_longer() supersedes gather(); reshape the 1-row summary into
    # the required (.statistic, .estimate) long format
    pivot_longer(
      everything(),
      names_to = ".statistic",
      values_to = ".estimate"
    )
  boot_result(.result = res, .extra = NULL)
}
# This is called for each analysis() set of the splits:
f(analysis(dat$splits[[1]]))
#> $.result
#> # A tibble: 2 x 2
#> .statistic .estimate
#> <chr> <dbl>
#> 1 mean 3.05
#> 2 mean2 5.85
#>
#> $.extra
#> NULL
# Do it! boot_map() applies f to every analysis() set and stores the
# pieces of each boot_result() in the `.result` / `.extra` list columns.
mapped_dat <- boot_map(dat, f)
head(mapped_dat)
#> # A tibble: 6 x 4
#> splits id .result .extra
#> * <list> <chr> <list> <list>
#> 1 <split [150/57]> Bootstrap001 <tibble [2 × 2]> <NULL>
#> 2 <split [150/49]> Bootstrap002 <tibble [2 × 2]> <NULL>
#> 3 <split [150/57]> Bootstrap003 <tibble [2 × 2]> <NULL>
#> 4 <split [150/54]> Bootstrap004 <tibble [2 × 2]> <NULL>
#> 5 <split [150/56]> Bootstrap005 <tibble [2 × 2]> <NULL>
#> 6 <split [150/53]> Bootstrap006 <tibble [2 × 2]> <NULL>
# With this we can compute the CIs (one row per .statistic):
boot_pctl_rset(mapped_dat)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mean 2.99 3.06 3.13 0.05 percentile
#> 2 mean2 5.71 5.84 5.96 0.05 percentile
boot_bca_rset(mapped_dat)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mean 2.99 3.06 3.13 0.05 BCa
#> 2 mean2 5.72 5.84 5.97 0.05 BCa
# For boot_bca_rset() we need:
# - the original data (that's why the splits are returned)
# - the .f() used to compute the statistic (.f is an attribute of mapped_dat)
attr(mapped_dat, ".f_callr")
#> function(x) {
#> x_data <- rsample::analysis(x)
#> res <- .f(x_data, ...)
#>
#> # TODO validation
#>
#> res
#> }
#> <bytecode: 0x7fdc1554f290>
#> <environment: 0x7fdc15474ff8>
# This column structure may seem awkward, but it works well for
# broom::tidy() results!
# - A `.std_error` column is also required for boot_t_rset()
# - we could also use jackknife nested resampling
# Statistic function whose .result also carries a `.std_error` column
# (required by boot_t_rset()). Fits a simple linear model on one
# analysis() set and returns the tidied coefficients as a boot_result().
.f_with_var <- function(df) {
  lm(Sepal.Length ~ Sepal.Width, data = df) %>%
    broom::tidy() %>%
    dplyr::select(
      .statistic = term,
      .estimate = estimate,
      .std_error = std.error
    ) %>%
    boot_result()
}
# So for each analysis() set we now get estimates AND standard errors:
.f_with_var(analysis(dat$splits[[1]]))
#> $.result
#> # A tibble: 2 x 3
#> .statistic .estimate .std_error
#> <chr> <dbl> <dbl>
#> 1 (Intercept) 6.36 0.475
#> 2 Sepal.Width -0.167 0.154
#>
#> $.extra
#> NULL
mapped_dat_w_var <- boot_map(dat, .f_with_var)
head(mapped_dat_w_var)
#> # A tibble: 6 x 4
#> splits id .result .extra
#> * <list> <chr> <list> <list>
#> 1 <split [150/57]> Bootstrap001 <tibble [2 × 3]> <NULL>
#> 2 <split [150/49]> Bootstrap002 <tibble [2 × 3]> <NULL>
#> 3 <split [150/57]> Bootstrap003 <tibble [2 × 3]> <NULL>
#> 4 <split [150/54]> Bootstrap004 <tibble [2 × 3]> <NULL>
#> 5 <split [150/56]> Bootstrap005 <tibble [2 × 3]> <NULL>
#> 6 <split [150/53]> Bootstrap006 <tibble [2 × 3]> <NULL>
boot_pctl_rset(mapped_dat_w_var)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 (Intercept) 5.65 6.55 7.37 0.05 percentile
#> 2 Sepal.Width -0.500 -0.231 0.0409 0.05 percentile
# This one uses that `.std_error` column;
# the other two methods ignore it.
boot_t_rset(mapped_dat_w_var)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 (Intercept) 5.70 6.53 7.39 0.05 student-t
#> 2 Sepal.Width -0.488 -0.223 0.0579 0.05 student-t
boot_bca_rset(mapped_dat_w_var)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 (Intercept) 5.62 6.53 7.35 0.05 BCa
#> 2 Sepal.Width -0.473 -0.223 0.0886 0.05 BCa
## /////////////////////////////////////////////////////////////////////////////
# High level interface for data frames:
# boot_compute() declares the statistics (summarise()-style expressions),
# boot_*_df() resamples `times` times and returns the intervals directly.
iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_pctl_df(times = 1000)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mn 5.71 5.84 5.97 0.05 percentile
#> 2 vr 0.557 0.680 0.817 0.05 percentile
iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_pctl_df(times = 1000, alpha = .2)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mn 5.76 5.84 5.93 0.2 percentile
#> 2 vr 0.596 0.683 0.770 0.2 percentile
# Limit the returned columns.
# This is useful if you need to `boot_compute()` intermediate
# things just to get to the statistic you actually want:
iris %>%
  boot_compute(dummy = mean(Sepal.Length), important = mean(dummy)) %>%
  boot_pctl_df(important, times = 1000)
#> # A tibble: 1 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 important 5.71 5.84 5.98 0.05 percentile
# BCa works too:
iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_bca_df(times = 1000)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mn 5.72 5.84 5.98 0.05 BCa
#> 2 vr 0.575 0.686 0.835 0.05 BCa
# For the boot_t method, we compute the std error columns
# and then tell boot_t_df()'s `std_error_vars` argument
# about it with the syntax `vars(<estimate> = <std-error>)`:
iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_t_df(times = 1000, std_error_vars = vars(mn = vr))
#> # A tibble: 1 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mn 5.72 5.84 5.98 0.05 student-t
# We can group by columns — intervals are computed per group:
iris %>%
  group_by(Species) %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_pctl_df(times = 1000)
#> # A tibble: 6 x 7
#> Species .statistic lower estimate upper alpha .method
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 setosa mn 4.9 5.01 5.11 0.05 percentile
#> 2 setosa vr 0.0779 0.120 0.166 0.05 percentile
#> 3 versicolor mn 5.79 5.94 6.07 0.05 percentile
#> 4 versicolor vr 0.173 0.261 0.347 0.05 percentile
#> 5 virginica mn 6.42 6.59 6.77 0.05 percentile
#> 6 virginica vr 0.262 0.397 0.557 0.05 percentile
iris %>%
  group_by(Species) %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_bca_df(times = 10000)
#> # A tibble: 6 x 7
#> Species .statistic lower estimate upper alpha .method
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 setosa mn 4.91 5.01 5.11 0.05 BCa
#> 2 setosa vr 0.0889 0.124 0.181 0.05 BCa
#> 3 versicolor mn 5.80 5.94 6.08 0.05 BCa
#> 4 versicolor vr 0.193 0.266 0.373 0.05 BCa
#> 5 virginica mn 6.42 6.59 6.76 0.05 BCa
#> 6 virginica vr 0.286 0.404 0.617 0.05 BCa
iris %>%
  group_by(Species) %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_t_df(std_error_vars = vars(mn = vr), times = 1000)
#> # A tibble: 3 x 7
#> Species .statistic lower estimate upper alpha .method
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 setosa mn 4.92 5.01 5.10 0.05 student-t
#> 2 versicolor mn 5.79 5.94 6.09 0.05 student-t
#> 3 virginica mn 6.41 6.59 6.78 0.05 student-t
# The model example is possible this way too, but really ugly,
# so you'd want to use the rsample approach.
# NOTE(review): `lm()` is called without `data =` here — presumably
# boot_compute() data-masks the resampled columns so Sepal.Length /
# Sepal.Width resolve; confirm against boot_compute()'s implementation.
iris %>%
  boot_compute(
    model = list(lm(Sepal.Length ~ Sepal.Width)),
    coefs = list(broom::tidy(model)),
    intercept = coefs$estimate[1],
    sepalwidth = coefs$estimate[2]
  ) %>%
  boot_pctl_df(intercept, sepalwidth)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 intercept 5.59 6.53 7.38 0.05 percentile
#> 2 sepalwidth -0.515 -0.224 0.0826 0.05 percentile
## /////////////////////////////////////////////////////////////////////////////
# Low level interface.
# If you already have all of the required info you can use these lower level
# functions directly (signatures shown via args()):
args(calc_boot_pctl)
#> function (estimate, alpha = 0.05)
#> NULL
args(calc_boot_t)
#> function (estimate, variance, apparent_estimate, apparent_variance,
#> alpha = 0.05)
#> NULL
args(calc_boot_bca)
#> function (estimate, apparent_estimate, a, alpha = 0.05)
#> NULL
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement