Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
suppressPackageStartupMessages({
  library(dplyr)
  library(rboot) # devtools::install_github("DavisVaughan/rboot")
  library(rsample)
  library(tidyr)
})
#> Warning: package 'tidyr' was built under R version 3.5.2
# 500 bootstrap resamples of iris. `apparent = TRUE` also keeps the
# original (un-resampled) sample — presumably needed later by the BCa and
# student-t intervals, which reference the apparent estimate; see the
# `apparent_estimate` arguments in the low-level section below.
dat <- bootstraps(as_tibble(iris), times = 500, apparent = TRUE)
## /////////////////////////////////////////////////////////////////////////////
# High level interface for rset objects
# We need a function that returns a list of 2 elements:
# - a df with two columns `.statistic` and `.estimate`
# - a catchall "extras" for other things you may want to compute
# boot_result() helps with that.
# Statistic function passed to boot_map(): given one analysis() set, it
# must return a boot_result(), i.e. a list with
#   $.result — a df with `.statistic` (chr) and `.estimate` (dbl) columns
#   $.extra  — a catchall for anything else you computed (NULL here)
f <- function(df) {
  res <- df %>%
    summarise(
      mean = mean(Sepal.Width),
      mean2 = mean(Sepal.Length)
    ) %>%
    # pivot_longer() supersedes gather(); reshape the 1-row summary into
    # the required (.statistic, .estimate) long format
    pivot_longer(
      everything(),
      names_to = ".statistic",
      values_to = ".estimate"
    )
  boot_result(.result = res, .extra = NULL)
}
# This is called for each analysis() set of the splits:
f(analysis(dat$splits[[1]]))
#> $.result
#> # A tibble: 2 x 2
#> .statistic .estimate
#> <chr> <dbl>
#> 1 mean 3.05
#> 2 mean2 5.85
#>
#> $.extra
#> NULL
# Do it! boot_map() applies f to every analysis() set and stores the
# pieces of each boot_result() in the `.result` / `.extra` list columns.
mapped_dat <- boot_map(dat, f)
head(mapped_dat)
#> # A tibble: 6 x 4
#> splits id .result .extra
#> * <list> <chr> <list> <list>
#> 1 <split [150/57]> Bootstrap001 <tibble [2 × 2]> <NULL>
#> 2 <split [150/49]> Bootstrap002 <tibble [2 × 2]> <NULL>
#> 3 <split [150/57]> Bootstrap003 <tibble [2 × 2]> <NULL>
#> 4 <split [150/54]> Bootstrap004 <tibble [2 × 2]> <NULL>
#> 5 <split [150/56]> Bootstrap005 <tibble [2 × 2]> <NULL>
#> 6 <split [150/53]> Bootstrap006 <tibble [2 × 2]> <NULL>
# With this we can compute the CIs (one row per .statistic):
boot_pctl_rset(mapped_dat)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mean 2.99 3.06 3.13 0.05 percentile
#> 2 mean2 5.71 5.84 5.96 0.05 percentile
boot_bca_rset(mapped_dat)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mean 2.99 3.06 3.13 0.05 BCa
#> 2 mean2 5.72 5.84 5.97 0.05 BCa
# For boot_bca_rset() we need:
# - the original data (that's why the splits are returned)
# - the .f() used to compute the statistic (.f is an attribute of mapped_dat)
attr(mapped_dat, ".f_callr")
#> function(x) {
#> x_data <- rsample::analysis(x)
#> res <- .f(x_data, ...)
#>
#> # TODO validation
#>
#> res
#> }
#> <bytecode: 0x7fdc1554f290>
#> <environment: 0x7fdc15474ff8>
# This column structure may seem awkward, but it works well for
# broom::tidy() results!
# - A `.std_error` column is also required for boot_t_rset()
# - we could also use jackknife nested resampling
# Statistic function whose .result also carries a `.std_error` column
# (required by boot_t_rset()). Fits a simple linear model on one
# analysis() set and returns the tidied coefficients as a boot_result().
.f_with_var <- function(df) {
  lm(Sepal.Length ~ Sepal.Width, data = df) %>%
    broom::tidy() %>%
    dplyr::select(
      .statistic = term,
      .estimate = estimate,
      .std_error = std.error
    ) %>%
    boot_result()
}
# So for each analysis() set we now get estimates AND standard errors:
.f_with_var(analysis(dat$splits[[1]]))
#> $.result
#> # A tibble: 2 x 3
#> .statistic .estimate .std_error
#> <chr> <dbl> <dbl>
#> 1 (Intercept) 6.36 0.475
#> 2 Sepal.Width -0.167 0.154
#>
#> $.extra
#> NULL
mapped_dat_w_var <- boot_map(dat, .f_with_var)
head(mapped_dat_w_var)
#> # A tibble: 6 x 4
#> splits id .result .extra
#> * <list> <chr> <list> <list>
#> 1 <split [150/57]> Bootstrap001 <tibble [2 × 3]> <NULL>
#> 2 <split [150/49]> Bootstrap002 <tibble [2 × 3]> <NULL>
#> 3 <split [150/57]> Bootstrap003 <tibble [2 × 3]> <NULL>
#> 4 <split [150/54]> Bootstrap004 <tibble [2 × 3]> <NULL>
#> 5 <split [150/56]> Bootstrap005 <tibble [2 × 3]> <NULL>
#> 6 <split [150/53]> Bootstrap006 <tibble [2 × 3]> <NULL>
boot_pctl_rset(mapped_dat_w_var)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 (Intercept) 5.65 6.55 7.37 0.05 percentile
#> 2 Sepal.Width -0.500 -0.231 0.0409 0.05 percentile
# This one uses that `.std_error` column;
# the other two methods ignore it.
boot_t_rset(mapped_dat_w_var)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 (Intercept) 5.70 6.53 7.39 0.05 student-t
#> 2 Sepal.Width -0.488 -0.223 0.0579 0.05 student-t
boot_bca_rset(mapped_dat_w_var)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 (Intercept) 5.62 6.53 7.35 0.05 BCa
#> 2 Sepal.Width -0.473 -0.223 0.0886 0.05 BCa
## /////////////////////////////////////////////////////////////////////////////
# High level interface for data frames:
# boot_compute() declares the statistics (summarise()-style expressions),
# boot_*_df() resamples `times` times and returns the intervals directly.
iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_pctl_df(times = 1000)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mn 5.71 5.84 5.97 0.05 percentile
#> 2 vr 0.557 0.680 0.817 0.05 percentile
iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_pctl_df(times = 1000, alpha = .2)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mn 5.76 5.84 5.93 0.2 percentile
#> 2 vr 0.596 0.683 0.770 0.2 percentile
# Limit the returned columns.
# This is useful if you need to `boot_compute()` intermediate
# things just to get to the statistic you actually want:
iris %>%
  boot_compute(dummy = mean(Sepal.Length), important = mean(dummy)) %>%
  boot_pctl_df(important, times = 1000)
#> # A tibble: 1 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 important 5.71 5.84 5.98 0.05 percentile
# BCa works too:
iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_bca_df(times = 1000)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mn 5.72 5.84 5.98 0.05 BCa
#> 2 vr 0.575 0.686 0.835 0.05 BCa
# For the boot_t method, we compute the std error columns
# and then tell boot_t_df()'s `std_error_vars` argument
# about it with the syntax `vars(<estimate> = <std-error>)`:
iris %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_t_df(times = 1000, std_error_vars = vars(mn = vr))
#> # A tibble: 1 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 mn 5.72 5.84 5.98 0.05 student-t
# We can group by columns — intervals are computed per group:
iris %>%
  group_by(Species) %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_pctl_df(times = 1000)
#> # A tibble: 6 x 7
#> Species .statistic lower estimate upper alpha .method
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 setosa mn 4.9 5.01 5.11 0.05 percentile
#> 2 setosa vr 0.0779 0.120 0.166 0.05 percentile
#> 3 versicolor mn 5.79 5.94 6.07 0.05 percentile
#> 4 versicolor vr 0.173 0.261 0.347 0.05 percentile
#> 5 virginica mn 6.42 6.59 6.77 0.05 percentile
#> 6 virginica vr 0.262 0.397 0.557 0.05 percentile
iris %>%
  group_by(Species) %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_bca_df(times = 10000)
#> # A tibble: 6 x 7
#> Species .statistic lower estimate upper alpha .method
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 setosa mn 4.91 5.01 5.11 0.05 BCa
#> 2 setosa vr 0.0889 0.124 0.181 0.05 BCa
#> 3 versicolor mn 5.80 5.94 6.08 0.05 BCa
#> 4 versicolor vr 0.193 0.266 0.373 0.05 BCa
#> 5 virginica mn 6.42 6.59 6.76 0.05 BCa
#> 6 virginica vr 0.286 0.404 0.617 0.05 BCa
iris %>%
  group_by(Species) %>%
  boot_compute(mn = mean(Sepal.Length), vr = var(Sepal.Length)) %>%
  boot_t_df(std_error_vars = vars(mn = vr), times = 1000)
#> # A tibble: 3 x 7
#> Species .statistic lower estimate upper alpha .method
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 setosa mn 4.92 5.01 5.10 0.05 student-t
#> 2 versicolor mn 5.79 5.94 6.09 0.05 student-t
#> 3 virginica mn 6.41 6.59 6.78 0.05 student-t
# The model example is possible this way too, but really ugly,
# so you'd want to use the rsample approach.
# NOTE(review): `lm()` is called without `data =` here — presumably
# boot_compute() data-masks the resampled columns so Sepal.Length /
# Sepal.Width resolve; confirm against boot_compute()'s implementation.
iris %>%
  boot_compute(
    model = list(lm(Sepal.Length ~ Sepal.Width)),
    coefs = list(broom::tidy(model)),
    intercept = coefs$estimate[1],
    sepalwidth = coefs$estimate[2]
  ) %>%
  boot_pctl_df(intercept, sepalwidth)
#> # A tibble: 2 x 6
#> .statistic lower estimate upper alpha .method
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 intercept 5.59 6.53 7.38 0.05 percentile
#> 2 sepalwidth -0.515 -0.224 0.0826 0.05 percentile
## /////////////////////////////////////////////////////////////////////////////
# Low level interface.
# If you already have all of the required info you can use these lower level
# functions directly (signatures shown via args()):
args(calc_boot_pctl)
#> function (estimate, alpha = 0.05)
#> NULL
args(calc_boot_t)
#> function (estimate, variance, apparent_estimate, apparent_variance,
#> alpha = 0.05)
#> NULL
args(calc_boot_bca)
#> function (estimate, apparent_estimate, a, alpha = 0.05)
#> NULL
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement