Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Session 2 - Other statistical tests (30 minutes)
- # Lecture: Understanding two-sample t-tests and ANOVA, the curse of dimensionality
- # Discussion: Similarities to regression
- # Demonstration: Executing t-test and ANOVA as linear models
- # Hands-on Exercises: Exploring model forms in R
- # ---- ANOVA ----
- # ANALYSIS OF VARIANCE (i.e. SST, SSM, SSR, also... SST = SSM + SSR)
- # This section uses dplyr (%>%, group_by(), mutate(), summarise());
- # attaching it again is harmless if it is already loaded.
- library(dplyr)
- # classical approach
- PlantGrowth
- Plant_lm <- lm(weight ~ group, data = PlantGrowth)
- Plant_lm
- # (Intercept) grouptrt1 grouptrt2
- # 5.032 -0.371 0.494
- # What do these coefficients mean?
- # Compare them with the per-group means:
- PlantGrowth %>%
-   group_by(group) %>%
-   summarise(avg = mean(weight))
- # 1 ctrl 5.03
- # 2 trt1 4.66
- # 3 trt2 5.53
- # name the coefficients (read from the fit instead of typing rounded values):
- b_0 <- coef(Plant_lm)[["(Intercept)"]] # 5.032, the mean of ctrl
- b_1 <- coef(Plant_lm)[["grouptrt1"]] # -0.371, mean(trt1) - mean(ctrl)
- b_2 <- coef(Plant_lm)[["grouptrt2"]] # 0.494, mean(trt2) - mean(ctrl)
- # Let's define the two models (ANOVA, NULL):
- # global_mean is the prediction of the NULL model (one mean for everyone),
- # group_mean is the prediction of the ANOVA model (one mean per group).
- # ungroup() at the end so Plant_stats does not carry the grouping around.
- Plant_stats <- PlantGrowth %>%
-   mutate(global_mean = mean(weight)) %>%
-   group_by(group) %>%
-   mutate(group_mean = mean(weight)) %>%
-   ungroup()
- # Calculate the Variances: SSR, SSM, SST
- SST <- sum((Plant_stats$weight - Plant_stats$global_mean)^2)
- SSR <- sum((Plant_stats$weight - Plant_stats$group_mean)^2)
- SSM <- sum((Plant_stats$global_mean - Plant_stats$group_mean)^2)
- # These two values should agree (SST = SSM + SSR):
- SSM + SSR
- SST
- N <- nrow(PlantGrowth)
- K <- 3 # The number of coefficients (b_0, b_1, b_2)
- MSM <- SSM / (K - 1) # The MEAN squared model
- MSR <- SSR / (N - K) # The MEAN squared residuals
- # Consequence is ... for a fixed effect, SSM grows with the sample size
- # while MSR stays close to the residual variance, so the
- # ratio will increase
- MSM / MSR # 4.846088 # put this on an F distribution
- # With a single numerator df, F is just the t dist squared;
- # here there are df1 = K - 1 = 2 numerator df.
- # lower.tail is spelled out in full to avoid partial argument matching.
- pf(MSM / MSR, df1 = (K - 1), df2 = (N - K), lower.tail = FALSE) # 0.01590996
- # Typical:
- anova(Plant_lm) # 0.01591
- # Same as calculating above
- # A low p-value indicates that it is unlikely to see this data
- # if there was no influence of X on Y; here there is about a 1.6% chance
- # of observing results at least this extreme purely by chance alone.
- # Can we do this with a two-sample t-test??
- # ---- Two-Sample t-test ----
- # Do different treatments result in extra sleep?
- sleep
- # typical:
- t.test(extra ~ group, data = sleep, var.equal = TRUE)
- # p-value = 0.07919
- # Let's define the two models (two-group model, NULL):
- # global_mean is the prediction of the NULL model,
- # group_mean is the prediction of the model with one mean per group.
- # (uses dplyr verbs, so dplyr must be attached)
- # ungroup() at the end so sleep_stats does not carry the grouping around.
- sleep_stats <- sleep %>%
-   mutate(global_mean = mean(extra)) %>%
-   group_by(group) %>%
-   mutate(group_mean = mean(extra)) %>%
-   ungroup()
- # What is b_1? 1.58
- lm(extra ~ group, data = sleep)
- # mean in group 1 mean in group 2
- # 0.75 2.33
- 2.33 - 0.75 # b_1
- # Let's treat this as an ANOVA:
- # Calculate the Variances: SSR, SSM, SST
- SST <- sum((sleep_stats$extra - sleep_stats$global_mean)^2)
- SSR <- sum((sleep_stats$extra - sleep_stats$group_mean)^2)
- SSM <- sum((sleep_stats$global_mean - sleep_stats$group_mean)^2)
- # These two values should agree (SST = SSM + SSR):
- SSM + SSR
- SST
- N <- nrow(sleep)
- K <- 2 # The number of coefficients (b_0, b_1)
- MSM <- SSM / (K - 1) # The MEAN squared model
- MSR <- SSR / (N - K) # The MEAN squared residuals
- # Consequence is ... for a fixed effect, SSM grows with the sample size
- # while MSR stays close to the residual variance, so the
- # ratio will increase
- MSM / MSR # 3.4626 (= t^2, with t = -1.8608) # put this on an F distribution
- # With df1 = K - 1 = 1, the F dist is just the t dist squared,
- # so this reproduces the t-test p-value.
- # lower.tail is spelled out in full to avoid partial argument matching.
- pf(MSM / MSR, df1 = (K - 1), df2 = (N - K), lower.tail = FALSE) # 0.07918671
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement