# ANOVA and t-test

# Session 2 - Other statistical tests (30 minutes)

# Lecture: Understanding two-sample t-tests and ANOVA, the curse of dimensionality
# Discussion: Similarities to regression
# Demonstration: Executing t-test and ANOVA as linear models
# Hands-on Exercises: Exploring model forms in R

# ---- ANOVA ----
# ANALYSIS OF VARIANCE: split the total sum of squares (SST) into a model
# part (SSM) and a residual part (SSR), so that SST = SSM + SSR.
# classical approach

# dplyr supplies %>%, group_by(), mutate(), summarise() and ungroup() used
# below; attaching it again is harmless if it is already loaded.
library(dplyr)

PlantGrowth

Plant_lm <- lm(weight ~ group, data = PlantGrowth)
Plant_lm
# (Intercept)    grouptrt1    grouptrt2
# 5.032       -0.371        0.494

# What do these coefficients mean?
PlantGrowth %>%
  group_by(group) %>%
  summarise(avg = mean(weight))
# 1 ctrl   5.03
# 2 trt1   4.66
# 3 trt2   5.53

# name the coefficients (read from the fit rather than retyping rounded
# values, so they stay in sync with the model):
Plant_coef <- coef(Plant_lm)
b_0 <- Plant_coef[["(Intercept)"]] #  5.032: mean of ctrl
b_1 <- Plant_coef[["grouptrt1"]]   # -0.371: mean of trt1 minus mean of ctrl
b_2 <- Plant_coef[["grouptrt2"]]   #  0.494: mean of trt2 minus mean of ctrl

# Let's define the two models (ANOVA, NULL):
# global_mean is the NULL model's prediction (one mean for every row),
# group_mean is the ANOVA model's prediction (one mean per group).
Plant_stats <- PlantGrowth %>%
  mutate(global_mean = mean(weight)) %>%
  group_by(group) %>%
  mutate(group_mean = mean(weight)) %>%
  ungroup() # drop the grouping so later verbs are not silently per-group

# Calculate the sums of squares: SST, SSR, SSM
SST <- sum((Plant_stats$weight - Plant_stats$global_mean)^2)
SSR <- sum((Plant_stats$weight - Plant_stats$group_mean)^2)
SSM <- sum((Plant_stats$global_mean - Plant_stats$group_mean)^2)

# The decomposition holds: these two values agree.
SSM + SSR
SST
all.equal(SSM + SSR, SST) # tolerance-based check; never use == on doubles

N <- nrow(PlantGrowth)
K <- length(coef(Plant_lm)) # 3: the number of coefficients (b_0, b_1, b_2)
MSM <- SSM / (K - 1) # The MEAN squared model
MSR <- SSR / (N - K) # The MEAN squared residuals

# Consequence: MSM / MSR compares between-group spread to within-group
# spread, each per degree of freedom. SSM adds up over all N rows while its
# divisor (K - 1) stays fixed, so with a real group effect the ratio will
# increase as the sample size increases.
MSM / MSR # 4.846088 # put this on an F distribution

# Upper-tail probability of the F ratio on (K - 1, N - K) degrees of freedom.
# (Only when df1 = 1 is the F distribution the square of a t distribution;
# here df1 = 2.)
pf(MSM / MSR, df1 = K - 1, df2 = N - K, lower.tail = FALSE) # 0.01590996

# Typical:
anova(Plant_lm) # 0.01591
# Same as calculating above.
# A low p-value indicates that data like these would be unlikely if there
# were no influence of X on Y: here there is about a 1.6% chance of observing
# an F ratio at least this large purely by chance.

# Can we do this with a two-sample t-test??
# ---- Two-Sample t-test ----

# dplyr supplies %>%, group_by(), mutate() and ungroup() used below;
# attaching it again is harmless if it is already loaded.
library(dplyr)

# Do different treatments result in extra sleep?
sleep

# typical (var.equal = TRUE pools the variance, matching the linear model):
t.test(extra ~ group, data = sleep, var.equal = TRUE)
# t = -1.8608, df = 18, p-value = 0.07919

# Let's define the two models (ANOVA, NULL):
# global_mean is the NULL model's prediction (one mean for every row),
# group_mean is the ANOVA model's prediction (one mean per group).
sleep_stats <- sleep %>%
  mutate(global_mean = mean(extra)) %>%
  group_by(group) %>%
  mutate(group_mean = mean(extra)) %>%
  ungroup() # drop the grouping so later verbs are not silently per-group

# What is b_1? 1.58
sleep_lm <- lm(extra ~ group, data = sleep)
sleep_lm
# (Intercept)       group2
#        0.75         1.58
# b_1 is the difference between the two group means reported by t.test():
# mean in group 1 mean in group 2
# 0.75            2.33
2.33 - 0.75 # b_1

# Let's treat this as an ANOVA:
# Calculate the sums of squares: SST, SSR, SSM
SST <- sum((sleep_stats$extra - sleep_stats$global_mean)^2)
SSR <- sum((sleep_stats$extra - sleep_stats$group_mean)^2)
SSM <- sum((sleep_stats$global_mean - sleep_stats$group_mean)^2)

# The decomposition holds: these two values agree.
SSM + SSR
SST
all.equal(SSM + SSR, SST) # tolerance-based check; never use == on doubles

N <- nrow(sleep)
K <- length(coef(sleep_lm)) # 2: the number of coefficients (b_0, b_1)
MSM <- SSM / (K - 1) # The MEAN squared model
MSR <- SSR / (N - K) # The MEAN squared residuals

# Consequence: MSM / MSR compares between-group spread to within-group
# spread, each per degree of freedom. SSM adds up over all N rows while its
# divisor (K - 1) stays fixed, so with a real group effect the ratio will
# increase as the sample size increases.
MSM / MSR # 3.462627 # put this on an F distribution

# With two groups df1 = 1, so the F ratio is the t statistic squared
# ((-1.8608)^2 = 3.4626) and the F test reproduces the t-test p-value:
pf(MSM / MSR, df1 = K - 1, df2 = N - K, lower.tail = FALSE) # 0.07918671