# Untitled

# the upshot is that in this case it seemed to work fine - maybe it depends
# on the degree of collinearity between the variables?

library(tibble)
library(dplyr)
library(broom)

# Fix the RNG state so every run simulates the same data set.
set.seed(nchar("them regression models, eh?")^3)

# Simulation settings ----

sample_size <- 10000 # number of simulated observations
prop <- 0.1 # probability that a raw binary draw equals 1

# Simulated data ----
# tibble() evaluates its columns in order, so later columns can use earlier
# ones. The three random draws happen in the order iv1, iv2, dv.
#   iv1: 0/1 draw with P(1) = prop
#   iv2: a second 0/1 draw, multiplied by iv1 so it is forced to 0 wherever
#        iv1 is 0
#   dv:  normal noise (sd = 1) centred on iv1 + 0.5 * iv2
dat <- tibble(
  iv1 = sample(
    c(1, 0),
    size = sample_size,
    replace = TRUE,
    prob = c(prop, 1 - prop)
  ),
  iv2 = iv1 * sample(
    c(1, 0),
    size = sample_size,
    replace = TRUE,
    prob = c(prop, 1 - prop)
  ),
  dv = rnorm(sample_size, mean = iv1 + (iv2 * 0.5), sd = 1)
)

# when iv1 is 0, iv2 is always 0 (iv2 can only be 1 when iv1 is 1);
# the true coefficients are 1 for iv1 and 0.5 for iv2


# Correlation between the two predictors, returned as a one-row tibble.
summarise(dat, cor = cor(iv1, iv2))

# they're weakly correlated

# Number of observations in each observed iv1 x iv2 combination.
count(dat, iv1, iv2)

# we've got a good enough number of observations in each group

# Model with both predictors. Fit it once and reuse the fitted object for the
# coefficient table and the fit statistics, instead of estimating the same
# model twice.
fit_both <- lm(dv ~ iv1 + iv2, data = dat)

# Coefficient table (one row per term).
broom::tidy(fit_both)
# this separates out the two effects correctly

# One-row summary of model-level fit statistics.
broom::glance(fit_both)

# Model with iv2 only (iv1 omitted). Fit it once and reuse the fitted object
# for both summaries.
fit_iv2 <- lm(dv ~ iv2, data = dat)

# Coefficient table (one row per term).
broom::tidy(fit_iv2)

# the estimate for iv2 picks up the effect of both iv1 and iv2

# One-row summary of model-level fit statistics.
broom::glance(fit_iv2)

# and the model fits pretty badly


# Model with iv1 only (iv2 omitted). Fit it once and reuse the fitted object
# for both summaries.
fit_iv1 <- lm(dv ~ iv1, data = dat)

# Coefficient table (one row per term).
broom::tidy(fit_iv1)
# this picks up the effect of iv1 correctly

# One-row summary of model-level fit statistics.
broom::glance(fit_iv1)

# and it fits almost as well as the model with both variables
# ... I guess because there are so few cases with iv2

# Density plot of the simulated outcome.
# ggplot2 is not attached by this script (only tibble, dplyr and broom are),
# so the plotting functions are namespace-qualified; unqualified calls fail
# with "could not find function".
ggplot2::ggplot(dat, ggplot2::aes(x = dv)) +
  ggplot2::geom_density()