grouping_estimator_selection_bias_simulation

#simulate selection bias in extensive margin using grouping estimation
#You can just copy-paste this into your R, the biased coefficient is on the last row, with "right" coefficients above it.
#
#Individuals choose between two states based on their relative utilities.
#State 0 has some income Y0, state 1 has some income Y1 and some disutility v1.
#Incomes determined by group, time, group-time and individual factors.
#Disutility determined by group, time and individual factors.
#All of these "effects" are individually distributed.
#Y0 and Y1 are "observed" if individual chooses states 0 or 1, respectively.
#
#Estimate the effect of (unobserved) income differential on state in a linear probability model using three approaches:
#
#"True" model: use unobserved income differential directly
#2SLS using group-time dummies as instruments for (unobserved) incomes.
#Group-averaged OLS for (unobserved) incomes.
#Group-averaged OLS for observed incomes.
#
#The second and the third  produce very similar results to the first, as they should. They are "correct". Also, they produce identical results, as they should.
#The last one produces larger effects: this is bias.
#
set.seed(123)
#Number of groups
nG = 20
#Number of periods
nT = 10
#Number of individuals in group (group-time)
nObsInG=1000
#Number of rows
nrow=nObsInG*nG*nT
#Declare data frame
df<-data.frame(s=rep(NA,nrow),G=rep(rep(c(1:nG),each=nObsInG),nT),T=rep(rep(c(1:nT),each=nObsInG*nG)))
#Compute group effects:
groupEffects<-data.frame(G=c(1:nG))
groupEffects$GE_Y0 = rnorm(nG)
groupEffects$GE_Y1 = rnorm(nG)
groupEffects$GE_v1 = runif(nG)
#Compute time effects:
timeEffects<-data.frame(T=c(1:nG))
timeEffects$TE_Y0 = rnorm(nG)
timeEffects$TE_Y1 = rnorm(nG)
timeEffects$TE_v1 = runif(nG)
#Group-time effects ("shocks"):
groupTimeEffects<-data.frame(G=rep(c(1:nG),each=nT),T=rep(c(1:nT)))
groupTimeEffects$GTE_Y1<-rnorm(nG*nT)
groupTimeEffects$GTE_Y0<-rnorm(nG*nT)
#Merge
df<-merge(df,groupEffects,by=c("G"),all.x=T)
df<-merge(df,timeEffects,by=c("T"),all.x=T)
df<-merge(df,groupTimeEffects,by=c("G","T"),all.x=T)
#Compute incomes and disutilities
#Assume disutilites are uniformly distributed.
#The deviations from uniformity I've calibrated to make sure there's individuals in both states in all group-times.
df$Y1=df$GE_Y1+df$TE_Y1+df$GTE_Y1+rnorm(nrow,mean=0,sd=2)
df$Y0=df$GE_Y0+df$TE_Y0+df$GTE_Y0+rnorm(nrow,mean=0,sd=2)
df$v1=df$GE_v1+df$TE_v1+runif(nrow,min=-2,max=2)
#Compute (unobserved) difference in disposable income
df$DID<-df$Y1-df$Y0
#Individual chooses state s=1 if difference in disposable income is larger than disutility v1
df$s<-ifelse(df$DID>df$v1,1,0)
#Check how states are distributed
table(df$s)
#This is the "correct" model:
summary(lm(s~DID,data=df))
#2SLS:
df$G.f=factor(df$G)
df$T.f=factor(df$T)
df$GT.f=with(df, interaction(G,  T))
firstStageY1<-lm(Y1~G.f+T.f+GT.f,data=df)
firstStageY0<-lm(Y0~G.f+T.f+GT.f,data=df)
df$Y1_hat<-firstStageY1$fitted.values
df$Y0_hat<-firstStageY0$fitted.values
df$DID_hat<-df$Y1_hat-df$Y0_hat
summary(lm(s~G.f+T.f+DID_hat,data=df))$coefficients["DID_hat","Estimate"]
#Group-averaged model:
Y1_GT_AVG<-aggregate(Y1~G+T,data=df,FUN=mean)
colnames(Y1_GT_AVG)[3]<-"Y1_GT_AVG"
Y0_GT_AVG<-aggregate(Y0~G+T,data=df,FUN=mean)
colnames(Y0_GT_AVG)[3]<-"Y0_GT_AVG"
s_GT_AVG<-aggregate(s~G+T,data=df,FUN=mean)
colnames(s_GT_AVG)[3]<-"s_GT_AVG"
df_GT_AVG<-merge(s_GT_AVG,Y1_GT_AVG,by=c("G","T"))
df_GT_AVG<-merge(df_GT_AVG,Y0_GT_AVG,by=c("G","T"))
head(df_GT_AVG)
df_GT_AVG$G.f=factor(df_GT_AVG$G)
df_GT_AVG$T.f=factor(df_GT_AVG$T)
df_GT_AVG$DID_AVG=df_GT_AVG$Y1_GT_AVG-df_GT_AVG$Y0_GT_AVG
summary(lm(s_GT_AVG~G.f+T.f+DID_AVG,data=df_GT_AVG))
#Group-averaged model on observed values:
Y1_GT_AVG_OBS<-aggregate(Y1~G+T,data=df[df$s==1,],FUN=mean)
colnames(Y1_GT_AVG_OBS)[3]<-"Y1_GT_AVG_OBS"
Y0_GT_AVG_OBS<-aggregate(Y0~G+T,data=df[df$s==0,],FUN=mean)
colnames(Y0_GT_AVG_OBS)[3]<-"Y0_GT_AVG_OBS"
s_GT_AVG_OBS<-aggregate(s~G+T,data=df,FUN=mean)
colnames(s_GT_AVG_OBS)[3]<-"s_GT_AVG_OBS"
df_GT_AVG_OBS<-merge(s_GT_AVG_OBS,Y1_GT_AVG_OBS,by=c("G","T"))
df_GT_AVG_OBS<-merge(df_GT_AVG_OBS,Y0_GT_AVG_OBS,by=c("G","T"))
head(df_GT_AVG_OBS)
df_GT_AVG_OBS$G.f=factor(df_GT_AVG_OBS$G)
df_GT_AVG_OBS$T.f=factor(df_GT_AVG_OBS$T)
df_GT_AVG_OBS$DID_AVG_OBS=df_GT_AVG_OBS$Y1_GT_AVG_OBS-df_GT_AVG_OBS$Y0_GT_AVG_OBS
summary(lm(s_GT_AVG_OBS~G.f+T.f+DID_AVG_OBS,data=df_GT_AVG_OBS))
#Collecting coefficients:
#"True" model
summary(lm(s~DID,data=df))$coefficients["DID","Estimate"]
#2SLS
summary(lm(s~G.f+T.f+DID_hat,data=df))$coefficients["DID_hat","Estimate"]
#Group-averaged
summary(lm(s_GT_AVG~G.f+T.f+DID_AVG,data=df_GT_AVG))$coefficients["DID_AVG","Estimate"]
#Group-averaged using observed incomes
summary(lm(s_GT_AVG_OBS~G.f+T.f+DID_AVG_OBS,data=df_GT_AVG_OBS))$coefficients["DID_AVG_OBS","Estimate"]