Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #simulate selection bias in extensive margin using grouping estimation
- #You can just copy-paste this into your R, the biased coefficient is on the last row, with "right" coefficients above it.
- #
- #Individuals choose between two states based on their relative utilities.
- #State 0 has some income Y0, state 1 has some income Y1 and some disutility v1.
- #Incomes determined by group, time, group-time and individual factors.
- #Disutility determined by group, time and individual factors.
- #All of these "effects" are individually distributed.
- #Y0 and Y1 are "observed" if individual chooses states 0 or 1, respectively.
- #
- #Estimate the effect of (unobserved) income differential on state in a linear probability model using three approaches:
- #
- #"True" model: use unobserved income differential directly
- #2SLS using group-time dummies as instruments for (unobserved) incomes.
- #Group-averaged OLS for (unobserved) incomes.
- #Group-averaged OLS for observed incomes.
- #
- #The second and the third produce very similar results to the first, as they should. They are "correct". Also, they produce identical results, as they should.
- #The last one produces larger effects: this is bias.
- #
- set.seed(123)
- #Number of groups
- nG = 20
- #Number of periods
- nT = 10
- #Number of individuals in group (group-time)
- nObsInG=1000
- #Number of rows
- nrow=nObsInG*nG*nT
- #Declare data frame
- df<-data.frame(s=rep(NA,nrow),G=rep(rep(c(1:nG),each=nObsInG),nT),T=rep(rep(c(1:nT),each=nObsInG*nG)))
- #Compute group effects:
- groupEffects<-data.frame(G=c(1:nG))
- groupEffects$GE_Y0 = rnorm(nG)
- groupEffects$GE_Y1 = rnorm(nG)
- groupEffects$GE_v1 = runif(nG)
- #Compute time effects:
- timeEffects<-data.frame(T=c(1:nG))
- timeEffects$TE_Y0 = rnorm(nG)
- timeEffects$TE_Y1 = rnorm(nG)
- timeEffects$TE_v1 = runif(nG)
- #Group-time effects ("shocks"):
- groupTimeEffects<-data.frame(G=rep(c(1:nG),each=nT),T=rep(c(1:nT)))
- groupTimeEffects$GTE_Y1<-rnorm(nG*nT)
- groupTimeEffects$GTE_Y0<-rnorm(nG*nT)
- #Merge
- df<-merge(df,groupEffects,by=c("G"),all.x=T)
- df<-merge(df,timeEffects,by=c("T"),all.x=T)
- df<-merge(df,groupTimeEffects,by=c("G","T"),all.x=T)
- #Compute incomes and disutilities
- #Assume disutilites are uniformly distributed.
- #The deviations from uniformity I've calibrated to make sure there's individuals in both states in all group-times.
- df$Y1=df$GE_Y1+df$TE_Y1+df$GTE_Y1+rnorm(nrow,mean=0,sd=2)
- df$Y0=df$GE_Y0+df$TE_Y0+df$GTE_Y0+rnorm(nrow,mean=0,sd=2)
- df$v1=df$GE_v1+df$TE_v1+runif(nrow,min=-2,max=2)
- #Compute (unobserved) difference in disposable income
- df$DID<-df$Y1-df$Y0
- #Individual chooses state s=1 if difference in disposable income is larger than disutility v1
- df$s<-ifelse(df$DID>df$v1,1,0)
- #Check how states are distributed
- table(df$s)
- #This is the "correct" model:
- summary(lm(s~DID,data=df))
- #2SLS:
- df$G.f=factor(df$G)
- df$T.f=factor(df$T)
- df$GT.f=with(df, interaction(G, T))
- firstStageY1<-lm(Y1~G.f+T.f+GT.f,data=df)
- firstStageY0<-lm(Y0~G.f+T.f+GT.f,data=df)
- df$Y1_hat<-firstStageY1$fitted.values
- df$Y0_hat<-firstStageY0$fitted.values
- df$DID_hat<-df$Y1_hat-df$Y0_hat
- summary(lm(s~G.f+T.f+DID_hat,data=df))$coefficients["DID_hat","Estimate"]
- #Group-averaged model:
- Y1_GT_AVG<-aggregate(Y1~G+T,data=df,FUN=mean)
- colnames(Y1_GT_AVG)[3]<-"Y1_GT_AVG"
- Y0_GT_AVG<-aggregate(Y0~G+T,data=df,FUN=mean)
- colnames(Y0_GT_AVG)[3]<-"Y0_GT_AVG"
- s_GT_AVG<-aggregate(s~G+T,data=df,FUN=mean)
- colnames(s_GT_AVG)[3]<-"s_GT_AVG"
- df_GT_AVG<-merge(s_GT_AVG,Y1_GT_AVG,by=c("G","T"))
- df_GT_AVG<-merge(df_GT_AVG,Y0_GT_AVG,by=c("G","T"))
- head(df_GT_AVG)
- df_GT_AVG$G.f=factor(df_GT_AVG$G)
- df_GT_AVG$T.f=factor(df_GT_AVG$T)
- df_GT_AVG$DID_AVG=df_GT_AVG$Y1_GT_AVG-df_GT_AVG$Y0_GT_AVG
- summary(lm(s_GT_AVG~G.f+T.f+DID_AVG,data=df_GT_AVG))
- #Group-averaged model on observed values:
- Y1_GT_AVG_OBS<-aggregate(Y1~G+T,data=df[df$s==1,],FUN=mean)
- colnames(Y1_GT_AVG_OBS)[3]<-"Y1_GT_AVG_OBS"
- Y0_GT_AVG_OBS<-aggregate(Y0~G+T,data=df[df$s==0,],FUN=mean)
- colnames(Y0_GT_AVG_OBS)[3]<-"Y0_GT_AVG_OBS"
- s_GT_AVG_OBS<-aggregate(s~G+T,data=df,FUN=mean)
- colnames(s_GT_AVG_OBS)[3]<-"s_GT_AVG_OBS"
- df_GT_AVG_OBS<-merge(s_GT_AVG_OBS,Y1_GT_AVG_OBS,by=c("G","T"))
- df_GT_AVG_OBS<-merge(df_GT_AVG_OBS,Y0_GT_AVG_OBS,by=c("G","T"))
- head(df_GT_AVG_OBS)
- df_GT_AVG_OBS$G.f=factor(df_GT_AVG_OBS$G)
- df_GT_AVG_OBS$T.f=factor(df_GT_AVG_OBS$T)
- df_GT_AVG_OBS$DID_AVG_OBS=df_GT_AVG_OBS$Y1_GT_AVG_OBS-df_GT_AVG_OBS$Y0_GT_AVG_OBS
- summary(lm(s_GT_AVG_OBS~G.f+T.f+DID_AVG_OBS,data=df_GT_AVG_OBS))
- #Collecting coefficients:
- #"True" model
- summary(lm(s~DID,data=df))$coefficients["DID","Estimate"]
- #2SLS
- summary(lm(s~G.f+T.f+DID_hat,data=df))$coefficients["DID_hat","Estimate"]
- #Group-averaged
- summary(lm(s_GT_AVG~G.f+T.f+DID_AVG,data=df_GT_AVG))$coefficients["DID_AVG","Estimate"]
- #Group-averaged using observed incomes
- summary(lm(s_GT_AVG_OBS~G.f+T.f+DID_AVG_OBS,data=df_GT_AVG_OBS))$coefficients["DID_AVG_OBS","Estimate"]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement