Advertisement
Alppen

grouping_estimator_selection_bias_simulation

Nov 4th, 2016
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 4.80 KB | None | 0 0
  1. #simulate selection bias in extensive margin using grouping estimation
  2. #You can just copy-paste this into your R, the biased coefficient is on the last row, with "right" coefficients above it.
  3. #
  4. #Individuals choose between two states based on their relative utilities.
  5. #State 0 has some income Y0, state 1 has some income Y1 and some disutility v1.
  6. #Incomes determined by group, time, group-time and individual factors.
  7. #Disutility determined by group, time and individual factors.
  8. #All of these "effects" are individually distributed.
  9. #Y0 and Y1 are "observed" if individual chooses states 0 or 1, respectively.
  10. #
  11. #Estimate the effect of (unobserved) income differential on state in a linear probability model using three approaches:
  12. #
  13. #"True" model: use unobserved income differential directly
  14. #2SLS using group-time dummies as instruments for (unobserved) incomes.
  15. #Group-averaged OLS for (unobserved) incomes.
  16. #Group-averaged OLS for observed incomes.
  17. #
  18. #The second and the third  produce very similar results to the first, as they should. They are "correct". Also, they produce identical results, as they should.
  19. #The last one produces larger effects: this is bias.
  20. #
  21. set.seed(123)
  22. #Number of groups
  23. nG = 20
  24. #Number of periods
  25. nT = 10
  26. #Number of individuals in group (group-time)
  27. nObsInG=1000
  28. #Number of rows
  29. nrow=nObsInG*nG*nT
  30. #Declare data frame
  31. df<-data.frame(s=rep(NA,nrow),G=rep(rep(c(1:nG),each=nObsInG),nT),T=rep(rep(c(1:nT),each=nObsInG*nG)))
  32. #Compute group effects:
  33. groupEffects<-data.frame(G=c(1:nG))
  34. groupEffects$GE_Y0 = rnorm(nG)
  35. groupEffects$GE_Y1 = rnorm(nG)
  36. groupEffects$GE_v1 = runif(nG)
  37. #Compute time effects:
  38. timeEffects<-data.frame(T=c(1:nG))
  39. timeEffects$TE_Y0 = rnorm(nG)
  40. timeEffects$TE_Y1 = rnorm(nG)
  41. timeEffects$TE_v1 = runif(nG)
  42. #Group-time effects ("shocks"):
  43. groupTimeEffects<-data.frame(G=rep(c(1:nG),each=nT),T=rep(c(1:nT)))
  44. groupTimeEffects$GTE_Y1<-rnorm(nG*nT)
  45. groupTimeEffects$GTE_Y0<-rnorm(nG*nT)
  46. #Merge
  47. df<-merge(df,groupEffects,by=c("G"),all.x=T)
  48. df<-merge(df,timeEffects,by=c("T"),all.x=T)
  49. df<-merge(df,groupTimeEffects,by=c("G","T"),all.x=T)
  50. #Compute incomes and disutilities
  51. #Assume disutilites are uniformly distributed.
  52. #The deviations from uniformity I've calibrated to make sure there's individuals in both states in all group-times.
  53. df$Y1=df$GE_Y1+df$TE_Y1+df$GTE_Y1+rnorm(nrow,mean=0,sd=2)
  54. df$Y0=df$GE_Y0+df$TE_Y0+df$GTE_Y0+rnorm(nrow,mean=0,sd=2)
  55. df$v1=df$GE_v1+df$TE_v1+runif(nrow,min=-2,max=2)
  56. #Compute (unobserved) difference in disposable income
  57. df$DID<-df$Y1-df$Y0
  58. #Individual chooses state s=1 if difference in disposable income is larger than disutility v1
  59. df$s<-ifelse(df$DID>df$v1,1,0)
  60. #Check how states are distributed
  61. table(df$s)
  62. #This is the "correct" model:
  63. summary(lm(s~DID,data=df))
  64. #2SLS:
  65. df$G.f=factor(df$G)
  66. df$T.f=factor(df$T)
  67. df$GT.f=with(df, interaction(G,  T))
  68. firstStageY1<-lm(Y1~G.f+T.f+GT.f,data=df)
  69. firstStageY0<-lm(Y0~G.f+T.f+GT.f,data=df)
  70. df$Y1_hat<-firstStageY1$fitted.values
  71. df$Y0_hat<-firstStageY0$fitted.values
  72. df$DID_hat<-df$Y1_hat-df$Y0_hat
  73. summary(lm(s~G.f+T.f+DID_hat,data=df))$coefficients["DID_hat","Estimate"]
  74. #Group-averaged model:
  75. Y1_GT_AVG<-aggregate(Y1~G+T,data=df,FUN=mean)
  76. colnames(Y1_GT_AVG)[3]<-"Y1_GT_AVG"
  77. Y0_GT_AVG<-aggregate(Y0~G+T,data=df,FUN=mean)
  78. colnames(Y0_GT_AVG)[3]<-"Y0_GT_AVG"
  79. s_GT_AVG<-aggregate(s~G+T,data=df,FUN=mean)
  80. colnames(s_GT_AVG)[3]<-"s_GT_AVG"
  81. df_GT_AVG<-merge(s_GT_AVG,Y1_GT_AVG,by=c("G","T"))
  82. df_GT_AVG<-merge(df_GT_AVG,Y0_GT_AVG,by=c("G","T"))
  83. head(df_GT_AVG)
  84. df_GT_AVG$G.f=factor(df_GT_AVG$G)
  85. df_GT_AVG$T.f=factor(df_GT_AVG$T)
  86. df_GT_AVG$DID_AVG=df_GT_AVG$Y1_GT_AVG-df_GT_AVG$Y0_GT_AVG
  87. summary(lm(s_GT_AVG~G.f+T.f+DID_AVG,data=df_GT_AVG))
  88. #Group-averaged model on observed values:
  89. Y1_GT_AVG_OBS<-aggregate(Y1~G+T,data=df[df$s==1,],FUN=mean)
  90. colnames(Y1_GT_AVG_OBS)[3]<-"Y1_GT_AVG_OBS"
  91. Y0_GT_AVG_OBS<-aggregate(Y0~G+T,data=df[df$s==0,],FUN=mean)
  92. colnames(Y0_GT_AVG_OBS)[3]<-"Y0_GT_AVG_OBS"
  93. s_GT_AVG_OBS<-aggregate(s~G+T,data=df,FUN=mean)
  94. colnames(s_GT_AVG_OBS)[3]<-"s_GT_AVG_OBS"
  95. df_GT_AVG_OBS<-merge(s_GT_AVG_OBS,Y1_GT_AVG_OBS,by=c("G","T"))
  96. df_GT_AVG_OBS<-merge(df_GT_AVG_OBS,Y0_GT_AVG_OBS,by=c("G","T"))
  97. head(df_GT_AVG_OBS)
  98. df_GT_AVG_OBS$G.f=factor(df_GT_AVG_OBS$G)
  99. df_GT_AVG_OBS$T.f=factor(df_GT_AVG_OBS$T)
  100. df_GT_AVG_OBS$DID_AVG_OBS=df_GT_AVG_OBS$Y1_GT_AVG_OBS-df_GT_AVG_OBS$Y0_GT_AVG_OBS
  101. summary(lm(s_GT_AVG_OBS~G.f+T.f+DID_AVG_OBS,data=df_GT_AVG_OBS))
  102. #Collecting coefficients:
  103. #"True" model
  104. summary(lm(s~DID,data=df))$coefficients["DID","Estimate"]
  105. #2SLS
  106. summary(lm(s~G.f+T.f+DID_hat,data=df))$coefficients["DID_hat","Estimate"]
  107. #Group-averaged
  108. summary(lm(s_GT_AVG~G.f+T.f+DID_AVG,data=df_GT_AVG))$coefficients["DID_AVG","Estimate"]
  109. #Group-averaged using observed incomes
  110. summary(lm(s_GT_AVG_OBS~G.f+T.f+DID_AVG_OBS,data=df_GT_AVG_OBS))$coefficients["DID_AVG_OBS","Estimate"]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement