Advertisement
Guest User

Example

a guest
Sep 30th, 2016
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.73 KB | None | 0 0
  1. *======================================================================*
  2. *
  3. * Defining local macros for use in de-duplication process
  4. *
  5. *======================================================================*;
  6.  
  7. /* This macro defines variables where we want to replace potential duplicate observations
  8. with the most recent, non-missing observation */;
  9.  
  10. local last BIRTHYR CONTINT GENDER
  11. ICIG30 ICGR30 IPIP30 ISLS30
  12. ICIGUPD ICGRWK IPIPWK ISLSWK
  13. ICIGSOON ZIP
  14. INSURANCE PREGNANT MHSA_INTAKE INTERPRETER LANGUAGE;
  15.  
  16. /* This macro defines variables where we want to replace potential duplicate observations
  17. with the lowest value non-missing observation for the unique year & quarter combination in question.
  18. We would do this for cases where 1 = a positive occurence (such as race or referral source)
  19. and 2 or higher = some other outcome of interest. */;
  20.  
  21. local lowest HISPANIC REASCL SELFPROX ICONSNT
  22. ICIGTYP ICGRTYP IPIPTYP ISLSTYP IOTYP ITYP_NONE
  23. ICIGUSE ICGRUSE IPIPUSE ISLSUSE
  24. ITOQUITCIG ITOQUITCGR ITOQUITPIP ITOQUITSLS ITOQUITO
  25. RACEWHT RACEBLK RACEAIAN RACEASN RACEHPI RACEOTH
  26. REFSAFL REFSAO REFSAPB REFSRCOM REFSRFAM REFSRHP
  27. REFSRINS REFSRMNP REFSRMNT REFSRMO REFSRMRA REFSRMTV
  28. REFSRO REFSRWK;
  29.  
  30. /* This macro defines variables where we want to replace potential duplicate observations
  31. with the highest value non-missing observation for the unique year & quarter combination in question.
  32. We would do this for cases where values ascend in relative importance (such as 5 representing greater
  33. education than 4) or for cases where a numerically greater outcome (2 > 1) carries meaningful significance
  34. (for FRSTCALL) */;
  35.  
  36. local highest EDUCATE FRSTCALL;
  37.  
  38. /* This macro defines variables where we want to replace potential duplicate observations
  39. with the most recent (chronologically) non-missing observation for the unique year & quarter combination in question. */;
  40.  
  41. local datefix ICIGLST ICGRLST IPIPLST ISLSLST
  42. INTDATE;
  43.  
  44.  
  45. *======================================================================*
  46. *
  47. * De-duplication process
  48. *
  49. *======================================================================*;
  50.  
  51. *======================================================================*
  52. * Capturing potential duplicates and readying data for de-duplication by CALLERID, YEAR & QTR
  53. *======================================================================*;
  54.  
  55. forvalues j = 1/26{;
  56. preserve;
  57. keep if YEARQTR == `j';
  58. bysort CALLERID: gen pop = _N;
  59. egen maxpop = max(pop);
  60. local maxpop = maxpop;
  61.  
  62. * This section replaces each variable in list 'last' with the most recent non-missing entry for that variable;
  63.  
  64. foreach var of varlist `last'{;
  65. gen temp_`var' = "" ;
  66. gsort CALLERID -INTDATE_dt, gen(CALLERIDBYDATE_`var');
  67. sort CALLERID CALLERIDBYDATE_`var';
  68. forvalues i = 1/`maxpop'{;
  69. gsort CALLERID;
  70. by CALLERID: replace temp_`var' = `var'[`i'] if !missing(`var'[`i']) & missing(temp_`var');
  71. };
  72. bysort CALLERID: replace `var' = temp_`var';
  73. drop CALLERIDBYDATE_`var';
  74. drop temp_`var';
  75. };
  76.  
  77. * This section replaces each variable in list 'lowest' with the lowest non-missing value observed for that variable;
  78.  
  79. foreach var of varlist `lowest'{;
  80. destring `var', replace;
  81. gen byte temp_`var' = .m;
  82. gsort CALLERID +`var', gen(CALLERLOW_`var');
  83. sort CALLERID CALLERLOW_`var';
  84. forvalues i = 1/`maxpop'{;
  85. gsort CALLERID;
  86. by CALLERID: replace temp_`var' = `var'[`i'] if !missing(`var'[`i']) & missing(temp_`var');
  87. };
  88. bysort CALLERID: replace `var' = temp_`var';
  89. drop CALLERLOW_`var';
  90. drop temp_`var';
  91. };
  92.  
  93. * This section replaces each variable in list 'highest' with the highest non-missing value observed for that variable;
  94.  
  95. foreach var of varlist `highest'{;
  96. destring `var', replace;
  97. gen byte temp_`var' = .m;
  98. gsort CALLERID -`var', gen(CALLERHIGH_`var');
  99. sort CALLERID CALLERHIGH_`var';
  100. forvalues i = 1/`maxpop'{;
  101. gsort CALLERID;
  102. by CALLERID: replace temp_`var' = `var'[`i'] if !missing(`var'[`i']) & missing(temp_`var');
  103. };
  104. bysort CALLERID: replace `var' = temp_`var';
  105. drop CALLERHIGH_`var';
  106. drop temp_`var';
  107. };
  108.  
  109. * This section replaces each variable in list 'datefix' with the most recent (chronologically) value observed for that variable;
  110.  
  111. foreach var of varlist `datefix'{;
  112. gen temp_`var' = "" ;
  113. gsort CALLERID -INTDATE_dt, gen(CALLDATEFIX_`var');
  114. sort CALLERID CALLDATEFIX_`var';
  115. forvalues i = 1/`maxpop'{;
  116. gsort CALLERID;
  117. by CALLERID: replace temp_`var' = `var'[`i'] if !missing(`var'[`i']) & missing(temp_`var');
  118. };
  119. bysort CALLERID: replace `var' = temp_`var';
  120. drop CALLDATEFIX_`var';
  121. drop temp_`var';
  122. };
  123.  
  124. * This section limits cleaned data to one observation per CALLERID per QTR;
  125.  
  126. bysort CALLERID: gen NEWPOP = _N;
  127. keep if NEWPOP == 1;
  128.  
  129. quietly compress;
  130. quietly tempfile data_`j';
  131. quietly save `data_`j'', replace;
  132.  
  133. * This section is used to generate Excel workbooks for manual cross-reference against source data;
  134. * It is currently commented out as it is only needed for testing purposes.
  135. * export excel using clean_`j'_test.xlsx, firstrow(variables) replace;
  136.  
  137. restore;
  138. };
  139.  
  140. clear;
  141.  
  142. *======================================================================*
  143. * Re-assembling de-duplicated data into one large data set
  144. *======================================================================*;
  145.  
  146. use `data_1';
  147. forvalues j = 2/26{;
  148. display "now appending `j'";
  149. append using `data_`j'';
  150. };
  151.  
  152. desc;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement