Advertisement
Guest User

Untitled

a guest
Apr 21st, 2019
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 14.96 KB | None | 0 0
  1. FILENAME REFFILE '/home/apexmuaado0/dap_assignment/dap_data.xls';
  2.  
  3. PROC IMPORT DATAFILE=REFFILE
  4. DBMS=XLS
  5. OUT=dap_data;
  6. GETNAMES=YES;
  7. RUN;
  8.  
  9. /* drop the Donor ID column */
  10. data dap_data(drop=D_ID);
  11. set dap_data;
  12. run;
  13.  
  14.  
  15.  
  16.  
  17. /* list the first 10 observations */
  18. proc print data=work.dap_data (obs=10);
  19. title 'Listing of the Data Set';
  20. run;
  21. title;
  22.  
  23. /* summary of the table contents */
  24. PROC CONTENTS DATA=work.dap_1;
  25. RUN;
  26.  
  27. /* *******************************************************ENCODE CATEGORICAL VARIABLES*****************************************************************/
  28. /* ****************************************************************************************************************************************************/
  29. /* encode categorical variable in char to numeric
  30. Donor_Status:: A if active funder, S if star funder, N if new funder, E if inactive funder, F if first time funder, L if lapsing funder
  31. DUMMY VARS CREAteD For all THE DONORS EXCEPT for donor_status_l
  32. */
  33. data work.dap_data;
  34. set work.dap_data;
  35. if Donor_Status = 'A' then Donor_Status_a = 1;
  36. else if Donor_Status = 'S' then Donor_Status_s = 1;
  37. else if Donor_Status = 'N' then Donor_Status_n = 1;
  38. else if Donor_Status = 'E' then Donor_Status_e = 1;
  39. else if Donor_Status = 'F' then Donor_Status_f = 1;
  40. run;
  41.  
  42. /* missing values in the dummy variables are declared as zero : except donor status l*/
  43. data work.dap_data;
  44. set work.dap_data;
  45. if Donor_Status_a = . then Donor_Status_a = 0;
  46. if Donor_Status_s = . then Donor_Status_s = 0;
  47. if Donor_Status_n = . then Donor_Status_n = 0;
  48. if Donor_Status_e = . then Donor_Status_e = 0;
  49. if Donor_Status_f = . then Donor_Status_f = 0;
  50. run;
  51.  
  52. /* confirm the masking values are changed in the dataset */
  53. proc freq data = work.dap_1;
  54. table Donor_Status_n;
  55. run;
  56.  
  57. /*
  58. DemHomeOwner:: H - if home owner U - if unknown
  59. CREAtE ONE DUMMY Variable FOR HOME OWNER
  60. */
  61. data work.dap_data;
  62. set work.dap_data;
  63. if DemHomeOwner = 'H' then DemHomeOwner_n = 1;
  64. if DemHomeOwner = 'U' then DemHomeOwner_n = 0;
  65. run;
  66.  
  67. /* confirm the masking values are changed in the dataset */
  68. proc freq data = work.dap_data;
  69. table DemHomeOwner_n;
  70. run;
  71.  
  72. /*
  73. Gender:: M for Male : F for Female
  74. create two dummy variables for male and female
  75. */
  76. data work.dap_data;
  77. set work.dap_data;
  78. if Gender = 'M' then Gender_m = 1;
  79. if Gender = 'F' then Gender_f = 1;
  80. run;
  81.  
  82. /*declare misssing values of gender dummy variables as zero */
  83. data work.dap_data;
  84. set work.dap_data;
  85. if Gender_m = . then Gender_m = 0;
  86. if Gender_f = . then Gender_f = 0;
  87. run;
  88.  
  89. /* check if any missing values exisit */
  90. proc means data = work.dap_1 n nmiss ;
  91. var _numeric_;
  92. run;
  93.  
  94.  
  95. /* *********************END*************************************END**************************************END*******************************************/
  96. /* ****************************************************************************************************************************************************/
  97.  
  98. /* create a new dataset with the new encoded variables */
  99. data dap_1;
  100. set work.dap_data (drop=Gender DemHomeOwner Donor_Status);
  101. run;
  102.  
  103. /* view the new dataset with encoded variables */
  104. PROC CONTENTS DATA=work.dap_1;
  105. RUN;
  106.  
  107. /* change the vraiable type of the DemArea from char to numeric */
  108. data dap_1;
  109. set work.dap_1;
  110. DemArea2 = input(DemArea, best12.);
  111. run;
  112.  
  113. data dap_1;
  114. set work.dap_1 (drop = DemArea);
  115. run;
  116.  
  117. /* Outlier treatment *****************************************************************************************************************************/
  118. proc univariate data= WORK.dap_1 robustscale plot;
  119. var age areahomevalue areamedincome callcntall callcntcardall callcntcardp1
  120. callcntcardp2 callcntp1 callcntp2 demarea2 donavgcardp1 donavgall
  121. donavgcardp1 donavglast donavgp1 doncntall doncntcardall doncntcardp1 doncntp1
  122. donor donor_D donor_status_prev_camp dontimefirst dontimelast;
  123. run;
  124.  
  125.  
  126. /* get the values for iqr 3rd quartile and 1st quartile*/
  127. proc means data=work.dap_1 stackods n qrange p25 p75;
  128. var age areahomevalue areamedincome callcntall callcntcardall callcntcardp1
  129. callcntcardp2 callcntp1 callcntp2 demarea2 donavgcardp1 donavgall
  130. donavgcardp1 donavglast donavgp1 doncntall doncntcardall doncntcardp1 doncntp1
  131. donor donor_D donor_status_prev_camp dontimefirst dontimelast;
  132. ods output summary=ranges;
  133. run;
  134.  
  135. *macro to cap outliers;
  136. %macro cap(dset=,var=, lower=, upper=);
  137. data &dset;
  138. set &dset;
  139. if &var>&upper then &var=&upper;
  140. if &var<&lower then &var=&lower;
  141. run;
  142. %mend;
  143.  
  144. *create cutoffs and execute macro for each variable;
  145. data cutoffs;
  146. set ranges;
  147. lower=p25-1.5*qrange;
  148. upper=p75+1.5*qrange;
  149. string = catt('%cap(dset=dap_1, var=', variable, ", lower=", lower, ", upper=", upper ,");");
  150. call execute(string);
  151. run;
  152.  
  153. /*assign labels */
  154. data dap_1; set dap_1;
  155. label
  156. Donor_D = "Amount donated for previous campaigns"
  157. DonCntP1 = "Times the person funded in the last 36 months"
  158. DONCntAll = "Times the person donated over period of few years"
  159. DONCntCardP1 = "Times the person funded in the last 36 months with the help of references"
  160. DONCntCardAll = "Times the person donated over period of few years with the help of references"
  161. DONAvgLast = "Amount funded last by the person (in $ thousands)"
  162. DONAvgP1 = "Fund Amount in Average for 36 Months"
  163. DONAvgAll = "Fund Amount in Average for overall period of few years"
  164. DONAvgCardP1 = "Fund Amount in Average for 36 Months from the individual with the help of references"
  165. DONTimeLast = "No of months since the donator made last donation"
  166. DONTimeFirst = "No of months since the donator made first donation"
  167. CallCntP2 = "No of Calls made by GRDS to the individual for the last 12 months"
  168. CallCntP1 = "No of Calls made by GRDS to the individual for the last 36 months"
  169. CallCntAll = "No of Calls made by GRDS to the individual over the period of few years"
  170. CallCntCardP2 = "No of Calls made by References to the individual for the last 12 months"
  171. CallCntCardP1 = "No of Calls made by References to the individual for the last 36 months"
  172. CallCntCardAll = "No of Calls made by References to the individual over the period of few years"
  173. Donor_Status_a = "active funder"
  174. Donor_Status_s = "star funder"
  175. Donor_Status_n = "new funder"
  176. Donor_Status_e = "inactive funder"
  177. Donor_Status_f = "first time funder"
  178. DemHomeOwner_n = "Owns home or not"
  179. Donor_Status_Prev_Camp = "Donated to the last response"
  180. DemArea2 = "Demographic Area"
  181. Age = "Age"
  182. Gender_m = "Male"
  183. Gender_f = "Female"
  184. AreaHomeValue = "median home value"
  185. AreaMedIncome = "median income";
  186. RUN;
  187.  
  188.  
  189. data dap_1;
  190. set work.dap_1 (drop = Donor_D);
  191. run;
  192.  
  193. data dap_1_mc;
  194. set work.dap_1;
  195. run;
  196.  
  197.  
  198.  
  199.  
  200.  
  201. /* ############################################################################################################################################################################################### */
  202. /* ############################################################################################################################################################################################### */
  203. /* ############################################################################################################################################################################################### */
  204. /* ############################################################################################################################################################################################### */
  205. /* ############################################################################################################################################################################################### */
  206. /* ############################################################################################################################################################################################### */
  207. /* ############################################################################################################################################################################################### */
  208. /* ############################################################################################################################################################################################### */
  209. /* ############################################################################################################################################################################################### */
  210.  
  211.  
  212.  
  213.  
  214.  
  215.  
  216.  
  217.  
  218.  
  219. /* DAP_1 ::: Final dataset with outliers treated */
  220.  
  221.  
  222. /***** MEAN IMPUTATION ******/
  223. /* confirm the masking values are changed in the dataset */
  224. proc means data = work.dap_1 N nmiss;
  225. var _numeric_;
  226. run;
  227.  
  228. proc stdize data=work.dap_1 out=Imputed_mean
  229. /* oprefix=Orig_ /* prefix for original variables */
  230. reponly /* only replace; do not standardize */
  231. method=MEAN; /* or MEDIAN, MINIMUM, MIDRANGE, etc. */
  232. var Donor_D DonCntP1 DONCntCardP1 DONCntCardAll DONAvgLast DONTimeLast CallCntP1 CallCntCardP2 DONTimeFirst Age CallCntCardAll; /* All continuous variables here */
  233. run;
  234.  
  235. proc means data = dap_1_mc N nmiss;
  236. var _numeric_;
  237. run;
  238.  
  239. proc freq data=dap_1_mc;
  240. table _numeric_;
  241. run;
  242.  
  243. /************************ MULTICOLLINEARITY TREATEMENT ***********************************/
  244. /* STEP 1: Check VIF for all indp variables --> take max variable (X) */
  245. proc reg data=work.dap_1_mc;
  246. model Donor = DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
  247. DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
  248. CallCntCardP2 CallCntCardP1 CallCntCardAll Donor_Status_Prev_Camp Age
  249. AreaHomeValue AreaMedIncome Donor_Status_a Donor_Status_s Donor_Status_n
  250. Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2
  251. / tol vif collin;
  252. run;
  253.  
  254.  
  255. /* STEP 2: Take the variable with highest collinearity with X --> (X)*(Y) */
  256.  
  257. /* STEP 3: Check Chi Sqr for (X)*(Y) */
  258. proc logistic data = work.dap_1_mc;
  259. CLASS Donor_Status_a Donor_Status_s Donor_Status_n Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f;
  260. MODEL Donor(event='1')= DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
  261. DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
  262. CallCntCardP2 CallCntCardP1 CallCntCardAll Donor_Status_Prev_Camp Age
  263. AreaHomeValue AreaMedIncome Donor_Status_a Donor_Status_s Donor_Status_n
  264. Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2 / OUTROC = ROC;
  265. ROC;
  266. ODS OUTPUT ROCASSOCIATION = AUC;
  267. run;
  268.  
  269. /* STEP 4: Drop the variable with LOWER chisq */
  270. /* 1. remove Donor_Status_a */
  271. data work.dap_1_mc;
  272. set work.dap_1_mc;
  273. drop Donor_Status_a;
  274. run;
  275.  
  276. /* STEP 5: Repeat steps 1-4 until the highest VIF is below cutt-off (<2) */
  277.  
  278.  
  279. /********************************************************************************************************************/
  280. /* ROUND 2 */
  281.  
  282. proc reg data=work.dap_1_mc;
  283. model Donor = DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
  284. DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
  285. CallCntCardP2 CallCntCardP1 CallCntCardAll Donor_Status_Prev_Camp Age
  286. AreaHomeValue AreaMedIncome Donor_Status_s Donor_Status_n
  287. Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2
  288. / tol vif collin;
  289. run;
  290.  
  291.  
  292. /* STEP 2: Take the variable with highest collinearity with X --> (X)*(Y) */
  293.  
  294. /* STEP 3: Check Chi Sqr for (X)*(Y) */
  295. proc logistic data = work.dap_1_mc;
  296. CLASS Donor_Status_s Donor_Status_n Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f;
  297. MODEL Donor(event='1')= DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
  298. DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
  299. CallCntCardP2 CallCntCardP1 CallCntCardAll Donor_Status_Prev_Camp Age
  300. AreaHomeValue AreaMedIncome Donor_Status_s Donor_Status_n
  301. Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2 / OUTROC = ROC;
  302. ROC;
  303. ODS OUTPUT ROCASSOCIATION = AUC;
  304. run;
  305.  
  306. /* STEP 4: Drop the variable with LOWER chisq */
  307. /* 1. remove CallCntCardAll */
  308. data work.dap_1_mc;
  309. set work.dap_1_mc;
  310. drop CallCntCardAll;
  311. run;
  312.  
  313. /* STEP 5: Repeat steps 1-4 until the highest VIF is below cutt-off (<2) */
  314.  
  315. /********************************************************************************************************************/
  316. /* ROUND 3 */
  317. proc reg data=work.dap_1_mc;
  318. model Donor = DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
  319. DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
  320. CallCntCardP2 CallCntCardP1 Donor_Status_Prev_Camp Age
  321. AreaHomeValue AreaMedIncome Donor_Status_s Donor_Status_n
  322. Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2
  323. / tol vif collin;
  324. run;
  325.  
  326.  
  327. /* STEP 2: Take the variable with highest collinearity with X --> (X)*(Y) */
  328.  
  329. /* STEP 3: Check Chi Sqr for (X)*(Y) */
  330. proc logistic data = work.dap_1_mc;
  331. CLASS Donor_Status_s Donor_Status_n Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f;
  332. MODEL Donor(event='1')= DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
  333. DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
  334. CallCntCardP2 CallCntCardP1 Donor_Status_Prev_Camp Age
  335. AreaHomeValue AreaMedIncome Donor_Status_s Donor_Status_n
  336. Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2 / OUTROC = ROC;
  337. ROC;
  338. ODS OUTPUT ROCASSOCIATION = AUC;
  339. run;
  340.  
  341. /* STEP 4: Drop the variable with LOWER chisq */
  342. /* 1. remove CallCntCardAll */
  343. data work.dap_1_mc;
  344. set work.dap_1_mc;
  345. drop CallCntCardAll;
  346. run;
  347.  
  348. /* STEP 5: Repeat steps 1-4 until the highest VIF is below cutt-off (<2) */
  349.  
  350.  
  351.  
  352. /********************************************************************************************************************/
  353. /* ROUND 4 */
  354.  
  355.  
  356.  
  357. /********************************************************************************************************************/
  358. /* ROUND 5 */
  359.  
  360.  
  361.  
  362. /********************************************************************************************************************/
  363. /* ROUND 6 */
  364.  
  365.  
  366.  
  367. /********************************************************************************************************************/
  368. /* ROUND 7 */
  369.  
  370.  
  371.  
  372. /********************************************************************************************************************/
  373. /* ROUND 8 */
  374.  
  375.  
  376.  
  377. /********************************************************************************************************************/
  378. /* ROUND 9 */
  379.  
  380.  
  381.  
  382. /********************************************************************************************************************/
  383. /* ROUND 10 */
  384.  
  385.  
  386.  
  387. /********************************************************************************************************************/
  388. /* ROUND 11 */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement