Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- FILENAME REFFILE '/home/apexmuaado0/dap_assignment/dap_data.xls';
- PROC IMPORT DATAFILE=REFFILE
- DBMS=XLS
- OUT=dap_data;
- GETNAMES=YES;
- RUN;
- /* drop the Donor ID column */
- data dap_data(drop=D_ID);
- set dap_data;
- run;
- /* list the first 10 observations */
- proc print data=work.dap_data (obs=10);
- title 'Listing of the Data Set';
- run;
- title;
- /* summary of the table contents */
- PROC CONTENTS DATA=work.dap_1;
- RUN;
- /* *******************************************************ENCODE CATEGORICAL VARIABLES*****************************************************************/
- /* ****************************************************************************************************************************************************/
- /* encode categorical variable in char to numeric
- Donor_Status:: A if active funder, S if star funder, N if new funder, E if inactive funder, F if first time funder, L if lapsing funder
- DUMMY VARS CREAteD For all THE DONORS EXCEPT for donor_status_l
- */
- data work.dap_data;
- set work.dap_data;
- if Donor_Status = 'A' then Donor_Status_a = 1;
- else if Donor_Status = 'S' then Donor_Status_s = 1;
- else if Donor_Status = 'N' then Donor_Status_n = 1;
- else if Donor_Status = 'E' then Donor_Status_e = 1;
- else if Donor_Status = 'F' then Donor_Status_f = 1;
- run;
- /* missing values in the dummy variables are declared as zero : except donor status l*/
- data work.dap_data;
- set work.dap_data;
- if Donor_Status_a = . then Donor_Status_a = 0;
- if Donor_Status_s = . then Donor_Status_s = 0;
- if Donor_Status_n = . then Donor_Status_n = 0;
- if Donor_Status_e = . then Donor_Status_e = 0;
- if Donor_Status_f = . then Donor_Status_f = 0;
- run;
- /* confirm the masking values are changed in the dataset */
- proc freq data = work.dap_1;
- table Donor_Status_n;
- run;
- /*
- DemHomeOwner:: H - if home owner U - if unknown
- CREAtE ONE DUMMY Variable FOR HOME OWNER
- */
- data work.dap_data;
- set work.dap_data;
- if DemHomeOwner = 'H' then DemHomeOwner_n = 1;
- if DemHomeOwner = 'U' then DemHomeOwner_n = 0;
- run;
- /* confirm the masking values are changed in the dataset */
- proc freq data = work.dap_data;
- table DemHomeOwner_n;
- run;
- /*
- Gender:: M for Male : F for Female
- create two dummy variables for male and female
- */
- data work.dap_data;
- set work.dap_data;
- if Gender = 'M' then Gender_m = 1;
- if Gender = 'F' then Gender_f = 1;
- run;
- /*declare misssing values of gender dummy variables as zero */
- data work.dap_data;
- set work.dap_data;
- if Gender_m = . then Gender_m = 0;
- if Gender_f = . then Gender_f = 0;
- run;
- /* check if any missing values exisit */
- proc means data = work.dap_1 n nmiss ;
- var _numeric_;
- run;
- /* *********************END*************************************END**************************************END*******************************************/
- /* ****************************************************************************************************************************************************/
- /* create a new dataset with the new encoded variables */
- data dap_1;
- set work.dap_data (drop=Gender DemHomeOwner Donor_Status);
- run;
- /* view the new dataset with encoded variables */
- PROC CONTENTS DATA=work.dap_1;
- RUN;
- /* change the vraiable type of the DemArea from char to numeric */
- data dap_1;
- set work.dap_1;
- DemArea2 = input(DemArea, best12.);
- run;
- data dap_1;
- set work.dap_1 (drop = DemArea);
- run;
- /* Outlier treatment *****************************************************************************************************************************/
- proc univariate data= WORK.dap_1 robustscale plot;
- var age areahomevalue areamedincome callcntall callcntcardall callcntcardp1
- callcntcardp2 callcntp1 callcntp2 demarea2 donavgcardp1 donavgall
- donavgcardp1 donavglast donavgp1 doncntall doncntcardall doncntcardp1 doncntp1
- donor donor_D donor_status_prev_camp dontimefirst dontimelast;
- run;
- /* get the values for iqr 3rd quartile and 1st quartile*/
- proc means data=work.dap_1 stackods n qrange p25 p75;
- var age areahomevalue areamedincome callcntall callcntcardall callcntcardp1
- callcntcardp2 callcntp1 callcntp2 demarea2 donavgcardp1 donavgall
- donavgcardp1 donavglast donavgp1 doncntall doncntcardall doncntcardp1 doncntp1
- donor donor_D donor_status_prev_camp dontimefirst dontimelast;
- ods output summary=ranges;
- run;
- *macro to cap outliers;
- %macro cap(dset=,var=, lower=, upper=);
- data &dset;
- set &dset;
- if &var>&upper then &var=&upper;
- if &var<&lower then &var=&lower;
- run;
- %mend;
- *create cutoffs and execute macro for each variable;
- data cutoffs;
- set ranges;
- lower=p25-1.5*qrange;
- upper=p75+1.5*qrange;
- string = catt('%cap(dset=dap_1, var=', variable, ", lower=", lower, ", upper=", upper ,");");
- call execute(string);
- run;
- /*assign labels */
- data dap_1; set dap_1;
- label
- Donor_D = "Amount donated for previous campaigns"
- DonCntP1 = "Times the person funded in the last 36 months"
- DONCntAll = "Times the person donated over period of few years"
- DONCntCardP1 = "Times the person funded in the last 36 months with the help of references"
- DONCntCardAll = "Times the person donated over period of few years with the help of references"
- DONAvgLast = "Amount funded last by the person (in $ thousands)"
- DONAvgP1 = "Fund Amount in Average for 36 Months"
- DONAvgAll = "Fund Amount in Average for overall period of few years"
- DONAvgCardP1 = "Fund Amount in Average for 36 Months from the individual with the help of references"
- DONTimeLast = "No of months since the donator made last donation"
- DONTimeFirst = "No of months since the donator made first donation"
- CallCntP2 = "No of Calls made by GRDS to the individual for the last 12 months"
- CallCntP1 = "No of Calls made by GRDS to the individual for the last 36 months"
- CallCntAll = "No of Calls made by GRDS to the individual over the period of few years"
- CallCntCardP2 = "No of Calls made by References to the individual for the last 12 months"
- CallCntCardP1 = "No of Calls made by References to the individual for the last 36 months"
- CallCntCardAll = "No of Calls made by References to the individual over the period of few years"
- Donor_Status_a = "active funder"
- Donor_Status_s = "star funder"
- Donor_Status_n = "new funder"
- Donor_Status_e = "inactive funder"
- Donor_Status_f = "first time funder"
- DemHomeOwner_n = "Owns home or not"
- Donor_Status_Prev_Camp = "Donated to the last response"
- DemArea2 = "Demographic Area"
- Age = "Age"
- Gender_m = "Male"
- Gender_f = "Female"
- AreaHomeValue = "median home value"
- AreaMedIncome = "median income";
- RUN;
- data dap_1;
- set work.dap_1 (drop = Donor_D);
- run;
- data dap_1_mc;
- set work.dap_1;
- run;
- /* ############################################################################################################################################################################################### */
- /* ############################################################################################################################################################################################### */
- /* ############################################################################################################################################################################################### */
- /* ############################################################################################################################################################################################### */
- /* ############################################################################################################################################################################################### */
- /* ############################################################################################################################################################################################### */
- /* ############################################################################################################################################################################################### */
- /* ############################################################################################################################################################################################### */
- /* ############################################################################################################################################################################################### */
- /* DAP_1 ::: Final dataset with outliers treated */
- /***** MEAN IMPUTATION ******/
- /* confirm the masking values are changed in the dataset */
- proc means data = work.dap_1 N nmiss;
- var _numeric_;
- run;
- proc stdize data=work.dap_1 out=Imputed_mean
- /* oprefix=Orig_ /* prefix for original variables */
- reponly /* only replace; do not standardize */
- method=MEAN; /* or MEDIAN, MINIMUM, MIDRANGE, etc. */
- var Donor_D DonCntP1 DONCntCardP1 DONCntCardAll DONAvgLast DONTimeLast CallCntP1 CallCntCardP2 DONTimeFirst Age CallCntCardAll; /* All continuous variables here */
- run;
- proc means data = dap_1_mc N nmiss;
- var _numeric_;
- run;
- proc freq data=dap_1_mc;
- table _numeric_;
- run;
- /************************ MULTICOLLINEARITY TREATEMENT ***********************************/
- /* STEP 1: Check VIF for all indp variables --> take max variable (X) */
- proc reg data=work.dap_1_mc;
- model Donor = DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
- DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
- CallCntCardP2 CallCntCardP1 CallCntCardAll Donor_Status_Prev_Camp Age
- AreaHomeValue AreaMedIncome Donor_Status_a Donor_Status_s Donor_Status_n
- Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2
- / tol vif collin;
- run;
- /* STEP 2: Take the variable with highest collinearity with X --> (X)*(Y) */
- /* STEP 3: Check Chi Sqr for (X)*(Y) */
- proc logistic data = work.dap_1_mc;
- CLASS Donor_Status_a Donor_Status_s Donor_Status_n Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f;
- MODEL Donor(event='1')= DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
- DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
- CallCntCardP2 CallCntCardP1 CallCntCardAll Donor_Status_Prev_Camp Age
- AreaHomeValue AreaMedIncome Donor_Status_a Donor_Status_s Donor_Status_n
- Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2 / OUTROC = ROC;
- ROC;
- ODS OUTPUT ROCASSOCIATION = AUC;
- run;
- /* STEP 4: Drop the variable with LOWER chisq */
- /* 1. remove Donor_Status_a */
- data work.dap_1_mc;
- set work.dap_1_mc;
- drop Donor_Status_a;
- run;
- /* STEP 5: Repeat steps 1-4 until the highest VIF is below cutt-off (<2) */
- /********************************************************************************************************************/
- /* ROUND 2 */
- proc reg data=work.dap_1_mc;
- model Donor = DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
- DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
- CallCntCardP2 CallCntCardP1 CallCntCardAll Donor_Status_Prev_Camp Age
- AreaHomeValue AreaMedIncome Donor_Status_s Donor_Status_n
- Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2
- / tol vif collin;
- run;
- /* STEP 2: Take the variable with highest collinearity with X --> (X)*(Y) */
- /* STEP 3: Check Chi Sqr for (X)*(Y) */
- proc logistic data = work.dap_1_mc;
- CLASS Donor_Status_s Donor_Status_n Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f;
- MODEL Donor(event='1')= DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
- DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
- CallCntCardP2 CallCntCardP1 CallCntCardAll Donor_Status_Prev_Camp Age
- AreaHomeValue AreaMedIncome Donor_Status_s Donor_Status_n
- Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2 / OUTROC = ROC;
- ROC;
- ODS OUTPUT ROCASSOCIATION = AUC;
- run;
- /* STEP 4: Drop the variable with LOWER chisq */
- /* 1. remove CallCntCardAll */
- data work.dap_1_mc;
- set work.dap_1_mc;
- drop CallCntCardAll;
- run;
- /* STEP 5: Repeat steps 1-4 until the highest VIF is below cutt-off (<2) */
- /********************************************************************************************************************/
- /* ROUND 3 */
- proc reg data=work.dap_1_mc;
- model Donor = DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
- DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
- CallCntCardP2 CallCntCardP1 Donor_Status_Prev_Camp Age
- AreaHomeValue AreaMedIncome Donor_Status_s Donor_Status_n
- Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2
- / tol vif collin;
- run;
- /* STEP 2: Take the variable with highest collinearity with X --> (X)*(Y) */
- /* STEP 3: Check Chi Sqr for (X)*(Y) */
- proc logistic data = work.dap_1_mc;
- CLASS Donor_Status_s Donor_Status_n Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f;
- MODEL Donor(event='1')= DonCntP1 DONCntAll DONCntCardP1 DONCntCardAll DONAvgLast DONAvgP1
- DONAvgAll DONAvgCardP1 DONTimeLast DONTimeFirst CallCntP2 CallCntP1 CallCntAll
- CallCntCardP2 CallCntCardP1 Donor_Status_Prev_Camp Age
- AreaHomeValue AreaMedIncome Donor_Status_s Donor_Status_n
- Donor_Status_e Donor_Status_f DemHomeOwner_n Gender_m Gender_f DemArea2 / OUTROC = ROC;
- ROC;
- ODS OUTPUT ROCASSOCIATION = AUC;
- run;
- /* STEP 4: Drop the variable with LOWER chisq */
- /* 1. remove CallCntCardAll */
- data work.dap_1_mc;
- set work.dap_1_mc;
- drop CallCntCardAll;
- run;
- /* STEP 5: Repeat steps 1-4 until the highest VIF is below cutt-off (<2) */
- /********************************************************************************************************************/
- /* ROUND 4 */
- /********************************************************************************************************************/
- /* ROUND 5 */
- /********************************************************************************************************************/
- /* ROUND 6 */
- /********************************************************************************************************************/
- /* ROUND 7 */
- /********************************************************************************************************************/
- /* ROUND 8 */
- /********************************************************************************************************************/
- /* ROUND 9 */
- /********************************************************************************************************************/
- /* ROUND 10 */
- /********************************************************************************************************************/
- /* ROUND 11 */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement