Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- FILENAME REFFILE '/home/apexmuaado0/dap_assignment/dap_data.xls';
- PROC IMPORT DATAFILE=REFFILE
- DBMS=XLS
- OUT=dap_data;
- GETNAMES=YES;
- RUN;
- /* drop the Donor ID column */
- data dap_data(drop=D_ID);
- set dap_data;
- run;
- /* list the first 10 observations */
- proc print data=work.dap_data (obs=10);
- title 'Listing of the Data Set';
- run;
- title;
- /* summary of the table contents */
- PROC CONTENTS DATA=work.dap_1;
- RUN;
- /* *******************************************************ENCODE CATEGORICAL VARIABLES*****************************************************************/
- /* ****************************************************************************************************************************************************/
- /* encode categorical variable in char to numeric
- Donor_Status:: A if active funder, S if star funder, N if new funder, E if inactive funder, F if first time funder, L if lapsing funder
- DUMMY VARS CREAteD For all THE DONORS EXCEPT for donor_status_l
- */
- data work.dap_data;
- set work.dap_data;
- if Donor_Status = 'A' then Donor_Status_a = 1;
- else if Donor_Status = 'S' then Donor_Status_s = 1;
- else if Donor_Status = 'N' then Donor_Status_n = 1;
- else if Donor_Status = 'E' then Donor_Status_e = 1;
- else if Donor_Status = 'F' then Donor_Status_f = 1;
- run;
- /* missing values in the dummy variables are declared as zero : except donor status l*/
- data work.dap_data;
- set work.dap_data;
- if Donor_Status_a = . then Donor_Status_a = 0;
- if Donor_Status_s = . then Donor_Status_s = 0;
- if Donor_Status_n = . then Donor_Status_n = 0;
- if Donor_Status_e = . then Donor_Status_e = 0;
- if Donor_Status_f = . then Donor_Status_f = 0;
- run;
- /* confirm the masking values are changed in the dataset */
- proc freq data = work.dap_1;
- table Donor_Status_n;
- run;
- /*
- DemHomeOwner:: H - if home owner U - if unknown
- CREAtE ONE DUMMY Variable FOR HOME OWNER
- */
- data work.dap_data;
- set work.dap_data;
- if DemHomeOwner = 'H' then DemHomeOwner_n = 1;
- if DemHomeOwner = 'U' then DemHomeOwner_n = 0;
- run;
- /* confirm the masking values are changed in the dataset */
- proc freq data = work.dap_data;
- table DemHomeOwner_n;
- run;
- /*
- Gender:: M for Male : F for Female
- create two dummy variables for male and female
- */
- data work.dap_data;
- set work.dap_data;
- if Gender = 'M' then Gender_m = 1;
- if Gender = 'F' then Gender_f = 1;
- run;
- /*declare misssing values of gender dummy variables as zero */
- data work.dap_data;
- set work.dap_data;
- if Gender_m = . then Gender_m = 0;
- if Gender_f = . then Gender_f = 0;
- run;
- /* check if any missing values exisit */
- proc means data = work.dap_1 n nmiss ;
- var _numeric_;
- run;
- /* *********************END*************************************END**************************************END*******************************************/
- /* ****************************************************************************************************************************************************/
- /* create a new dataset with the new encoded variables */
- data dap_1;
- set work.dap_data (drop=Gender DemHomeOwner Donor_Status);
- run;
- /* view the new dataset with encoded variables */
- PROC CONTENTS DATA=work.dap_1;
- RUN;
- /* change the vraiable type of the DemArea from char to numeric */
- data dap_1;
- set work.dap_1;
- DemArea2 = input(DemArea, best12.);
- run;
- data dap_1;
- set work.dap_1 (drop = DemArea);
- run;
- /* Outlier treatment *****************************************************************************************************************************/
- proc univariate data= WORK.dap_1 robustscale plot;
- var age areahomevalue areamedincome callcntall callcntcardall callcntcardp1
- callcntcardp2 callcntp1 callcntp2 demarea2 demhomeowner_n donavgcardp1 donavgall
- donavgcardp1 donavglast donavgp1 doncntall doncntcardall doncntcardp1 doncntp1
- donor donor_D donor_Status_n donor_status_prev_camp dontimefirst dontimelast Gender_m Gender_f;
- run;
- /* get the values for iqr 3rd quartile and 1st quartile*/
- proc means data=work.dap_1 stackods n qrange p25 p75;
- var age areahomevalue areamedincome callcntall callcntcardall callcntcardp1
- callcntcardp2 callcntp1 callcntp2 demarea2 demhomeowner_n donavgcardp1 donavgall
- donavgcardp1 donavglast donavgp1 doncntall doncntcardall doncntcardp1 doncntp1
- donor donor_D donor_Status_n donor_status_prev_camp dontimefirst dontimelast Gender_m Gender_f;
- ods output summary=ranges;
- run;
- *macro to cap outliers;
- %macro cap(dset=,var=, lower=, upper=);
- data &dset;
- set &dset;
- if &var>&upper then &var=&upper;
- if &var<&lower then &var=&lower;
- run;
- %mend;
- *create cutoffs and execute macro for each variable;
- data cutoffs;
- set ranges;
- lower=p25-1.5*qrange;
- upper=p75+1.5*qrange;
- string = catt('%cap(dset=dap_1, var=', variable, ", lower=", lower, ", upper=", upper ,");");
- call execute(string);
- run;
- /*assign labels */
- data dap_1; set dap_1;
- label
- Donor_D = "Amount donated for previous campaigns"
- DonCntP1 = "Times the person funded in the last 36 months"
- DONCntAll = "Times the person donated over period of few years"
- DONCntCardP1 = "Times the person funded in the last 36 months with the help of references"
- DONCntCardAll = "Times the person donated over period of few years with the help of references"
- DONAvgLast = "Amount funded last by the person (in $ thousands)"
- DONAvgP1 = "Fund Amount in Average for 36 Months"
- DONAvgAll = "Fund Amount in Average for overall period of few years"
- DONAvgCardP1 = "Fund Amount in Average for 36 Months from the individual with the help of references"
- DONTimeLast = "No of months since the donator made last donation"
- DONTimeFirst = "No of months since the donator made first donation"
- CallCntP2 = "No of Calls made by GRDS to the individual for the last 12 months"
- CallCntP1 = "No of Calls made by GRDS to the individual for the last 36 months"
- CallCntAll = "No of Calls made by GRDS to the individual over the period of few years"
- CallCntCardP2 = "No of Calls made by References to the individual for the last 12 months"
- CallCntCardP1 = "No of Calls made by References to the individual for the last 36 months"
- CallCntCardAll = "No of Calls made by References to the individual over the period of few years"
- Donor_Status_a = "active funder"
- Donor_Status_s = "star funder"
- Donor_Status_n = "new funder"
- Donor_Status_e = "inactive funder"
- Donor_Status_f = "first time funder"
- DemHomeOwner_n = "Owns home or not"
- Donor_Status_Prev_Camp = "Donated to the last response"
- DemArea2 = "Demographic Area"
- Age = "Age"
- Gender_m = "Male"
- Gender_f = "Female"
- AreaHomeValue = "median home value"
- AreaMedIncome = "median income";
- RUN;
- data dap_1;
- set work.dap_1 (drop = Donor_D);
- run;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement