Advertisement
Guest User

Untitled

a guest
Nov 20th, 2017
197
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
SAS 9.64 KB | None | 0 0
  1. /* Macro function for Simply Data Preparation. It generates a PROC SQL steps.
  2. The parameter in the first position is InputDataset, which represents the SAS input dataset for this function (Train or Test).
  3. The parameter in the second position is OutputDataset, which represents the SAS output dataset (result of data preparation).
  4. The third parameter is main_vars. This variable used for define ID with or without response variable (DEF in this case)*/
  5.  
  6. %macro Simply_Data_Preparation(InputDataset, OutputDataset, main_vars);
  7.     proc sql;
  8.         create table SetWithFlags as
  9.             select &&&main_vars., max(AMT_CREDIT_SUM) as MAX_AMT_CREDIT, min(AMT_CREDIT_SUM) as MIN_AMT_CREDIT,
  10.             max(CREDIT_ACTIVE) as CREDIT_ACTIVE, max(CREDIT_COLLATERAL) as CREDIT_COLLATERAL,
  11.             (case
  12.                 when CREDIT_CURRENCY in ("usd", "eur", "chf", "jpy") then 1
  13.                 else 0 end
  14.             ) as FLAG_CREDIT_CURRENCY_FOREIGN,     
  15.             (case
  16.                 when CREDIT_TYPE in (0, 1, 2, 3, 4) then 1
  17.                 when CREDIT_TYPE in (5, 6, 7, 8, 9, 10, 11, 12, 13, 14) then 2
  18.                 when CREDIT_TYPE in (15, 16, 17, 18, 19, 90, 99) then 3 end
  19.             ) as FLAG_CREDIT_TYPE,
  20.             (case
  21.                 when substr((TEXT_PAYMENT_DISCIPLINE), 1,1) in ("0", "C", "X" ) then 1
  22.                 when substr((TEXT_PAYMENT_DISCIPLINE), 1,1) in ("1", "2", "3") then 2
  23.                 when substr((TEXT_PAYMENT_DISCIPLINE), 1,1) in ("4", "5") then 3
  24.                 end
  25.             ) as FLAG_TEXT,
  26.             max(CREDIT_DAY_OVERDUE) as CREDIT_DAY_OVERDUE, max(AMT_CREDIT_MAX_OVERDUE) as AMT_CREDIT_MAX_OVERDUE,
  27.             max(CNT_CREDIT_PROLONG) as CNT_CREDIT_PROLONG, max(AMT_CREDIT_SUM_DEBT) as AMT_CREDIT_SUM_DEBT,
  28.             max(AMT_CREDIT_SUM_LIMIT) as AMT_CREDIT_SUM_LIMIT, max(AMT_CREDIT_SUM_OVERDUE) as AMT_CREDIT_SUM_OVERDUE,
  29.             max(CREDIT_DELAY30) as CREDIT_DELAY30, max(CREDIT_DELAY5)as CREDIT_DELAY5, max(CREDIT_DELAY60) as CREDIT_DELAY60,
  30.             max(CREDIT_DELAY90) as CREDIT_DELAY90, max(CREDIT_DELAY_MORE) as CREDIT_DELAY_MORE, max(AMT_REQ_SOURCE_HOUR) as AMT_REQ_SOURCE_HOUR,
  31.             max(AMT_REQ_SOURCE_DAY) as AMT_REQ_SOURCE_DAY, max(AMT_REQ_SOURCE_WEEK) as AMT_REQ_SOURCE_WEEK, max(AMT_REQ_SOURCE_MON) as AMT_REQ_SOURCE_MON,
  32.             max(AMT_REQ_SOURCE_QRT) as AMT_REQ_SOURCE_QRT, max(AMT_REQ_SOURCE_YEAR) as AMT_REQ_SOURCE_YEAR, max(AMT_ANNUITY) as AMT_ANNUITY
  33.             from &InputDataset.
  34.             group by ID;
  35.     quit;
  36.  
  37.     proc sql;
  38.         create table &OutputDataset. as
  39.         select &&&main_vars., max(FLAG_CREDIT_CURRENCY_FOREIGN) as FLAG_CREDIT_CURRENCY_FOREIGN, max(FLAG_CREDIT_TYPE) as FLAG_CREDIT_TYPE,
  40.         max(FLAG_TEXT) as FLAG_TEXT, CREDIT_DAY_OVERDUE, AMT_CREDIT_MAX_OVERDUE, CNT_CREDIT_PROLONG, AMT_CREDIT_SUM_DEBT,
  41.         AMT_CREDIT_SUM_LIMIT, AMT_CREDIT_SUM_OVERDUE, CREDIT_DELAY30, CREDIT_DELAY5, CREDIT_DELAY60, CREDIT_DELAY90, CREDIT_DELAY_MORE,
  42.         AMT_REQ_SOURCE_HOUR, AMT_REQ_SOURCE_DAY , AMT_REQ_SOURCE_WEEK, AMT_REQ_SOURCE_MON, AMT_REQ_SOURCE_QRT, AMT_REQ_SOURCE_YEAR, AMT_ANNUITY
  43.  
  44.         from SetWithFlags
  45.         group by &&&main_vars., CREDIT_DAY_OVERDUE, AMT_CREDIT_MAX_OVERDUE, CNT_CREDIT_PROLONG, AMT_CREDIT_SUM_DEBT,
  46.         AMT_CREDIT_SUM_LIMIT, AMT_CREDIT_SUM_OVERDUE, CREDIT_DELAY30, CREDIT_DELAY5, CREDIT_DELAY60, CREDIT_DELAY90,
  47.         CREDIT_DELAY_MORE, AMT_REQ_SOURCE_HOUR, AMT_REQ_SOURCE_DAY , AMT_REQ_SOURCE_WEEK, AMT_REQ_SOURCE_MON,
  48.         AMT_REQ_SOURCE_QRT, AMT_REQ_SOURCE_YEAR, AMT_ANNUITY;
  49.     quit;
  50.  
  51.     proc sql;
  52.         drop table SetWithFlags;
  53.     quit;
  54.    
  55.     %replacement_missing_values(&OutputDataset., AMT_CREDIT_MAX_OVERDUE);
  56.     %replacement_missing_values(&OutputDataset., AMT_CREDIT_SUM_DEBT);
  57.     %replacement_missing_values(&OutputDataset., AMT_CREDIT_SUM_LIMIT);
  58.     %replacement_missing_values(&OutputDataset., AMT_ANNUITY);
  59.  
  60. %mend Simply_Data_Preparation;
  61.  
  62. /* Macro function for replacement missing values with 0. It generates a PROC SQL steps.
  63. The first parameter of this macro is Dataset, it represents the SAS dataset in which  
  64. the second parameter of this macro (value) is replaced.
  65. */
  66.  
  67. %macro replacement_missing_values(Dataset, value);
  68.     data &Dataset.;
  69.     set &Dataset.;
  70.     if &value. eq '.' then &value. = 0;
  71.     run;
  72. %mend replacement_missing_values;
  73.  
  74. /* Load Train Data*/
  75. proc import datafile='D:\hc_final\train.csv'
  76.             out=TrainHomecredit
  77.             dbms=dlm
  78.             replace;
  79.             datarow = 2;
  80.             delimiter = ',';
  81.             getnames = yes;
  82. run;
  83.  
  84. /* Load Test Data*/
  85. proc import datafile='D:\hc_final\test.csv'
  86.             out=TestHomecredit
  87.             dbms=dlm
  88.             replace;
  89.             datarow = 2;
  90.             delimiter = ',';
  91.             getnames = yes;
  92. run;
  93.  
  94. /* Simply Data Preparation*/
  95.  
  96. /* Macro variables are used to store and manipulate character strings*/
  97. %let main_variables_train = ID, DEF;
  98. %let macro_main_variables_train = main_variables_train;
  99. %let main_variable_test = ID;
  100. %let macro_main_variable_test = main_variable_test;
  101.  
  102. /* Run macro function*/
  103. %Simply_Data_Preparation(TrainHomecredit,TrainResult, &macro_main_variables_train);
  104. %Simply_Data_Preparation(TestHomecredit,TestResult, &macro_main_variable_test);
  105.  
  106. /* Sort the data by the target in preparation for stratified sampling*/
  107. proc sort data=TrainResult out=TrainResult;
  108.    by DEF;
  109. run;
  110.  
  111. /* The SURVEYSELECT procedure will perform stratified sampling on
  112. any variable in the STRATA  statement. The OUTALL option  
  113. specifies that you want a flag appended to the file to indicate
  114. selected records, not simply a file comprised of the selected  
  115. records.*/
  116. proc surveyselect noprint ranuni
  117.                   data = TrainResult
  118.                   samprate=.6667
  119.                   out=TrainResult
  120.                   seed=44444
  121.                   outall;
  122.     strata DEF;
  123. run;
  124.  
  125. /* Verify stratification */
  126. proc freq data = TrainResult;
  127.   tables DEF*selected;
  128. run;
  129.  
  130. /* Create training and validation data sets */
  131. data Train Valid;
  132.    set TrainResult;
  133.    if selected then output Train;
  134.    else output Valid;
  135. run;
  136.  
  137. /* Macro variable for all avaliable variables*/
  138. %let all_variables =  FLAG_CREDIT_CURRENCY_FOREIGN FLAG_CREDIT_TYPE FLAG_TEXT CREDIT_DAY_OVERDUE
  139. AMT_CREDIT_MAX_OVERDUE CNT_CREDIT_PROLONG AMT_CREDIT_SUM_DEBT AMT_CREDIT_SUM_LIMIT AMT_CREDIT_SUM_OVERDUE
  140. CREDIT_DELAY30 CREDIT_DELAY5 CREDIT_DELAY60 CREDIT_DELAY90 CREDIT_DELAY_MORE AMT_REQ_SOURCE_HOUR
  141. AMT_REQ_SOURCE_DAY AMT_REQ_SOURCE_WEEK AMT_REQ_SOURCE_MON AMT_REQ_SOURCE_QRT AMT_REQ_SOURCE_YEAR AMT_ANNUITY;
  142.  
  143. /*ods html close;*/
  144. ods output spearmancorr=spearman
  145.            hoeffdingcorr=hoeffding;
  146.  
  147. proc corr data=train
  148.     spearman
  149.     hoeffding
  150.     rank;
  151.     var &all_variables;
  152.     with DEF;
  153. run;
  154. /*ods html;*/
  155.  
  156. /* Macro variable for all reduced variables*/
  157. %let reduced_vars = AMT_CREDIT_MAX_OVERDUE CREDIT_DELAY30 CREDIT_DELAY5 AMT_CREDIT_SUM_DEBT AMT_REQ_SOURCE_YEAR AMT_CREDIT_SUM_LIMIT AMT_ANNUITY CREDIT_DELAY60 FLAG_CREDIT_TYPE FLAG_TEXT AMT_CREDIT_SUM_OVERDUE CREDIT_DAY_OVERDUE CREDIT_DELAY_MORE;
  158.  
  159. /* STEPWISE selection */
  160. proc logistic data=train;
  161.     STEPWISE:
  162.     model DEF (event='1')= &reduced_vars  
  163.     / selection=stepwise;
  164. run;
  165.  
  166. /* FORWARD selection */
  167. proc logistic data=train;
  168.     FORWARD:
  169.     model DEF (event='1')=&reduced_vars
  170.     / selection=forward;
  171. run;
  172.  
  173. /* BACKWARD selection */
  174. proc logistic data=train;
  175.     BACKWARD:
  176.     model DEF (event='1')=&reduced_vars
  177.     / selection=backward;
  178. run;
  179.  
  180. /* Macro variable for logistic regressions*/
  181. %let model_back = CREDIT_DAY_OVERDUE AMT_CREDIT_MAX_OVERDUE AMT_ANNUITY CREDIT_DELAY60 AMT_CREDIT_SUM_OVERDUE AMT_CREDIT_SUM_DEBT;
  182. %let model_stepwise = FLAG_TEXT FLAG_CREDIT_TYPE CREDIT_DELAY30 AMT_REQ_SOURCE_YEAR CREDIT_DELAY_MORE AMT_CREDIT_SUM_LIMIT CREDIT_DELAY5;
  183. %let model_forward = FLAG_TEXT FLAG_CREDIT_TYPE CREDIT_DELAY30 AMT_REQ_SOURCE_YEAR CREDIT_DELAY_MORE AMT_CREDIT_SUM_LIMIT CREDIT_DELAY5;
  184.  
  185.  
  186. /*Get results for Train Data by forward regression*/
  187. proc logistic data=train noprint;
  188.    model DEF(event='1')= &model_forward;
  189.    score data=train out=train_forward_out(rename=(p_1= p_mod_Forward));        
  190. run;
  191.  
  192. /*Get results for Train Data by backward regression*/
  193. proc logistic data=train noprint;
  194.    model DEF(event='1')=&model_back;
  195.    score data=train_forward_out out=train_forward_and_back_out(rename=(p_1=p_mod_Back));        
  196. run;
  197.  
  198. /*Get Statistic Information for Train Data*/
  199. ods select ROCOverlay ROCAssociation ROCContrastTest;
  200. proc logistic data=train_forward_and_back_out;
  201.    model DEF(event='1')=p_mod_Forward  p_mod_Back/ nofit;
  202.    roc "Forward Model" p_mod_Forward;
  203.    roc "Backward Model" p_mod_Back;
  204.    roccontrast "Comparing the Two Models";
  205.    title "Validation Data Set Performance TRAIN";
  206. run;
  207.  
  208. /*Get results for Validation Data by forward regression*/
  209. proc logistic data=train noprint;
  210.    model DEF(event='1')=&model_forward;
  211.    score data=valid out=valid_forward_out(rename=(p_1=p_mod_Forward ));        
  212. run;
  213.  
  214. /*Get results for Validation Data by backward regression*/
  215. proc logistic data=train noprint;
  216.    model DEF(event='1')=&model_back;
  217.    score data=valid_forward_out out=valid_forward_and_back_out(rename=(p_1=p_mod_Back));        
  218. run;
  219.  
  220. /*Get Statistic Information for Valid Data*/
  221. ods select ROCOverlay ROCAssociation ROCContrastTest;
  222. proc logistic data=valid_forward_and_back_out;
  223.    model DEF(event='1')=p_mod_Forward p_mod_Back/ nofit;
  224.    roc "Forward Model" p_mod_Forward;
  225.    roc "Backward Model" p_mod_Back;
  226.    roccontrast "Comparing the Two Models";
  227.    title "Validation Data Set Performance VALID";
  228. run;
  229.  
  230. /*Get predict information for Test Data, where p_1 - probability of default*/
  231. proc logistic data=train noprint;
  232.    model DEF(event='1')= &model_forward;
  233.    score data=TestResult out=Result(rename=(p_1= Score) keep=id p_1);        
  234. run;
  235.  
  236. %replacement_missing_values(Result, Score);
  237.  
  238. /* Export Result Data Set to CSV File*/
  239. proc export data=Result
  240.      outfile="D:\hc_final\Result.csv"
  241.      dbms=csv
  242.      replace;
  243. run;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement