pmbrown

simul_data.sas

Nov 10th, 2020
46
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. /******************************************************************************
  2. Author, date: Paul Brown, JUL2015
  3. Macro name:   simul_data, iterat_simul (run simul_data iteratively in order to
  4.               obtain the desired correlations between the outcomes), nearestcorr
  5. Description:  simulate random samples from a multivariate normal distribution,
  6.               transform variables to required types. Create a dataset containing
  7.               num random samples indicated by the variable samp=1,2,...num.  
  8.               Outcomes may be TTE, dichotmous, normal or log-normal (developed in SAS 9.4)
  9. Reference:    Sun et al., Circulation: Heart Failure, 2012
  10.               Austin, Generating survival times, Stat in Med 2012
  11.               http://blogs.sas.com/content/iml/2012/11/28/computing-the-nearest-
  12.                 correlation-matrix.html
  13. ******************************************************************************/
  14.  
  15.  
  16. ********************************************************************;
  17. ***               define macro parameters                       ***;
  18. ********************************************************************;
  19.  
  20. %macro simul_data(n=100,     /*total sample size ie n/2 in each grp*/
  21.              num=1000,       /*no. of random samples*/
  22.              numvars=5,      /*number of variables*/
  23.              tte_fu=30,      /*follow-up time for censoring of TTE endpts, corresponds
  24.                                to the response estimates assumed for TTE outcomes*/
  25.              var1_tp=SURV,   /*variable type: bino=dichotomous, norm=normal,
  26.                                logn=log-normal ie log(post baseline/baseline),
  27.                                or surv=time-to-event (exponential)*/
  28.              var1_hi=0.93,   /*hi=active grp, the base response*/
  29.              var1_df=-0.02,  /*df=treatment difference, defines response for Lo=contrl grp*/
  30.              var1_sd=,       /*if var1_tp=norm or lognorm (sd for log(post/base))*/
  31.              var2_tp=SURV, var2_hi=0.86, var2_df=-0.02, var2_sd=,
  32.              var3_tp=BINO, var3_hi=0.2, var3_df=0.02, var3_sd=,
  33.              var4_tp=NORM, var4_hi=2500, var4_df=-500, var4_sd=2700,  
  34.              var5_tp=LOGN, var5_hi=0.5, var5_df=0.2, var5_sd=0.7,
  35.              /*initial working correlations */  
  36.              corr12=0, corr13=0, corr14=0, corr15=0,
  37.                        corr23=0, corr24=0, corr25=0,
  38.                                  corr34=0, corr35=0,
  39.                                            corr45=0);
  40.  
  41. %let gtype1=%upcase(&var1_tp) ;
  42. %let gtype2=%upcase(&var2_tp) ;
  43. %let gtype3=%upcase(&var3_tp) ;
  44. %let gtype4=%upcase(&var4_tp) ;
  45. %let gtype5=%upcase(&var5_tp) ;
  46.  
  47. ********************************************************************;
  48. ***               convert to normal variates           ***;
  49. ********************************************************************;
  50.  
  51. data _null_;
  52.   %do i=1 %to &numvars;
  53.     %if &&gtype&i=SURV %then %do;
  54.       surv_hi=&&var&i._hi;
  55.       surv_lo=&&var&i._hi+&&var&i._df;
  56.       *--hazards;
  57.       survh_hi=-log(surv_hi)/&tte_fu;
  58.       survh_lo=-log(surv_lo)/&tte_fu;
  59.       *--log hazards;
  60.       survz_hi=log(survh_hi);
  61.       survz_lo=log(survh_lo);
  62.       *--number of deaths;
  63.       survd_hi=(&n/2)*(1-surv_hi);
  64.       survd_lo=(&n/2)*(1-surv_lo);
  65.       *--variance;
  66.       survv_hi=1/survd_hi;
  67.       survv_lo=1/survd_lo;
  68.       call symput("var&i.v_hi",trim(left(put(survv_hi,best.))));
  69.       call symput("var&i.v_lo",trim(left(put(survv_lo,best.))));
  70.       call symput("var&i.z_hi",trim(left(put(survz_hi,best.))));
  71.       call symput("var&i.z_lo",trim(left(put(survz_lo,best.))));
  72.     %end;
  73.     %else %if &&gtype&i=BINO %then %do;
  74.       bino_hi=&&var&i._hi;
  75.       bino_lo=&&var&i._hi+&&var&i._df;
  76.       *--log odds;
  77.       binoz_hi=log(bino_hi/(1-bino_hi));
  78.       binoz_lo=log(bino_lo/(1-bino_lo));
  79.       *--variance of log odds;
  80.       binov_hi=1/((&n/2)*bino_hi*(1-bino_hi));
  81.       binov_lo=1/((&n/2)*bino_lo*(1-bino_lo));
  82.       call symput("var&i.v_hi",trim(left(put(binov_hi,best.))));
  83.       call symput("var&i.v_lo",trim(left(put(binov_lo,best.))));
  84.       call symput("var&i.z_hi",trim(left(put(binoz_hi,best.))));
  85.       call symput("var&i.z_lo",trim(left(put(binoz_lo,best.))));
  86.     %end;
  87.     %else %if &&gtype&i=NORM %then %do;
  88.       normv_hi=&&var&i._sd**2;
  89.       normv_lo=&&var&i._sd**2;
  90.       normz_lo=&&var&i._hi+&&var&i._df;
  91.       normz_hi=&&var&i._hi;
  92.       call symput("var&i.v_hi",trim(left(put(normv_hi,best.))));
  93.       call symput("var&i.v_lo",trim(left(put(normv_lo,best.))));
  94.       call symput("var&i.z_hi",trim(left(put(normz_hi,best.))));
  95.       call symput("var&i.z_lo",trim(left(put(normz_lo,best.))));
  96.     %end;
  97.     %else %if &&gtype&i=LOGN %then %do;
  98.       *--log ratio (post baseline/baseline);
  99.       lognz_lo=log(&&var&i._hi+&&var&i._df);
  100.       lognz_hi=log(&&var&i._hi);
  101.       *--variance of log ratio;
  102.       lognv_hi=&&var&i._sd**2;
  103.       lognv_lo=&&var&i._sd**2;
  104.       call symput("var&i.v_hi",trim(left(put(lognv_hi,best.))));
  105.       call symput("var&i.v_lo",trim(left(put(lognv_lo,best.))));
  106.       call symput("var&i.z_hi",trim(left(put(lognz_hi,best.))));
  107.       call symput("var&i.z_lo",trim(left(put(lognz_lo,best.))));
  108.     %end;
  109.   %end;
  110. run;
  111.  
  112.  
  113. ********************************************************************;
  114. ***          simulate data from multivariate normal         ***;
  115. ********************************************************************;
  116.  
  117. proc iml;
  118. %nearestcorr();
  119.   *specify parameters of multivariate normal dist;
  120.   mean_hi=1:&numvars; varn_hi=1:&numvars;
  121.   mean_lo=1:&numvars; varn_lo=1:&numvars;
  122.   %do i=1 %to &numvars;
  123.     mean_hi[&i]=&&var&i.z_hi;
  124.     varn_hi[&i]=&&var&i.v_hi;
  125.     mean_lo[&i]=&&var&i.z_lo;
  126.     varn_lo[&i]=&&var&i.v_lo;
  127.   %end;
  128.   corr_tmp_={1 &corr12 &corr13 &corr14 &corr15,
  129.         &corr12 1 &corr23 &corr24 &corr25,
  130.         &corr13 &corr23 1 &corr34 &corr35,
  131.         &corr14 &corr24 &corr34 1 &corr45,
  132.         &corr15 &corr25 &corr35 &corr45 1};
  133.   corr_tmp=corr_tmp_[1:&numvars, 1:&numvars]; /*in case numvar < 5*/
  134.   eigval=eigval(corr_tmp);
  135.   if all(eigval>0) then corr=corr_tmp; /*if positive definite*/
  136.   else corr=NearestCorr(corr_tmp); /*otherwise find nearest pos def matrix*/
  137.   print corr;
  138.   covr_hi=corr#sqrt(varn_hi`*varn_hi);
  139.   covr_lo=corr#sqrt(varn_lo`*varn_lo);
  140.   print covr_hi; print covr_lo;
  141.   *obtain random samples (seed=dec2014);
  142.   call randseed(1214);          
  143.   hi=randnormal((&n/2)*&num,mean_hi,covr_hi);
  144.   lo=randnormal((&n/2)*&num,mean_lo,covr_lo);
  145.   *create sas datasets;
  146.   samp=colvec(repeat(T(1:&num),1,&n/2));
  147.   z=samp||hi;
  148.   create rand_hi from z[c={"samp" "x1" "x2" "x3" "x4" "x5"}];
  149.     append from z;
  150.   close rand_hi;
  151.   z=samp||lo;
  152.   create rand_lo from z[c={"samp" "x1" "x2" "x3" "x4" "x5"}];
  153.     append from z;
  154.   close rand_lo;
  155. quit;
  156.  
  157. *combine data for the 2 grps;
  158. data randztmp;
  159.   format trt $10.;
  160.   set rand_hi (in=h) rand_lo (in=l);
  161.   *rename nominal Lo\Hi groups;
  162.   if h then trt='Active';
  163.   else if l then trt='Control';
  164. run;
  165.  
  166. *sort data;
  167. proc sort data=randztmp;
  168.   by samp trt;
  169. run;
  170.  
  171. *finalise data, create patient number;
  172. data randz;
  173.   retain subjno;
  174.   set randztmp;
  175.   by samp trt;
  176.   if first.samp then subjno=1;
  177.   else subjno=subjno+1;
  178. run;
  179.  
  180. ********************************************************************;
  181. ***               transform from normal variates                 ***;
  182. ********************************************************************;
  183.  
  184. data randsamp;
  185.   set randz;
  186.   %do i=1 %to &numvars;
  187.     %if &&gtype&i=SURV %then %do;
  188.       var&i=round(-log(ranuni(1412))/exp(x&i),1)+1;  
  189.       var&i.2=var&i; var&i.c=0;
  190.       if var&i gt &tte_fu then do;
  191.         var&i.2=&tte_fu;
  192.         var&i.c=1;
  193.       end;
  194.       label var&i="Survival endpoint &i"
  195.         var&i.2="Survival endpoint &i with censoring"
  196.         var&i.c="Survival endpoint &i censoring indicator";
  197.     %end;
  198.     %else %if &&gtype&i=BINO %then %do;
  199.       if trt='Active' then do;
  200.         x&i._=(x&i-&&var&i.z_hi)/sqrt(&&var&i.v_hi);
  201.         x&i.__=tinv(1-&&var&i._hi,&n/2-1);
  202.         if x&i._ gt x&i.__ then var&i=1;
  203.         else var&i=0;
  204.       end;
  205.       else if trt='Control' then do;
  206.         x&i._=(x&i-&&var&i.z_lo)/sqrt(&&var&i.v_lo);
  207.         x&i.__=tinv(1-(&&var&i._hi+&&var&i._df),&n/2-1);
  208.         if x&i._ gt x&i.__ then var&i=1;
  209.         else var&i=0;
  210.       end;
  211.       drop x&i._ x&i.__;
  212.       label var&i="Dichotomous endpoint &i";
  213.     %end;
  214.     %else %if &&gtype&i=NORM %then %do;
  215.       var&i=x&i;
  216.       label var&i="Continuous endpoint &i";
  217.     %end;
  218.     %else %if &&gtype&i=LOGN %then %do;
  219.       var&i=100*(exp(x&i)-1);
  220.       label var&i="Pct change endpoint &i";
  221.     %end;
  222.   %end;
  223.   drop x1-x&numvars ;
  224.   label trt='Treatment' samp='Rand sample no.' subjno='Patient no.';
  225. run;
  226.  
  227. proc sort data=randsamp;
  228.   by samp trt;
  229. run;
  230.  
  231. %mend simul_data;
  232.  
  233. ********************************************************************;
  234. ***             iterations to obtain correlations                ***;
  235. ********************************************************************;
  236.  
  237. %macro iterat_simul(n_=100,num_=1000,numvars_=5,criterion=0.025,maxiter=50,
  238.                     aim12=0.1, aim13=-0.06, aim14=0.05, aim15=0, aim23=-0.03,
  239.                     aim24=0, aim25=0, aim34=-0.6, aim35=0, aim45=0,/*desired correlations*/
  240.                     out=finalsamp, out_iter=finaliter); /*output datasets*/
  241.  
  242. %global gnumvars gnum gtype1 gtype2 gtype3 gtype4 gtype5 gprec ;
  243.  
  244. %let gnumvars=&numvars_;
  245. %let gnum=&num_;
  246. %let t0 = %sysfunc(datetime());
  247.  
  248. data iterations;
  249.   set _null_;
  250. run;
  251.  
  252. %simul_data(n=&n_,num=&num_,numvars=&numvars_);
  253.  
  254. %let gprec=1;
  255. %let iterat=0;
  256. %do %while (&gprec gt &criterion and &iterat lt &maxiter);
  257.                    /*prec (precision) = max difference between actual
  258.                                    correlation and the desired value*/
  259.  
  260. %let iterat=%eval(&iterat+1);
  261.  
  262. ods output pearsonCorr=corrns1
  263.            (keep=variable var:
  264.             rename=(var1-var&numvars_ = varc1-varc&numvars_));
  265. proc corr data=randsamp pearson; /*pearson used even for binary outcomes, produces the same corr
  266.                                    as the biserial point correlation*/
  267.   var var1-var&numvars_; /*careful to exclude var[]2 (survival vars)*/
  268. run;
  269.  
  270. proc sort data=corrns1;
  271.   by variable;
  272. run;
  273.  
  274. data randzc;
  275.   set randz;
  276.   rename x1-x&numvars_ = var1-var&numvars_;
  277. run;
  278.  
  279. ods output pearsonCorr=corrns2
  280.            (keep=variable var:
  281.             rename=(var1-var&numvars_ = varcz1-varcz&numvars_));
  282. proc corr data=randzc pearson;
  283.   var var:; /*across random samples*/
  284. run;
  285.  
  286. proc sort data=corrns2;
  287.   by variable;
  288. run;
  289.  
  290. data corrns;
  291.   merge corrns1 corrns2;
  292.   by variable;
  293. run;
  294.  
  295. data iterations;
  296.   format precision 7.5 ;
  297.   retain prectmp 0;
  298.   set iterations corrns (in=a);
  299.   conv=0.005; /*conv=convergence limit*/
  300.   if a then do;
  301.     iteration=&iterat;
  302.     %do p=1 %to 5;
  303.       %do b=1 %to 5;
  304.         if variable="var&p" then do;
  305.           %if &b gt &p and &b le &numvars_ %then %do; /*corrns above diagonal*/
  306.             diff&p&b=abs(varc&b-&&aim&p&b);
  307.             prectmp=max(prectmp,diff&p&b);
  308.             if diff&p&b lt conv then c&p&b=varcz&b; /*desired corrn attained*/
  309.             else do; /*desired value not attained, thus adjust*/
  310.               if &&aim&p&b gt 0 then c&p&b=min((&&aim&p&b/varc&b)*varcz&b,1); /*+ve corrns*/
  311.               else c&p&b=max((&&aim&p&b/varc&b)*varcz&b,-1); /*-ve corrns*/
  312.             end;
  313.             call symput("c&p&b",trim(left(put(c&p&b,best.)))); /*corrn assigned its new value*/
  314.           %end;
  315.         end;
  316.           %if &b gt &p and (&p gt &numvars_ or &b gt &numvars_) %then %do;
  317.             call symput("c&p&b",trim(left(put(0,best.)))); /*superfluous correlations*/
  318.           %end;
  319.       %end;
  320.     %end;
  321.     if variable="var&numvars_" then call symput("gprec",trim(left(put(prectmp,7.5))));
  322.     precision=prectmp; /*this precision will be compared against the criterion and the loop
  323.                       terminated if satisfied*/
  324.   end;
  325. run;
  326.  
  327. %simul_data(n=&n_,num=&num_,numvars=&numvars_,
  328.              corr12=&c12, corr13=&c13, corr14=&c14, corr15=&c15,
  329.                           corr23=&c23, corr24=&c24, corr25=&c25,
  330.                                        corr34=&c34, corr35=&c35,
  331.                                                     corr45=&c45);
  332. %end;
  333.  
  334. %let speed=%sysfunc(datetime())-&t0;
  335.  
  336. data &out;
  337.   set randsamp;
  338. run;
  339.  
  340. data &out_iter;
  341.   retain iteration variable varc1-varc&numvars_ precision speed ; /*put vars in order*/
  342.   format varc1-varc&numvars_ 5.2;
  343.   set iterations;
  344.   speed=put(&speed,6.2)||' secs';
  345.   keep iteration variable varc1-varc&numvars_ precision speed;
  346. run;
  347.  
  348. proc sort data=&out_iter;
  349.   by iteration variable;
  350. run;
  351.  
  352. %mend iterat_simul;
  353.  
  354.  
  355. *** end *****************************************;
  356.  
RAW Paste Data