/**********************************************************************/
/***************** LOGISTIC REGRESSION IMPUTATION MACRO ***************/
/**********************************************************************/
/*                                                                    */
/* USAGE: %logitimpute(datain,dataout,y,yout,x)                       */
/*   where: datain  = input sas dataset                               */
/*          dataout = output sas dataset (can be the same as datain)  */
/*          y       = name of variable in datain to be imputed under  */
/*                    a logistic regression model                     */
/*          yout    = name of variable in dataout where LOGITIMPUTE   */
/*                    writes the imputed values of y (can be y)       */
/*          x       = names of variables in datain to use to          */
/*                    condition the imputation regression             */
/* ------------------------------------------------------------------ */
/* RETURNS: A vector of imputed values &yout in an output data set    */
/*          called &dataout. The imputed values are draws from the    */
/*          posterior predictive distribution implied by the          */
/*          regression model under an uninformative prior             */
/* ------------------------------------------------------------------ */
/* NOTES: Unlike the masking macros, LOGITIMPUTE *creates* the output */
/*        data set &dataout. It is a copy of &datain with a new       */
/*        variable called &yout, which contains the imputed values    */
/*        of &y. To fill the missing values of &y with imputed        */
/*        values just set &y=&yout (pass the same name for both).     */
/*        Similarly, to replace &datain with the completed data,      */
/*        set &datain=&dataout.                                       */
/*                                                                    */
/*        &yout is only written on rows where &y is missing (.).      */
/*        On rows where &y is observed, &yout keeps whatever value    */
/*        it has in &datain (missing if &yout is a new variable).     */
/*                                                                    */
/*        The conditioning variables x must not contain missing       */
/*        values on the rows being imputed; the macro prints an       */
/*        error message and aborts PROC IML if they do.               */
/*                                                                    */
/*        &datain is read but not modified (unless it is also named   */
/*        as &dataout). Scratch data sets WORK._li_complete,          */
/*        WORK._li_incomplete and WORK._li_results are created and    */
/*        deleted again; the helper variable imputeindex is dropped   */
/*        from &dataout.                                              */
/**********************************************************************/
/* written by: Simon D. Woodcock
               CISER, Dept. of Economics
               Cornell University
               201 Caldwell Hall
               Ithaca, NY 14850
               sdw9@cornell.edu                                       */

%MACRO logitimpute(datain,dataout,y,yout,x);

    /* Split the input into complete and incomplete observations in a
       single pass, tagging each row with its original position so the
       completed data can be reassembled in the original order.
       The input data set itself is left untouched. */
    data _li_complete _li_incomplete;
        set &datain;
        imputeindex = _n_;
        if &y = . then do;
            /* Make sure the target column exists in the data set that
               PROC IML edits below, so REPLACE has a variable to write
               to. Every value set here is overwritten by the imputed
               draw; when yout is the same variable as y this is a
               no-op because y is already missing on these rows. */
            &yout = .;
            output _li_incomplete;
        end;
        else output _li_complete;
    run;

    /* Fit the logistic regression on the complete cases. OUTEST= with
       COVOUT stores the parameter estimates in the first observation
       and the estimated covariance matrix in the observations that
       follow it. */
    proc logistic data=_li_complete outest=_li_results covout descending;
        model &y = &x;
    run;

    /* Generate imputed values as draws from the normal approximation
       to the posterior predictive distribution under an uninformative
       prior. */
    proc iml;
        edit _li_incomplete;
        read all var {&x} into slopes;
        nmiss = nrow(slopes);
        xmat = j(nmiss,1,1) || slopes;       /* prepend intercept column */
        nvar = ncol(xmat);

        do i = 1 to nmiss;                   /* verify x is complete */
            do k = 1 to nvar;
                if xmat[i,k] = . then do;
                    print "ERROR: the conditioning matrix x cannot contain missing values";
                    abort;
                end;
            end;
        end;

        use _li_results;
        read var {Intercept &x} into beta;   /* current (first) obs: estimates */
        beta = beta`;
        select = 2:nvar+1;                   /* covariance rows follow the estimates */
        read point select var {Intercept &x} into covb;

        /* ROOT returns the upper-triangular factor T with T`*T = covb.
           A draw with covariance covb therefore needs the lower factor
           T` applied to the standard normal vector: Cov(T`*z) = T`*T. */
        T = root(covb);
        z = j(nvar,1,0);                     /* pre-allocate vector z */
        do i = 1 to nvar;                    /* draw nvar normal deviates */
            z[i,1] = rannor(0);
        end;
        betastar = beta + T`*z;              /* perturb beta */

        ii = j(nmiss,1,1);
        cdf = ii/(ii+exp(-xmat*betastar));   /* predicted probabilities */

        u = j(nmiss,1,0);                    /* pre-allocate vector u */
        do i = 1 to nmiss;                   /* draw nmiss uniform deviates */
            u[i,1] = ranuni(0);
        end;
        yimputed = (u <= cdf);               /* draw imputed y (0/1) */

        setout _li_incomplete;
        &yout = yimputed;
        replace all var{&yout};              /* fill imputed values */
    quit;

    /* Reassemble the completed data in the original row order. Both
       pieces are already ordered by imputeindex, so interleaving with
       BY restores the input order. */
    data &dataout (drop = imputeindex);
        set _li_complete _li_incomplete;
        by imputeindex;
    run;

    /* Clean up only the scratch data sets this macro created. */
    proc datasets library = work nolist;
        delete _li_complete _li_incomplete _li_results;
    run;
    quit;

%MEND logitimpute;