/*macro to compare whole sample and analytical samples http://www.estat.us/sas/samplecheck.txt*/ /*Documentation is not so great, so please ask me. kuekawa @ alumni.uchicago.edu . I wrote this program to compare an analytical sample versus the whole sample--for a paper that I've been doing with charles Bidwell. Often in research our analytical sample gets small because of a pattern of missing cases. We need to compare our analytical sample and the original sample, to compare means and standard deviation. Typically in education resaerch students from an analytical sample tends to be smarter and less at risk AND variances tend to be smaller. We need to show these things and talk about them. I created a hypothetical data (which came from TIMSS data). I took out some values, so it contains missing values. We are interested in the following two samples: Group 1 The data that doesn't have missing values on MAT7 and MAT8 (7th grade and 8th grade acheievment scores) VERSUS Group 2 The data that did not go into the sample above. You have to create a flag variable (coded as 0 or 10) in the follwoing way to indicate which observations belong to which group. You will get two tables. This is for the whole sample to show general information. Whole_samplemean --> Mean Whole_samplestdev-->Standard Deviation Whole_samplemin --> Minimum Whole_samplemax --> Maximum Whole_sampleN --> N whole sample 23:38 Sunday, May 22, 2005 25 WHOLE_ WHOLE_ WHOLE_ WHOLE_ WHOLE_ Obs variable SAMPLEmean SAMPLEstdev SAMPLEmin SAMPLEmax SAMPLEN 1 GNP14 -0.25 0.14 -0.42 0.07 9 2 PROP 91.50 12.21 62.00 100.00 10 The second table compares two samples. Sample1_0 is the data set that suffered from missing cases on a selected list of variables (MAT7 and MAT8 here), while Sample_1 is the data set that has no missing cases on those variables. TEST1 is a t-test if a variable is a continuos variable. It uses qui-sqaure test if a variable is a dummy variable. Variance Test is a test of variance equality. Whe P-value small, it means variances from the two samples are not the same. sample1 23:38 Sunday, May 22, 2005 26 sample1_ sample1_ sample1_ sample1_ sample1variance Obs variable 0mean 1mean TEST1 0stdev 1stdev Test 1 GNP14 -0.25 -0.25 0.99 0.08 0.19 0.16 2 PROP 86.50 94.83 0.43 17.99 6.40 0.05 */ data kaz; /*example data set*/ input acro \$ NATION \$ 6-14 NAME \$ 15-33 MAT7 MAT8 GNP14 PROP NATEXAM NATSYLB NATTEXT block \$; cards; aus Australi Australia 498 . -0.15526 84 0 1 0 ocea aut Austria Austria 509 539.43 -0.29163 100 0 0 1 weuro bfl Belgi_FL Belgium (Fl) . 565.18 -0.25157 100 . 1 0 weuro bfr Belgi_FR Belgium (Fr) 507 526.26 -0.25157 100 0 . 0 weuro can Canada Canada 494 527.24 0.07184 88 0 0 0 namer col Colombia Colombia 369 . -0.23699 62 0 . 0 samer cyp Cyprus Cyprus 446 473.59 -0.41906 95 0 1 . seuro csk Czech Czech Republic 523 563.75 -0.34840 86 0 1 0 eeuro dnk Denmark Denmark . 502.29 -0.34057 100 1 0 0 weuro fra France France 492 537.83 . 100 0 1 0 weuro ;run; /*how to create sample indicators*/ data kaz; set kaz; sample1=0; XXX=nmiss(of MAT7 MAT8 ); if XXX=0 then sample1=1; run; /*data name here. It can also take a form of here.kaz with a library name*/ %let dataname=kaz; %let vars=GNP14 prop; %let analyticalsample1=sample1; %let excel=C:\temp\sample.xls; /*************************************************************************************/ /*************************************************************************************/ /*************************************************************************************/ /*************************************************************************************/ ods listing close; proc means data=&dataname mean std min max n stderr; var &vars; ods output summary=whole; run; data whole;set whole; length ID \$ 30; ID="WHOLE SAMPLE"; run; /*GET descriptive statistics for subgroups within data*/ /*In this case, boys and girls' samples*/ /*Note the variable "valid2" that I used at where-line*/ /*In this way I pick cases that have no missing values*/ /*See how valid2 is constructed above*/ %macro jones (var1=, and=); proc means data=&dataname mean std min max n stderr; class &var1; var &vars ; ods output summary=&var1; run; data &var1;set &var1; length ID \$ 30; y="&var1._"; ID=compress(y||&var1); drop &var1; run; /*begin for statistical test*//*begin for statistical test*//*begin for statistical test*/ ods listing close; proc ttest data=&dataname alpha=.05; class &var1; var &vars ; ods output equality=EQ TTests=T; run; data EQ2; length variable \$ 20; set EQ; keep variable variancetest ; VarianceTest=ProbF; run; proc sort;by variable;run; data POOLED; length variable \$ 20; set T; if Method="Pooled"; keep variable T_equalV originalseq; T_equalV=ProbT; originalseq=_n_; run; proc sort;by variable;run; data Satter; length variable \$ 20; set T; if Method="Satterthwaite"; keep variable T_unequalV; T_unequalV=ProbT; run; proc sort;by variable;run; data TtestResult;merge EQ2 pooled Satter; by variable; keep Variable VarianceTest Ttest originalseq; /*If variance is equal use pooled. if not use Satterthwaite*/ Ttest=T_equalV; if VarianceTest < .05 then Ttest=T_unequalV; run; proc sort;by variable;run; ods listing close; proc freq data=&dataname ; tables &var1*(&vars)/CHISQ; ods output ChiSq =KISQ; run; ods listing; data KISQ2; length variable \$ 20; set KISQ; keep variable ChiTest; jun=indexc(table,"*")+2; variable=substr(table,jun,20); if statistic="Chi-Square"; Chitest=prob; run; proc sort;by variable;run; data Ttest&var1; merge TtestResult KISQ2; by variable; keep variable &var1.Chitest &var1.Ttest &var1.varianceTest originalseq ; &var1.Chitest=Chitest; &var1.Ttest=Ttest; &var1.varianceTest=varianceTest; run; proc sort;by variable;run; ods listing; /*end*//*end*//*end*//*end*//*end*//*end*/ %mend jones; %jones (var1=&analyticalsample1); /*merging t test result*/ data Ttestboth; set Ttest&analyticalsample1 ; by variable; run; proc sort;by variable;run; /*Merging the two result data sets*/ data allthese; set whole &analyticalsample1 ; run; /*Transpose to get the data into a shape*/ proc transpose data=allthese out=alltheseT; id ID; run; /*Lots of manipulation to put the data into a shape that is ready to go into an excel sheet*/ data alltheseT2;set alltheseT; x=_name_; group=tranwrd(x,"_Mean",""); group=tranwrd(group,"_StdDev",""); group=tranwrd(group,"_Min",""); group=tranwrd(group,"_N",""); group=tranwrd(group,"_Max",""); group=tranwrd(group,"_StdErr",""); run; /*I do proc print once in a while to see how the data looks like*/ /*Sometimes I have to do a complex way of PROC TRANSPOSE*/ /*I don't enjoy this sort of programming*/ %macro john (var1=); proc transpose data=alltheseT2 out=&var1; by group notsorted; id _label_; var &var1; run; data &var1; length variable \$ 20; set &var1; keep group variable &var1.mean /*&var1.std*/ &var1.max &var1.min &var1.stdev &var1.N &var1.std_error ; if minimum=0 and maximum=1 then do; x=(mean*(1-mean)); Std_Error=sqrt(x/(N) ); end; drop x; &var1.mean=mean; &var1.std_error=Std_Error; &var1.max =maximum; &var1.min = minimum; &var1.stdev=std_dev; &var1.N=N; variable=group; run; proc sort;by variable;run; %mend john; %john (var1=WHOLE_SAMPLE); %john (var1=&analyticalsample1._0); %john (var1=&analyticalsample1._1); data descriptivestat; merge whole_sample &analyticalsample1._0 &analyticalsample1._1 Ttestboth ; *by variable; by variable; if variable ne "NObs"; *drop group; TEST1=&analyticalsample1.Ttest; if WHOLE_SAMPLEmin=0 and WHOLE_SAMPLEmax=1 then do; TEST1=&analyticalsample1.Chitest; end; run; proc sort;by originalseq;run; /* data D1;set descriptivestat; keep WHOLE_SAMPLEN sample1_0N sample1_1N sample2_0N sample2_1N; run; */ proc transpose data=descriptivestat out=Dt1; var WHOLE_SAMPLEN &analyticalsample1._0N &analyticalsample1._1N ; id group; run; data DT2;set DT1; max1=max (of &vars); min1=min (of &vars); STD1=std( of &vars); d="N="; r="-"; length range \$ 30; if STD1=0 then do; range=compress(d||max1); end; if STD1 ne 0 then do; range=compress(d|| min1 || r||max1); end; run; proc transpose data=DT2 out=DT3; var range; id _name_; run; /*************************************************************************************/ ods listing; ods html file="&excel"; proc print data=DT3; title "Number of observations by samples"; run; proc print data=descriptivestat round; title "whole sample"; var variable WHOLE_SAMPLEmean WHOLE_SAMPLEstdev WHOLE_SAMPLEmin WHOLE_SAMPLEmax WHOLE_SAMPLEn; run; proc print data=descriptivestat round; title "&analyticalsample1"; var variable &analyticalsample1._0mean &analyticalsample1._1mean TEST1 &analyticalsample1._0stdev &analyticalsample1._1stdev &analyticalsample1.varianceTest; run; run; ods html close; /* PROC EXPORT DATA= work.descriptivestat OUTFILE= "&excel" DBMS=EXCEL2000 REPLACE; RUN;*/