Subject: Q: comparing a number of data sets Summary: A general macro is given with bells and whistles. Respondent: Ian Whitlock Sung-Il Cho wants to compare IDs in an arbitrary set of data sets. For the data sets A, B, C, and D he suggested >data set combination: AB BC CD AC AD BD ABC ABD ACD BCD Total >---------------------------------------------------------------------- >Number of common IDs >in the combination: 100 75 80 60 85 65 50 30 45 25 150 >---------------------------------------------------------------------- Although responses have been given I have not seen anyone give a response that works for any reasonable number of data sets and a key consisting of any number of variables. I originally wrote the macro to assume a parameter LIB giving the library consisting of all the data sets. This was later modified to allow the user to specify a parameter MEMLIST. If any of the members in memlist contain a libref then LIB is set to null. The basic idea to to sort and merge all members by the KEY subsetting to FIRST.lastkey so that each key is counted only once. Each member has a corresponding IN= variable which is saved. A PROC FREQ converts the data to counts which are then summed to form all relevant intersections. Two reports are produced - a disjoint break down from PROC FREQ and an intersection report. If the above form of report is essential then one can replace the last PROC PRINT with a simple DATA _NULL_ report. Here is the report for the test code given below which generates 3 work data sets. Disjoint Partition Report of Distinct ID Counts In Library WORK. Frequency W1 W2 W3 Count 0 0 1 1 0 1 0 2 0 1 1 1 1 0 0 2 1 1 0 3 1 1 1 1 ========= 10 Intersection Report for Distinct ID In Library WORK. W1 W2 W3 COUNT in 6 in 7 in 3 in in 4 in in 2 in in in 1 T T T 10 Here is the test code. proc datasets lib = work kill mt = data ; run ; data w1 w2 w3 ; id = 6 ; output w1 ; id = 7 ; output w2 ; id = 8 ; output w1 w2 ; id = 9 ; output w1 w2 ; id = 10 ; output w3 w2 ; id = 1 ; output w1 w2 w3 ; id = 2 ; output w1 ; id = 3 ; output w2 ; id = 4 ; output w3 ; id = 5 ; output w1 w2 ; run ; %chkkey() Note the macro does not assume unique identifiers or one variable identifiers. The PROC FORMAT code which follows the macro is part of the module to be executed once when the file is included or "auto called". %macro chkkey ( lib = work , /* input SAS data library */ memlist = , /* members in &lib or complete */ /* data specification when &lib is null */ key = id , /* key variable(s) */ tl = 3 /* line number where mac may put title */ ) ; %local i /* loop index */ nmem /* number of members in &memlist */ memname /* member of given library */ word /* test value in id list */ lastkey /* last member of memlist */ ; %let key = %qupcase ( &key ) ; %if %index ( &memlist , . ) %then %let lib = ; /*-------------------------------------------------------------- create list of members as macro variable memlist --------------------------------------------------------------*/ %if %quote(&memlist) = %then %do ; proc sql noprint ; select memname into : memlist separated by ' ' from dictionary.columns where libname = "%upcase(&lib)" ; %let nmem = &sqlobs ; quit ; %end ; %else %do ; %let nmem = 0 ; %let memname = %qscan ( &memlist , &nmem + 1 , %str( ) ) ; %do %while ( &memname ^= ) ; %let nmem = %eval ( &nmem + 1 ) ; %let memname = %qscan ( &memlist , &nmem + 1 , %str( ) ) ; %end ; %end ; %if &nmem = 0 %then %goto mexit ; /*-------------------------------------------------------------- get data --------------------------------------------------------------*/ %if %quote(&lib) ^= %then %let lib = %qupcase ( &lib.. ) ; %do i = 1 %to &nmem ; /* sort data sets */ %let memname = %qscan ( &memlist , &i , %str( ) ) ; /* note lib is null or contains dot */ proc sort data = &lib&memname ( keep = &key ) out = %if %index ( &memname , . ) %then %qscan ( &memname , 2 ) ; %else &memname ; ; by &key ; run ; /* sort data sets */ %end ; data __source ( keep = mem: ) ; merge %do i = 1 %to &nmem ; %qscan ( &memlist , &i , %str( ) ) ( in = in&i ) %end ; ; by &key ; %let i = 0 ; %do %until ( &i = 0 ) ; /* get last key variable */ %let i = %eval ( &i + 1 ) ; %let word = %qscan ( &key , &i , %str( ) ) ; %if &word ^= %then %let lastkey = &word ; %else %let i = 0 ; %end ; /* get last key variable */ if first.&lastkey ; label %do i = 1 %to &nmem ; mem&i = "%qscan ( &memlist , &i , %str( ) )" %end ; ; %do i = 1 %to &nmem ; mem&i = in&i ; %end ; run ; /*-------------------------------------------------------------- take disjoint partition counts and report them --------------------------------------------------------------*/ proc freq data = __source ; table %do i = 1 %to &nmem - 1 ; mem&i * %end ; mem&i / noprint out = __source ; ; run ; title&tl "Disjoint Partition Report of Distinct &key Counts" ; %if &lib ^= %then %do ; title%eval(&tl+1) "In Library &lib" ; %end ; proc print data = __source label noobs ; sum count ; var mem1 - mem&nmem count ; run ; /*-------------------------------------------------------------- sum disjoint counts to intersections and report them --------------------------------------------------------------*/ data __source ( keep = mem: count n ) ; array mem (&nmem) ; array chk (&nmem) ; if eof then do ; /* totals processing */ do i = 1 to &nmem ; mem ( i ) = . ; end ; n = &nmem ; count = total ; output ; end ; /* totals processing */ set __source ( keep = mem: count rename = ( count = cnt ) ) end = eof ; total + cnt ; n = sum ( of mem(*) ) ; count = 0 ; do ptr = 1 to nobs ; /* sum counts */ set __source ( rename = ( count=freq %do i = 1 %to &nmem ; mem&i = chk&i %end ; ) ) point = ptr nobs = nobs ; okcnt = 0 ; do i = 1 to &nmem ; if mem ( i ) = 1 and chk ( i ) = 0 then leave ; okcnt + ( mem ( i ) = chk ( i ) = 1 ) ; end ; if okcnt = n then count + freq ; end ; /* sum counts */ output ; run ; proc sort data = __source ; by n %do i = 1 %to &nmem ; descending mem&i %end ; ; run ; title&tl "Intersection Report for Distinct &key" ; %if &lib ^= %then %do ; title%eval(&tl+1) "In Library &lib" ; %end ; proc print data = __source label noobs ; var mem1 - mem&nmem count ; format mem1 - mem&nmem mem. ; run ; title&tl ; %mexit: %mend chkkey ; options mprint ; proc format ; value mem 1 = 'in' 0 = ' ' . = 'T' ; run ; Ian Whitlock