[green]/************************************************************
SAMPLE DATA
************************************************************/
[/green]
* sample data for hand checking;
data tree;
input customer_no customer_xref;
datalines;
1 2
10 5
1 3
3 5
4 7
5 6
12 2
7 8
13 4
14 13
17 26
;
run;
* large sample data (1M records) to check performance;
data tree2;
do customer_no = 1 to 1000000;
customer_xref = floor(5000000 * ranuni(10));
if (customer_no ne customer_xref) then output;
end;
;
run;
* create view so that input dataset name and variable names
are the same - easy to which between input datasets. I
dont have to change the algorithm when i change input
data;
proc sql;
create view v_tree as
select customer_no as n1, customer_xref as n2
from tree2;
quit;
[green]/************************************************************
THE ALGORITHM
************************************************************/
[/green]
* implement disjoint sets ADT and output results;
* nodes (customers) that have the same representative belong
to the same group (cross referenced);
data output (keep = n representative depth);
* hash table to store representatives for disjoint sets;
declare hash h_rep(hashexp: 16);
h_rep.defineKey('node');
h_rep.defineData('node','representative');
h_rep.defineDone();
call missing(node, representative);
* step 1: populate hash table, make each nodes its own set;
put "Step 1: Loading reference data into hash table...processing";
do until (eof);
set v_tree end=eof;
if (h_rep.check(key: n1) ne 0) then h_rep.add(key: n1, data: n1, data: n1);
if (h_rep.check(key: n2) ne 0) then h_rep.add(key: n2, data: n2, data: n2);
end;
* step 2: make nodes equievalent;
put "Step 2: Searching for equivalences...processing";
do until (eof2);
set v_tree end=eof2;
record + 1;
if mod(record, 10000) = 0 then put "...completed " record "records";
* find representative (root node) of node 1;
rep1 = n1;
do while (1=1);
h_rep.find(key:rep1);
if (rep1 = representative) then leave;
rep1 = representative;
end;
* find representative (root node) of node 2;
rep2 = n2;
do while (1=1);
h_rep.find(key:rep2);
if (rep2 = representative) then leave;
rep2 = representative;
end;
* make root nodes equivalent;
if (rep1 ne rep2) then do;
h_rep.replace(key: rep2, data: rep2, data: rep1);
end;
end;
* step 3: output results;
put "Step 3: Outputting the results...processing";
declare hiter h_iter('h_rep');
rc = h_iter.first();
do while (rc = 0);
n = node;
rep1 = node;
depth = 0;
do while (1=1); * find representative for node;
h_rep.find(key:rep1);
depth + 1; * record depth of tree (for interest sake only);
if (rep1 = representative) then leave;
rep1 = representative;
end;
output;
rc = h_iter.next();
end;
stop;
run;
[green]/************************************************************
A QUICK SUMMARY OF RESULTS
************************************************************/
[/green]
* if there are a lot of deep trees then we could apply the standard
performance improvements - so lets check the depths;
proc freq data = output;
title 'Depth of search';
table depth;
quit;
* let see how many members (customers) are grouped together;
proc sql;
create table rep_count as
select representative, count(*) as count_nodes
from output
group by representative;
quit;
proc freq data = rep_count;
title 'Number of equivalent (cross referenced) customers';
table count_nodes;
quit;