Untitled

/*

Stefan Gustafsson, Acando

This solution supports the general case where all kinds of relations are possible on input.

For example
3->1
3->2
will result in all three nodes assigned to the same customer

*/

set nocount on
set statistics time off

-- Idea borrowed from Daniel Hutmacher
-- Create a dummy table with a columnstore index to force batch mode processing
if object_id('tempdb..#ColStoreTable') is not null DROP TABLE #ColStoreTable;
CREATE TABLE #ColStoreTable (
    col     tinyint NOT NULL
);
CREATE CLUSTERED COLUMNSTORE INDEX ColStoreIndex ON #ColStoreTable;
INSERT INTO #ColStoreTable VALUES (0);
ALTER INDEX ColStoreIndex ON #ColStoreTable REORGANIZE;

-- Create some temp tables
IF object_id('tempdb..#e') IS NOT NULL DROP TABLE #e
IF object_id('tempdb..#m') IS NOT NULL DROP TABLE #m
IF object_id('tempdb..#t') IS NOT NULL DROP TABLE #t

create table #e      (s int not null, t int not null)
create table #m      (s int not null, t int not null, new_s int not null, new_t int not null)
create table #t      (s int not null)

create unique clustered index CL on #e(s, t)
create unique clustered index CL on #m(s, t)
create unique clustered index CL on #t(s)

raiserror ('Calculate #e',0,0) with nowait
-- Build edge list
;with
p as (
    select PurchaseID, FirstName, LastName, Email, StreetAddress, ZipCode, CreditCard
    from dbo.PurchasesBig
    --from dbo.Purchases
)
, cte1 as (
    select
        PurchaseID,
        Key1 = (Firstname+'/'+CreditCard) collate finnish_swedish_bin,
        Key2 = (Firstname+'/'+Email) collate finnish_swedish_bin,
        Key3 = (Firstname+'/'+LastName+'/'+StreetAddress+'/'+cast(ZipCode as varchar(10))) collate finnish_swedish_bin
    from p
)
, k1 as (
    select Key1, MIN(PurchaseID) as node
    from cte1
    group by Key1
)
, k2 as (
    select Key2, MIN(PurchaseID) as node
    from cte1
    group by Key2
)
, k3 as (
    select Key3, MIN(PurchaseID)as node
    from cte1
    group by Key3
)
,cte2 as (
    select
        a.PurchaseID as s,
        k1.node as n1,
        k2.node as n2,
        k3.node as n3
    from cte1 a
    inner hash join k1 on k1.Key1 = a.Key1
    inner hash join k2 on k2.Key2 = a.Key2
    inner hash join k3 on k3.Key3 = a.Key3
)
, cte3 as (
    -- Return one row for each unique link from this node
    select
        s,
        t =
            case n
                when 1 then n1
                when 2 then n2
                when 3 then n3
            end
        from cte2
    cross join (values (1),(2),(3)) v(n)
    where
        -- Exclude duplicate links
        -- Exclude links to self unless all links are to self
        (n = 1 and (n1 <> s or (n1 = n2 and n1 = n3))) or
        (n = 2 and n2 <> s and n2 <> n1) or
        (n = 3 and n3 <> s and n3 <> n1 and n3 <> n2)
)
insert into #e with (tablockx)
    (s, t)
select s, t
from cte3
LEFT JOIN #ColStoreTable ON 1=0
option (hash group)

/*
Create some more interesting test data

truncate table #e
insert into #e (s, t)
select s+v0.number*1000, t+v0.number*1000
from (
    --values (7,5),(5,3),(5,1),(3,1),(5,4),(6,4),(4,2),(1,1),(2,2)
    values (1,1),(3,1),(3,2),(2,2),(4,2),(4,1)
    ) e(s,t)
cross join master..spt_values v0
where v0.type='p' and v0.number < 1

-- Create data suitable for visualization using http://www.webgraphviz.com/
select cast(s as varchar(10))+' -> '+CAST(t as varchar(10))
from #e
order by s, t

Example graph:
digraph g {
rankdir=LR;
1 -> 1
2 -> 2
3 -> 1
4 -> 2
5 -> 1
5 -> 3
5 -> 4
6 -> 4
7 -> 5
}

*/

/*
We now have a general DAG (Directed Acyclic Graph)
To be able to process this using SQL efficiently we want to transform it to a number of trees.

The problem is nodes that have multiple outgoing links (splits)

5 -> 1
5 -> 2
5 -> 3

We want to transform this into

5 -> 1
2 -> 1
3 -> 1

This can be done by applying the following transforms:

5 -> 2 => 2 -> 1
5 -> 3 => 3 -> 1

The strategy is to repeatedly find all splits, generate a table with desired transforms, and apply the transforms

When no more splits are found, we have a tree that can be efficiently processed
*/

declare @rc int

raiserror ('Find 1',0,0) with nowait
-- Find splits
truncate table #m
insert into #m (s, t, new_s, new_t)
--output inserted.*
select a.s, a.t, a.t, b.t
from #e a
join (select s, MIN(t) as t from #e group by s having COUNT(*)>1) b on b.s = a.s and a.t <> b.t
LEFT JOIN #ColStoreTable ON 1=0

--select * from #m order by 1,2

set @rc = @@rowcount

while @rc > 0
begin

    -- Delete existing rows
    raiserror ('delete #e',0,0) with nowait
    delete from a
    --output deleted.*
    from #e a
    join #m b on b.s = a.s and b.t = a.t
    LEFT JOIN #ColStoreTable ON 1=0

    -- insert new rows if they do not already exist
    -- and if they have not already been removed in this iteration
    raiserror ('insert #e',0,0) with nowait
    truncate table #t
    insert into #e (s,t)
    output inserted.s into #t
    --output inserted.*
    select a.new_s, a.new_t
    from (select distinct new_s, new_t from #m) a
    left join #e b on b.s = a.new_s and b.t = a.new_t
    left join #m c on c.s = a.new_s and c.t = a.new_t
    LEFT JOIN #ColStoreTable ON 1=0
    where b.s is null and c.s is null

    set @rc = @@rowcount

    if @rc > 0
    begin

        -- Find more splits
        -- this time only look at the rows affected by the previous insert
        raiserror ('Find 2',0,0) with nowait
        truncate table #m
        insert into #m (s, t, new_s, new_t)
        --output inserted.*
        select a.s, a.t, a.t, b.t
        from #e a
        join (select s, MIN(t) as t from #e group by s having COUNT(*)>1) b on b.s = a.s and a.t <> b.t
        join #t c on c.s = b.s
        LEFT JOIN #ColStoreTable ON 1=0

        set @rc = @@rowcount

    end
end

-- We have now finished processing splits
-- #e now represents a number of trees
raiserror ('Insert Customers',0,0) with nowait
truncate table dbo.Customers
insert into dbo.Customers with (tablockx)
    (PurchaseID, CustomerID)
select s, t
from #e

set @rc = @@rowcount

-- Idea borrowed from Mikael Eriksson
-- Update CustomerID from parents in a loop
while @rc > 0
begin

    raiserror ('Update Customers',0,0) with nowait
    update c
    set c.CustomerID = p.CustomerID
    from dbo.Customers c
    join dbo.Customers p on
        c.CustomerID = p.PurchaseID and
        p.PurchaseID <> p.CustomerID and
        c.PurchaseID <> c.CustomerID

    set @rc = @@rowcount

end