Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- Stefan Gustafsson, Acando
- This solution supports the general case where all kinds of relations are possible on input.
- For example
- 3->1
- 3->2
- will result in all three nodes assigned to the same customer
- */
- set nocount on
- set statistics time off
- -- Idea borrowed from Daniel Hutmacher
- -- Create a dummy table with a columnstore index to force batch mode processing
- if object_id('tempdb..#ColStoreTable') is not null DROP TABLE #ColStoreTable;
- CREATE TABLE #ColStoreTable (
- col tinyint NOT NULL
- );
- CREATE CLUSTERED COLUMNSTORE INDEX ColStoreIndex ON #ColStoreTable;
- INSERT INTO #ColStoreTable VALUES (0);
- ALTER INDEX ColStoreIndex ON #ColStoreTable REORGANIZE;
- -- Create some temp tables
- IF object_id('tempdb..#e') IS NOT NULL DROP TABLE #e
- IF object_id('tempdb..#m') IS NOT NULL DROP TABLE #m
- IF object_id('tempdb..#t') IS NOT NULL DROP TABLE #t
- create table #e (s int not null, t int not null)
- create table #m (s int not null, t int not null, new_s int not null, new_t int not null)
- create table #t (s int not null)
- create unique clustered index CL on #e(s, t)
- create unique clustered index CL on #m(s, t)
- create unique clustered index CL on #t(s)
- raiserror ('Calculate #e',0,0) with nowait
- -- Build edge list
- ;with
- p as (
- select PurchaseID, FirstName, LastName, Email, StreetAddress, ZipCode, CreditCard
- from dbo.PurchasesBig
- --from dbo.Purchases
- )
- , cte1 as (
- select
- PurchaseID,
- Key1 = (Firstname+'/'+CreditCard) collate finnish_swedish_bin,
- Key2 = (Firstname+'/'+Email) collate finnish_swedish_bin,
- Key3 = (Firstname+'/'+LastName+'/'+StreetAddress+'/'+cast(ZipCode as varchar(10))) collate finnish_swedish_bin
- from p
- )
- , k1 as (
- select Key1, MIN(PurchaseID) as node
- from cte1
- group by Key1
- )
- , k2 as (
- select Key2, MIN(PurchaseID) as node
- from cte1
- group by Key2
- )
- , k3 as (
- select Key3, MIN(PurchaseID)as node
- from cte1
- group by Key3
- )
- ,cte2 as (
- select
- a.PurchaseID as s,
- k1.node as n1,
- k2.node as n2,
- k3.node as n3
- from cte1 a
- inner hash join k1 on k1.Key1 = a.Key1
- inner hash join k2 on k2.Key2 = a.Key2
- inner hash join k3 on k3.Key3 = a.Key3
- )
- , cte3 as (
- -- Return one row for each unique link from this node
- select
- s,
- t =
- case n
- when 1 then n1
- when 2 then n2
- when 3 then n3
- end
- from cte2
- cross join (values (1),(2),(3)) v(n)
- where
- -- Exclude duplicate links
- -- Exclude links to self unless all links are to self
- (n = 1 and (n1 <> s or (n1 = n2 and n1 = n3))) or
- (n = 2 and n2 <> s and n2 <> n1) or
- (n = 3 and n3 <> s and n3 <> n1 and n3 <> n2)
- )
- insert into #e with (tablockx)
- (s, t)
- select s, t
- from cte3
- LEFT JOIN #ColStoreTable ON 1=0
- option (hash group)
- /*
- Create some more interesting test data
- truncate table #e
- insert into #e (s, t)
- select s+v0.number*1000, t+v0.number*1000
- from (
- --values (7,5),(5,3),(5,1),(3,1),(5,4),(6,4),(4,2),(1,1),(2,2)
- values (1,1),(3,1),(3,2),(2,2),(4,2),(4,1)
- ) e(s,t)
- cross join master..spt_values v0
- where v0.type='p' and v0.number < 1
- -- Create data suitable for visualization using http://www.webgraphviz.com/
- select cast(s as varchar(10))+' -> '+CAST(t as varchar(10))
- from #e
- order by s, t
- Example graph:
- digraph g {
- rankdir=LR;
- 1 -> 1
- 2 -> 2
- 3 -> 1
- 4 -> 2
- 5 -> 1
- 5 -> 3
- 5 -> 4
- 6 -> 4
- 7 -> 5
- }
- */
- /*
- We now have a general DAG (Directed Acyclic Graph)
- To be able to process this using SQL efficiently we want to transform it to a number of trees.
- The problem is nodes that have multiple outgoing links (splits)
- 5 -> 1
- 5 -> 2
- 5 -> 3
- We want to transform this into
- 5 -> 1
- 2 -> 1
- 3 -> 1
- This can be done by applying the following transforms:
- 5 -> 2 => 2 -> 1
- 5 -> 3 => 3 -> 1
- The strategy is to repeatedly find all splits, generate a table with desired transforms, and apply the transforms
- When no more splits are found, we have a tree that can be efficiently processed
- */
- declare @rc int
- raiserror ('Find 1',0,0) with nowait
- -- Find splits
- truncate table #m
- insert into #m (s, t, new_s, new_t)
- --output inserted.*
- select a.s, a.t, a.t, b.t
- from #e a
- join (select s, MIN(t) as t from #e group by s having COUNT(*)>1) b on b.s = a.s and a.t <> b.t
- LEFT JOIN #ColStoreTable ON 1=0
- --select * from #m order by 1,2
- set @rc = @@rowcount
- while @rc > 0
- begin
- -- Delete existing rows
- raiserror ('delete #e',0,0) with nowait
- delete from a
- --output deleted.*
- from #e a
- join #m b on b.s = a.s and b.t = a.t
- LEFT JOIN #ColStoreTable ON 1=0
- -- insert new rows if they do not already exist
- -- and if they have not already been removed in this iteration
- raiserror ('insert #e',0,0) with nowait
- truncate table #t
- insert into #e (s,t)
- output inserted.s into #t
- --output inserted.*
- select a.new_s, a.new_t
- from (select distinct new_s, new_t from #m) a
- left join #e b on b.s = a.new_s and b.t = a.new_t
- left join #m c on c.s = a.new_s and c.t = a.new_t
- LEFT JOIN #ColStoreTable ON 1=0
- where b.s is null and c.s is null
- set @rc = @@rowcount
- if @rc > 0
- begin
- -- Find more splits
- -- this time only look at the rows affected by the previous insert
- raiserror ('Find 2',0,0) with nowait
- truncate table #m
- insert into #m (s, t, new_s, new_t)
- --output inserted.*
- select a.s, a.t, a.t, b.t
- from #e a
- join (select s, MIN(t) as t from #e group by s having COUNT(*)>1) b on b.s = a.s and a.t <> b.t
- join #t c on c.s = b.s
- LEFT JOIN #ColStoreTable ON 1=0
- set @rc = @@rowcount
- end
- end
- -- We have now finished processing splits
- -- #e now represents a number of trees
- raiserror ('Insert Customers',0,0) with nowait
- truncate table dbo.Customers
- insert into dbo.Customers with (tablockx)
- (PurchaseID, CustomerID)
- select s, t
- from #e
- set @rc = @@rowcount
- -- Idea borrowed from Mikael Eriksson
- -- Update CustomerID from parents in a loop
- while @rc > 0
- begin
- raiserror ('Update Customers',0,0) with nowait
- update c
- set c.CustomerID = p.CustomerID
- from dbo.Customers c
- join dbo.Customers p on
- c.CustomerID = p.PurchaseID and
- p.PurchaseID <> p.CustomerID and
- c.PurchaseID <> c.CustomerID
- set @rc = @@rowcount
- end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement