Advertisement
stegus64

Untitled

Mar 24th, 2017
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
T-SQL 6.25 KB | None | 0 0
  1. /*
  2.  
  3. Stefan Gustafsson, Acando
  4.  
  5. This solution supports the general case where all kinds of relations are possible on input.
  6.  
  7. For example
  8. 3->1
  9. 3->2
  10. will result in all three nodes assigned to the same customer
  11.  
  12. */
  13.  
  14. set nocount on
  15. set statistics time off
  16.  
  17. -- Idea borrowed from Daniel Hutmacher
  18. -- Create a dummy table with a columnstore index to force batch mode processing
  19. if object_id('tempdb..#ColStoreTable') is not null DROP TABLE #ColStoreTable;
  20. CREATE TABLE #ColStoreTable (
  21.     col     tinyint NOT NULL
  22. );
  23. CREATE CLUSTERED COLUMNSTORE INDEX ColStoreIndex ON #ColStoreTable;
  24. INSERT INTO #ColStoreTable VALUES (0);
  25. ALTER INDEX ColStoreIndex ON #ColStoreTable REORGANIZE;
  26.  
  27. -- Create some temp tables
  28. IF object_id('tempdb..#e') IS NOT NULL DROP TABLE #e
  29. IF object_id('tempdb..#m') IS NOT NULL DROP TABLE #m
  30. IF object_id('tempdb..#t') IS NOT NULL DROP TABLE #t
  31.  
  32. create table #e      (s int not null, t int not null)
  33. create table #m      (s int not null, t int not null, new_s int not null, new_t int not null)
  34. create table #t      (s int not null)
  35.  
  36. create unique clustered index CL on #e(s, t)
  37. create unique clustered index CL on #m(s, t)
  38. create unique clustered index CL on #t(s)
  39.  
  40. raiserror ('Calculate #e',0,0) with nowait
  41. -- Build edge list
  42. ;with
  43. p as (
  44.     select PurchaseID, FirstName, LastName, Email, StreetAddress, ZipCode, CreditCard
  45.     from dbo.PurchasesBig
  46.     --from dbo.Purchases
  47. )
  48. , cte1 as (
  49.     select
  50.         PurchaseID,
  51.         Key1 = (Firstname+'/'+CreditCard) collate finnish_swedish_bin,
  52.         Key2 = (Firstname+'/'+Email) collate finnish_swedish_bin,
  53.         Key3 = (Firstname+'/'+LastName+'/'+StreetAddress+'/'+cast(ZipCode as varchar(10))) collate finnish_swedish_bin
  54.     from p
  55. )
  56. , k1 as (
  57.     select Key1, MIN(PurchaseID) as node
  58.     from cte1
  59.     group by Key1
  60. )
  61. , k2 as (
  62.     select Key2, MIN(PurchaseID) as node
  63.     from cte1
  64.     group by Key2
  65. )
  66. , k3 as (
  67.     select Key3, MIN(PurchaseID)as node
  68.     from cte1
  69.     group by Key3
  70. )
  71. ,cte2 as (
  72.     select
  73.         a.PurchaseID as s,
  74.         k1.node as n1,
  75.         k2.node as n2,
  76.         k3.node as n3
  77.     from cte1 a
  78.     inner hash join k1 on k1.Key1 = a.Key1
  79.     inner hash join k2 on k2.Key2 = a.Key2
  80.     inner hash join k3 on k3.Key3 = a.Key3
  81. )
  82. , cte3 as (
  83.     -- Return one row for each unique link from this node
  84.     select
  85.         s,
  86.         t =
  87.             case n
  88.                 when 1 then n1
  89.                 when 2 then n2
  90.                 when 3 then n3
  91.             end
  92.         from cte2
  93.     cross join (values (1),(2),(3)) v(n)
  94.     where
  95.         -- Exclude duplicate links
  96.         -- Exclude links to self unless all links are to self
  97.         (n = 1 and (n1 <> s or (n1 = n2 and n1 = n3))) or
  98.         (n = 2 and n2 <> s and n2 <> n1) or
  99.         (n = 3 and n3 <> s and n3 <> n1 and n3 <> n2)
  100. )
  101. insert into #e with (tablockx)
  102.     (s, t)
  103. select s, t
  104. from cte3
  105. LEFT JOIN #ColStoreTable ON 1=0
  106. option (hash group)
  107.  
  108. /*
  109. Create some more interesting test data
  110.  
  111. truncate table #e
  112. insert into #e (s, t)
  113. select s+v0.number*1000, t+v0.number*1000
  114. from (
  115.     --values (7,5),(5,3),(5,1),(3,1),(5,4),(6,4),(4,2),(1,1),(2,2)
  116.     values (1,1),(3,1),(3,2),(2,2),(4,2),(4,1)
  117.     ) e(s,t)
  118. cross join master..spt_values v0
  119. where v0.type='p' and v0.number < 1
  120.  
  121. -- Create data suitable for visualization using http://www.webgraphviz.com/
  122. select cast(s as varchar(10))+' -> '+CAST(t as varchar(10))
  123. from #e
  124. order by s, t
  125.  
  126. Example graph:
  127. digraph g {
  128. rankdir=LR;
  129. 1 -> 1
  130. 2 -> 2
  131. 3 -> 1
  132. 4 -> 2
  133. 5 -> 1
  134. 5 -> 3
  135. 5 -> 4
  136. 6 -> 4
  137. 7 -> 5
  138. }
  139.  
  140. */
  141.  
  142. /*
  143. We now have a general DAG (Directed Acyclic Graph)
  144. To be able to process this using SQL efficiently we want to transform it to a number of trees.
  145.  
  146. The problem is nodes that have multiple outgoing links (splits)
  147.  
  148. 5 -> 1
  149. 5 -> 2
  150. 5 -> 3
  151.  
  152. We want to transform this into
  153.  
  154. 5 -> 1
  155. 2 -> 1
  156. 3 -> 1
  157.  
  158. This can be done by applying the following transforms:
  159.  
  160. 5 -> 2 => 2 -> 1
  161. 5 -> 3 => 3 -> 1
  162.  
  163. The strategy is to repeatedly find all splits, generate a table with desired transforms, and apply the transforms
  164.  
  165. When no more splits are found, we have a tree that can be efficiently processed
  166. */
  167.  
  168. declare @rc int
  169.  
  170. raiserror ('Find 1',0,0) with nowait
  171. -- Find splits
  172. truncate table #m
  173. insert into #m (s, t, new_s, new_t)
  174. --output inserted.*
  175. select a.s, a.t, a.t, b.t
  176. from #e a
  177. join (select s, MIN(t) as t from #e group by s having COUNT(*)>1) b on b.s = a.s and a.t <> b.t
  178. LEFT JOIN #ColStoreTable ON 1=0
  179.  
  180. --select * from #m order by 1,2
  181.  
  182. set @rc = @@rowcount
  183.  
  184. while @rc > 0
  185. begin
  186.  
  187.     -- Delete existing rows
  188.     raiserror ('delete #e',0,0) with nowait
  189.     delete from a
  190.     --output deleted.*
  191.     from #e a
  192.     join #m b on b.s = a.s and b.t = a.t
  193.     LEFT JOIN #ColStoreTable ON 1=0
  194.  
  195.     -- insert new rows if they do not already exist
  196.     -- and if they have not already been removed in this iteration
  197.     raiserror ('insert #e',0,0) with nowait
  198.     truncate table #t
  199.     insert into #e (s,t)
  200.     output inserted.s into #t
  201.     --output inserted.*
  202.     select a.new_s, a.new_t
  203.     from (select distinct new_s, new_t from #m) a
  204.     left join #e b on b.s = a.new_s and b.t = a.new_t
  205.     left join #m c on c.s = a.new_s and c.t = a.new_t
  206.     LEFT JOIN #ColStoreTable ON 1=0
  207.     where b.s is null and c.s is null
  208.    
  209.     set @rc = @@rowcount
  210.  
  211.     if @rc > 0
  212.     begin
  213.  
  214.         -- Find more splits
  215.         -- this time only look at the rows affected by the previous insert
  216.         raiserror ('Find 2',0,0) with nowait
  217.         truncate table #m
  218.         insert into #m (s, t, new_s, new_t)
  219.         --output inserted.*
  220.         select a.s, a.t, a.t, b.t
  221.         from #e a
  222.         join (select s, MIN(t) as t from #e group by s having COUNT(*)>1) b on b.s = a.s and a.t <> b.t
  223.         join #t c on c.s = b.s
  224.         LEFT JOIN #ColStoreTable ON 1=0
  225.  
  226.         set @rc = @@rowcount
  227.  
  228.     end
  229. end
  230.  
  231. -- We have now finished processing splits
  232. -- #e now represents a number of trees
  233. raiserror ('Insert Customers',0,0) with nowait
  234. truncate table dbo.Customers
  235. insert into dbo.Customers with (tablockx)
  236.     (PurchaseID, CustomerID)
  237. select s, t
  238. from #e
  239.  
  240. set @rc = @@rowcount
  241.  
  242. -- Idea borrowed from Mikael Eriksson
  243. -- Update CustomerID from parents in a loop
  244. while @rc > 0
  245. begin
  246.  
  247.     raiserror ('Update Customers',0,0) with nowait
  248.     update c
  249.     set c.CustomerID = p.CustomerID
  250.     from dbo.Customers c
  251.     join dbo.Customers p on
  252.         c.CustomerID = p.PurchaseID and
  253.         p.PurchaseID <> p.CustomerID and
  254.         c.PurchaseID <> c.CustomerID
  255.  
  256.     set @rc = @@rowcount
  257.  
  258. end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement