Advertisement
Guest User

Untitled

a guest
Sep 25th, 2015
341
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
SQL 6.59 KB | None | 0 0
  1.  
  2. /****************************************************************************
  3. * GENERATE FAKE DATA
  4. ****************************************************************************/
  5.  
  6. -- Generate 15MM names, with lengths ranging from 3 to 105 characters
  7. -- The average length is 33 characters
  8. CREATE TABLE dbo.SourceTable (
  9.     id INT IDENTITY(1,1)
  10.         CONSTRAINT PK_SourceTable PRIMARY KEY,
  11.     name VARCHAR(128) NOT NULL,
  12.     bad_count INT NULL,
  13.     bad_count_new INT NULL,
  14. )
  15. GO
  16. INSERT INTO dbo.SourceTable WITH (TABLOCK) (name)
  17. SELECT TOP 15000000 v1.name + v2.name + v3.name
  18. FROM master..spt_values v1
  19. CROSS JOIN master..spt_values v2
  20. CROSS JOIN master..spt_values v3
  21. WHERE NULLIF(v1.name,'') IS NOT NULL
  22.     AND NULLIF(v2.name,'') IS NOT NULL
  23.     AND NULLIF(v3.name,'') IS NOT NULL
  24. GO
  25.  
  26. -- Generate 3K bad phrases, with lengths ranging from 6 to 18 characters
  27. -- The average length is 10 characters
  28. CREATE TABLE dbo.Bad_Phrase (
  29.   phrase VARCHAR(50) NOT NULL PRIMARY KEY
  30. )
  31. GO
  32. INSERT INTO dbo.Bad_Phrase (phrase)
  33. SELECT DISTINCT TOP 3000 p.phrase
  34. FROM (
  35.     SELECT SUBSTRING(v1.name + v2.name,
  36.             CONVERT(INT, FLOOR(0.3*LEN(v1.name+v2.name))),
  37.             CONVERT(INT, FLOOR(5+0.2*LEN(v1.name+v2.name)))
  38.         ) AS phrase
  39.     FROM master..spt_values v1
  40.     CROSS JOIN master..spt_values v2
  41.     WHERE v1.name IS NOT NULL
  42.         AND v2.name IS NOT NULL
  43. ) p
  44. WHERE LEN(p.phrase) >= 3
  45. GO
  46.  
  47. -- Examine names and phrases
  48. SELECT MIN(LEN(name)), MAX(LEN(name)), AVG(LEN(name)) FROM SourceTable
  49. SELECT MIN(LEN(phrase)), MAX(LEN(phrase)), AVG(LEN(phrase)) FROM Bad_Phrase
  50. GO
  51.  
  52. -- Generate a tally table
  53. CREATE TABLE dbo.Numbers (
  54.     n INT PRIMARY KEY
  55. )
  56. INSERT INTO dbo.Numbers (n)
  57. SELECT TOP 1000 ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
  58. FROM master..spt_values v1
  59. GO
  60.  
  61. /****************************************************************************
  62. * PROC TO PERFORM ORIGINAL UPDATE FOR A BATCH OF NAMES
  63. ****************************************************************************/
  64.  
  65. CREATE PROC #oldUpdate (@minId INT, @batchSize INT)
  66. AS
  67. BEGIN
  68.     DECLARE @startTime DATETIME = GETDATE()
  69.     DECLARE @maxId INT = @minId + @batchSize - 1
  70.  
  71.     UPDATE dbo.SourceTable
  72.     SET Bad_Count = (
  73.                    SELECT  COUNT(*)
  74.                    FROM Bad_Phrase
  75.                    WHERE SourceTable.Name LIKE '%'+Bad_Phrase.phrase+'%'
  76.                  )
  77.     WHERE id BETWEEN @minId AND @maxId
  78.  
  79.     DECLARE @TIME INT = DATEDIFF(ms, @startTime, GETDATE())
  80.     RAISERROR('Completed %i rows, starting at id %i, in %ims', 0, 1,
  81.         @batchSize, @minId, @TIME) WITH NOWAIT
  82. END
  83. GO
  84.  
  85. /****************************************************************************
  86. * PROC TO PERFORM THE NEW UPDATE FOR A BATCH OF NAMES
  87. ****************************************************************************/
  88.  
  89. CREATE PROC #newUpdate (@minId INT, @batchSize INT)
  90. AS
  91. BEGIN
  92.     DECLARE @startTime DATETIME = GETDATE()
  93.     DECLARE @maxId INT = @minId + @batchSize - 1
  94.  
  95.     -- For each name, generate the string at each offset
  96.     DECLARE @maxBadPhraseLen INT = (SELECT MAX(LEN(phrase)) FROM Bad_Phrase)
  97.     SELECT s.id, sub.sub_name
  98.     INTO #SubNames
  99.     FROM (SELECT * FROM SourceTable WHERE id BETWEEN @minId AND @maxId) s
  100.     CROSS APPLY (
  101.         -- Create a row for each substring of the name, starting at each character
  102.         -- offset within that string.  For example, if the name is "abcd", this CROSS APPLY
  103.         -- will generate 4 rows, with values ("abcd"), ("bcd"), ("cd"), and ("d"). In order
  104.         -- for the name to be LIKE the bad phrase, the bad phrase must match the leading X
  105.         -- characters (where X is the length of the bad phrase) of at least one of these
  106.         -- substrings. This can be efficiently computed after indexing the substrings.
  107.         -- As an optimization, we only store @maxBadPhraseLen characters rather than
  108.         -- storing the full remainder of the name from each offset; all other characters are
  109.         -- simply extra space that isn't needed to determine whether a bad phrase matches.
  110.         SELECT TOP(LEN(s.name)) SUBSTRING(s.name, n.n, @maxBadPhraseLen) AS sub_name
  111.         FROM Numbers n
  112.         ORDER BY n.n
  113.     ) sub
  114.     -- Create an index so that bad phrases can be quickly compared for a match
  115.     CREATE CLUSTERED INDEX IX_SubNames ON #SubNames (sub_name)
  116.  
  117.     -- For each name, compute the number of distinct bad phrases that match
  118.     -- By "match", we mean that the a substring starting from one or more
  119.     -- character offsets of the overall name matches the bad phrase
  120.     SELECT s.id, COUNT(DISTINCT b.phrase) AS bad_count
  121.     INTO #tempBadCounts
  122.     FROM dbo.Bad_Phrase b
  123.     JOIN #SubNames s
  124.         ON s.sub_name LIKE b.phrase + '%'
  125.     GROUP BY s.id
  126.  
  127.     -- Perform the actual update into a "bad_count_new" field
  128.     -- For validation, we'll compare bad_count_new with the originally computed bad_count
  129.     UPDATE s
  130.     SET s.bad_count_new = COALESCE(b.bad_count, 0)
  131.     FROM dbo.SourceTable s
  132.     LEFT JOIN #tempBadCounts b
  133.         ON b.id = s.id
  134.     WHERE s.id BETWEEN @minId AND @maxId
  135.  
  136.     DECLARE @TIME INT = DATEDIFF(ms, @startTime, GETDATE())
  137.     RAISERROR('Completed %i rows, starting at id %i, in %ims', 0, 1,
  138.         @batchSize, @minId, @TIME) WITH NOWAIT
  139. END
  140. GO
  141.  
  142. /****************************************************************************
  143. * PERFORMANCE TESTING
  144. ****************************************************************************/
  145.  
  146. SET STATISTICS TIME, IO OFF
  147. GO
  148.  
  149. -- Perform 10% of the overall update, broken into batches of 1% at a time
  150. -- and measuring the run-time perform (in seconds) of each batch:
  151. -- 1:  204s old, 5.7s new
  152. -- 2:  166s old, 4.8s new
  153. -- 3:  161s old, 5.5s new
  154. -- 4:  169s old, 4.9s new
  155. -- 5:  168s old, 5.2s new
  156. -- 6:  161s old, 6.0s new
  157. -- 7:  135s old, 6.9s new
  158. -- 8:  126s old, 6.5s new
  159. -- 9:  127s old, 5.3s new
  160. -- 10: 129s old, 5.2s new
  161. -- Total: 1546s old, 56s new
  162. DECLARE @minId INT = 1, @batchSize INT = 150000, @batchNum INT = 1
  163. WHILE (@batchNum <= 10)
  164. BEGIN
  165.     EXEC #oldUpdate @minId = @minId, @batchSize = @batchSize
  166.     EXEC #newUpdate @minId = @minId, @batchSize = @batchSize
  167.     SET @minId = @minId + @batchSize
  168.     SET @batchNum = @batchNum + 1
  169. END
  170. GO
  171.  
  172. /****************************************************************************
  173. * CORRECTNESS TESTING
  174. ****************************************************************************/
  175.  
  176. SELECT COUNT(*) numNames,
  177.     COUNT(CASE WHEN s.bad_count = s.bad_count_new THEN 1 END) numMatching,
  178.     COUNT(CASE WHEN bad_count > 0 THEN 1 END) numWithBadCount
  179. FROM dbo.SourceTable s
  180. WHERE s.id BETWEEN 1 AND 1500000
  181. --numNames  numMatching numWithBadCount
  182. --1500000   1500000     900616
  183. GO
  184.  
  185. /****************************************************************************
  186. * CLEANUP
  187. ****************************************************************************/
  188.  
  189. DROP TABLE dbo.SourceTable
  190. GO
  191. DROP TABLE dbo.Bad_Phrase
  192. GO
  193. DROP TABLE dbo.Numbers
  194. GO
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement