Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /****************************************************************************
- * GENERATE FAKE DATA
- ****************************************************************************/
- -- Generate 15MM names, with lengths ranging from 3 to 105 characters
- -- The average length is 33 characters
- CREATE TABLE dbo.SourceTable (
- id INT IDENTITY(1,1)
- CONSTRAINT PK_SourceTable PRIMARY KEY,
- name VARCHAR(128) NOT NULL,
- bad_count INT NULL,
- bad_count_new INT NULL,
- )
- GO
- INSERT INTO dbo.SourceTable WITH (TABLOCK) (name)
- SELECT TOP 15000000 v1.name + v2.name + v3.name
- FROM master..spt_values v1
- CROSS JOIN master..spt_values v2
- CROSS JOIN master..spt_values v3
- WHERE NULLIF(v1.name,'') IS NOT NULL
- AND NULLIF(v2.name,'') IS NOT NULL
- AND NULLIF(v3.name,'') IS NOT NULL
- GO
- -- Generate 3K bad phrases, with lengths ranging from 6 to 18 characters
- -- The average length is 10 characters
- CREATE TABLE dbo.Bad_Phrase (
- phrase VARCHAR(50) NOT NULL PRIMARY KEY
- )
- GO
- INSERT INTO dbo.Bad_Phrase (phrase)
- SELECT DISTINCT TOP 3000 p.phrase
- FROM (
- SELECT SUBSTRING(v1.name + v2.name,
- CONVERT(INT, FLOOR(0.3*LEN(v1.name+v2.name))),
- CONVERT(INT, FLOOR(5+0.2*LEN(v1.name+v2.name)))
- ) AS phrase
- FROM master..spt_values v1
- CROSS JOIN master..spt_values v2
- WHERE v1.name IS NOT NULL
- AND v2.name IS NOT NULL
- ) p
- WHERE LEN(p.phrase) >= 3
- GO
- -- Examine names and phrases
- SELECT MIN(LEN(name)), MAX(LEN(name)), AVG(LEN(name)) FROM SourceTable
- SELECT MIN(LEN(phrase)), MAX(LEN(phrase)), AVG(LEN(phrase)) FROM Bad_Phrase
- GO
- -- Generate a tally table
- CREATE TABLE dbo.Numbers (
- n INT PRIMARY KEY
- )
- INSERT INTO dbo.Numbers (n)
- SELECT TOP 1000 ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
- FROM master..spt_values v1
- GO
- /****************************************************************************
- * PROC TO PERFORM ORIGINAL UPDATE FOR A BATCH OF NAMES
- ****************************************************************************/
- CREATE PROC #oldUpdate (@minId INT, @batchSize INT)
- AS
- BEGIN
- DECLARE @startTime DATETIME = GETDATE()
- DECLARE @maxId INT = @minId + @batchSize - 1
- UPDATE dbo.SourceTable
- SET Bad_Count = (
- SELECT COUNT(*)
- FROM Bad_Phrase
- WHERE SourceTable.Name LIKE '%'+Bad_Phrase.phrase+'%'
- )
- WHERE id BETWEEN @minId AND @maxId
- DECLARE @TIME INT = DATEDIFF(ms, @startTime, GETDATE())
- RAISERROR('Completed %i rows, starting at id %i, in %ims', 0, 1,
- @batchSize, @minId, @TIME) WITH NOWAIT
- END
- GO
- /****************************************************************************
- * PROC TO PERFORM THE NEW UPDATE FOR A BATCH OF NAMES
- ****************************************************************************/
- CREATE PROC #newUpdate (@minId INT, @batchSize INT)
- AS
- BEGIN
- DECLARE @startTime DATETIME = GETDATE()
- DECLARE @maxId INT = @minId + @batchSize - 1
- -- For each name, generate the string at each offset
- DECLARE @maxBadPhraseLen INT = (SELECT MAX(LEN(phrase)) FROM Bad_Phrase)
- SELECT s.id, sub.sub_name
- INTO #SubNames
- FROM (SELECT * FROM SourceTable WHERE id BETWEEN @minId AND @maxId) s
- CROSS APPLY (
- -- Create a row for each substring of the name, starting at each character
- -- offset within that string. For example, if the name is "abcd", this CROSS APPLY
- -- will generate 4 rows, with values ("abcd"), ("bcd"), ("cd"), and ("d"). In order
- -- for the name to be LIKE the bad phrase, the bad phrase must match the leading X
- -- characters (where X is the length of the bad phrase) of at least one of these
- -- substrings. This can be efficiently computed after indexing the substrings.
- -- As an optimization, we only store @maxBadPhraseLen characters rather than
- -- storing the full remainder of the name from each offset; all other characters are
- -- simply extra space that isn't needed to determine whether a bad phrase matches.
- SELECT TOP(LEN(s.name)) SUBSTRING(s.name, n.n, @maxBadPhraseLen) AS sub_name
- FROM Numbers n
- ORDER BY n.n
- ) sub
- -- Create an index so that bad phrases can be quickly compared for a match
- CREATE CLUSTERED INDEX IX_SubNames ON #SubNames (sub_name)
- -- For each name, compute the number of distinct bad phrases that match
- -- By "match", we mean that the a substring starting from one or more
- -- character offsets of the overall name matches the bad phrase
- SELECT s.id, COUNT(DISTINCT b.phrase) AS bad_count
- INTO #tempBadCounts
- FROM dbo.Bad_Phrase b
- JOIN #SubNames s
- ON s.sub_name LIKE b.phrase + '%'
- GROUP BY s.id
- -- Perform the actual update into a "bad_count_new" field
- -- For validation, we'll compare bad_count_new with the originally computed bad_count
- UPDATE s
- SET s.bad_count_new = COALESCE(b.bad_count, 0)
- FROM dbo.SourceTable s
- LEFT JOIN #tempBadCounts b
- ON b.id = s.id
- WHERE s.id BETWEEN @minId AND @maxId
- DECLARE @TIME INT = DATEDIFF(ms, @startTime, GETDATE())
- RAISERROR('Completed %i rows, starting at id %i, in %ims', 0, 1,
- @batchSize, @minId, @TIME) WITH NOWAIT
- END
- GO
- /****************************************************************************
- * PERFORMANCE TESTING
- ****************************************************************************/
- SET STATISTICS TIME, IO OFF
- GO
- -- Perform 10% of the overall update, broken into batches of 1% at a time
- -- and measuring the run-time perform (in seconds) of each batch:
- -- 1: 204s old, 5.7s new
- -- 2: 166s old, 4.8s new
- -- 3: 161s old, 5.5s new
- -- 4: 169s old, 4.9s new
- -- 5: 168s old, 5.2s new
- -- 6: 161s old, 6.0s new
- -- 7: 135s old, 6.9s new
- -- 8: 126s old, 6.5s new
- -- 9: 127s old, 5.3s new
- -- 10: 129s old, 5.2s new
- -- Total: 1546s old, 56s new
- DECLARE @minId INT = 1, @batchSize INT = 150000, @batchNum INT = 1
- WHILE (@batchNum <= 10)
- BEGIN
- EXEC #oldUpdate @minId = @minId, @batchSize = @batchSize
- EXEC #newUpdate @minId = @minId, @batchSize = @batchSize
- SET @minId = @minId + @batchSize
- SET @batchNum = @batchNum + 1
- END
- GO
- /****************************************************************************
- * CORRECTNESS TESTING
- ****************************************************************************/
- SELECT COUNT(*) numNames,
- COUNT(CASE WHEN s.bad_count = s.bad_count_new THEN 1 END) numMatching,
- COUNT(CASE WHEN bad_count > 0 THEN 1 END) numWithBadCount
- FROM dbo.SourceTable s
- WHERE s.id BETWEEN 1 AND 1500000
- --numNames numMatching numWithBadCount
- --1500000 1500000 900616
- GO
- /****************************************************************************
- * CLEANUP
- ****************************************************************************/
- DROP TABLE dbo.SourceTable
- GO
- DROP TABLE dbo.Bad_Phrase
- GO
- DROP TABLE dbo.Numbers
- GO
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement