Untitled


/****************************************************************************
* GENERATE FAKE DATA
****************************************************************************/

-- Generate 15MM names, with lengths ranging from 3 to 105 characters
-- The average length is 33 characters
CREATE TABLE dbo.SourceTable (
    id INT IDENTITY(1,1)
        CONSTRAINT PK_SourceTable PRIMARY KEY,
    name VARCHAR(128) NOT NULL,
    bad_count INT NULL,
    bad_count_new INT NULL,
)
GO
INSERT INTO dbo.SourceTable WITH (TABLOCK) (name)
SELECT TOP 15000000 v1.name + v2.name + v3.name
FROM master..spt_values v1
CROSS JOIN master..spt_values v2
CROSS JOIN master..spt_values v3
WHERE NULLIF(v1.name,'') IS NOT NULL
    AND NULLIF(v2.name,'') IS NOT NULL
    AND NULLIF(v3.name,'') IS NOT NULL
GO

-- Generate 3K bad phrases, with lengths ranging from 6 to 18 characters
-- The average length is 10 characters
CREATE TABLE dbo.Bad_Phrase (
  phrase VARCHAR(50) NOT NULL PRIMARY KEY
)
GO
INSERT INTO dbo.Bad_Phrase (phrase)
SELECT DISTINCT TOP 3000 p.phrase
FROM (
    SELECT SUBSTRING(v1.name + v2.name,
            CONVERT(INT, FLOOR(0.3*LEN(v1.name+v2.name))),
            CONVERT(INT, FLOOR(5+0.2*LEN(v1.name+v2.name)))
        ) AS phrase
    FROM master..spt_values v1
    CROSS JOIN master..spt_values v2
    WHERE v1.name IS NOT NULL
        AND v2.name IS NOT NULL
) p
WHERE LEN(p.phrase) >= 3
GO

-- Examine names and phrases
SELECT MIN(LEN(name)), MAX(LEN(name)), AVG(LEN(name)) FROM SourceTable
SELECT MIN(LEN(phrase)), MAX(LEN(phrase)), AVG(LEN(phrase)) FROM Bad_Phrase
GO

-- Generate a tally table
CREATE TABLE dbo.Numbers (
    n INT PRIMARY KEY
)
INSERT INTO dbo.Numbers (n)
SELECT TOP 1000 ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
FROM master..spt_values v1
GO

/****************************************************************************
* PROC TO PERFORM ORIGINAL UPDATE FOR A BATCH OF NAMES
****************************************************************************/

CREATE PROC #oldUpdate (@minId INT, @batchSize INT)
AS
BEGIN
    DECLARE @startTime DATETIME = GETDATE()
    DECLARE @maxId INT = @minId + @batchSize - 1

    UPDATE dbo.SourceTable
    SET Bad_Count = (
                   SELECT  COUNT(*)
                   FROM Bad_Phrase
                   WHERE SourceTable.Name LIKE '%'+Bad_Phrase.phrase+'%'
                 )
    WHERE id BETWEEN @minId AND @maxId

    DECLARE @TIME INT = DATEDIFF(ms, @startTime, GETDATE())
    RAISERROR('Completed %i rows, starting at id %i, in %ims', 0, 1,
        @batchSize, @minId, @TIME) WITH NOWAIT
END
GO

/****************************************************************************
* PROC TO PERFORM THE NEW UPDATE FOR A BATCH OF NAMES
****************************************************************************/

CREATE PROC #newUpdate (@minId INT, @batchSize INT)
AS
BEGIN
    DECLARE @startTime DATETIME = GETDATE()
    DECLARE @maxId INT = @minId + @batchSize - 1

    -- For each name, generate the string at each offset
    DECLARE @maxBadPhraseLen INT = (SELECT MAX(LEN(phrase)) FROM Bad_Phrase)
    SELECT s.id, sub.sub_name
    INTO #SubNames
    FROM (SELECT * FROM SourceTable WHERE id BETWEEN @minId AND @maxId) s
    CROSS APPLY (
        -- Create a row for each substring of the name, starting at each character
        -- offset within that string.  For example, if the name is "abcd", this CROSS APPLY
        -- will generate 4 rows, with values ("abcd"), ("bcd"), ("cd"), and ("d"). In order
        -- for the name to be LIKE the bad phrase, the bad phrase must match the leading X
        -- characters (where X is the length of the bad phrase) of at least one of these
        -- substrings. This can be efficiently computed after indexing the substrings.
        -- As an optimization, we only store @maxBadPhraseLen characters rather than
        -- storing the full remainder of the name from each offset; all other characters are
        -- simply extra space that isn't needed to determine whether a bad phrase matches.
        SELECT TOP(LEN(s.name)) SUBSTRING(s.name, n.n, @maxBadPhraseLen) AS sub_name
        FROM Numbers n
        ORDER BY n.n
    ) sub
    -- Create an index so that bad phrases can be quickly compared for a match
    CREATE CLUSTERED INDEX IX_SubNames ON #SubNames (sub_name)

    -- For each name, compute the number of distinct bad phrases that match
    -- By "match", we mean that the a substring starting from one or more
    -- character offsets of the overall name matches the bad phrase
    SELECT s.id, COUNT(DISTINCT b.phrase) AS bad_count
    INTO #tempBadCounts
    FROM dbo.Bad_Phrase b
    JOIN #SubNames s
        ON s.sub_name LIKE b.phrase + '%'
    GROUP BY s.id

    -- Perform the actual update into a "bad_count_new" field
    -- For validation, we'll compare bad_count_new with the originally computed bad_count
    UPDATE s
    SET s.bad_count_new = COALESCE(b.bad_count, 0)
    FROM dbo.SourceTable s
    LEFT JOIN #tempBadCounts b
        ON b.id = s.id
    WHERE s.id BETWEEN @minId AND @maxId

    DECLARE @TIME INT = DATEDIFF(ms, @startTime, GETDATE())
    RAISERROR('Completed %i rows, starting at id %i, in %ims', 0, 1,
        @batchSize, @minId, @TIME) WITH NOWAIT
END
GO

/****************************************************************************
* PERFORMANCE TESTING
****************************************************************************/

SET STATISTICS TIME, IO OFF
GO

-- Perform 10% of the overall update, broken into batches of 1% at a time
-- and measuring the run-time perform (in seconds) of each batch:
-- 1:  204s old, 5.7s new
-- 2:  166s old, 4.8s new
-- 3:  161s old, 5.5s new
-- 4:  169s old, 4.9s new
-- 5:  168s old, 5.2s new
-- 6:  161s old, 6.0s new
-- 7:  135s old, 6.9s new
-- 8:  126s old, 6.5s new
-- 9:  127s old, 5.3s new
-- 10: 129s old, 5.2s new
-- Total: 1546s old, 56s new
DECLARE @minId INT = 1, @batchSize INT = 150000, @batchNum INT = 1
WHILE (@batchNum <= 10)
BEGIN
    EXEC #oldUpdate @minId = @minId, @batchSize = @batchSize
    EXEC #newUpdate @minId = @minId, @batchSize = @batchSize
    SET @minId = @minId + @batchSize
    SET @batchNum = @batchNum + 1
END
GO

/****************************************************************************
* CORRECTNESS TESTING
****************************************************************************/

SELECT COUNT(*) numNames,
    COUNT(CASE WHEN s.bad_count = s.bad_count_new THEN 1 END) numMatching,
    COUNT(CASE WHEN bad_count > 0 THEN 1 END) numWithBadCount
FROM dbo.SourceTable s
WHERE s.id BETWEEN 1 AND 1500000
--numNames  numMatching numWithBadCount
--1500000   1500000     900616
GO

/****************************************************************************
* CLEANUP
****************************************************************************/

DROP TABLE dbo.SourceTable
GO
DROP TABLE dbo.Bad_Phrase
GO
DROP TABLE dbo.Numbers
GO