Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- This script relates to the following DBA.StackExchange answer:
- http://dba.stackexchange.com/questions/131759/is-there-a-sql-server-implementation-of-the-longest-common-substring-problem/131766#131766
- This script provides several performance / scalability tests for the [LongestCommonSubstring] User-Defined Aggregate (UDA).
- A T-SQL installation script (no external DLL) containing only the User-Defined Aggregate (UDA) is located at:
- http://pastebin.com/wnLwT1GM
- The source code for the [LongestCommonSubstring] User-Defined Aggregate (UDA) is located at:
- http://pastebin.com/ceDypBp0
- Date: 2016-03-17
- Version: 1.0.3
- For more functions like this, please visit: http://SQLsharp.com
- Copyright (c) 2016 Sql Quantum Leap. All rights reserved.
- http://www.SqlQuantumLeap.com
- */
- /* -- CLEAN UP (if needed)
- USE [ {your_database_name} ];
- */
- SET ANSI_NULLS, ANSI_PADDING, ANSI_WARNINGS, ARITHABORT, CONCAT_NULL_YIELDS_NULL, QUOTED_IDENTIFIER ON;
- SET NOCOUNT ON;
- GO
- USE [ {your_database_name} ];
- GO
- -------------------------------------------------------------------------------------
- -- Original @MisterMagoo Test
- -------------------------------------------------------------------------------------
- /* -- Highlight and execute the following once
- IF (OBJECT_ID('tempdb..#ShortStrings') IS NOT NULL)
- BEGIN
- PRINT N'Dropping #ShortStrings table...';
- DROP TABLE #ShortStrings;
- END;
- PRINT N'Creating and Populating #ShortStrings table...';
- CREATE TABLE #ShortStrings
- (
- [ShortStringsID] INT NOT NULL IDENTITY(1, 1) PRIMARY KEY,
- [String] sysname COLLATE DATABASE_DEFAULT NULL
- );
- INSERT INTO #ShortStrings ([String])
- SELECT sac.[Name] AS [String]
- FROM master.sys.all_columns sac
- CROSS JOIN master.sys.indexes si1
- CROSS JOIN master.sys.indexes si2
- WHERE sac.[Name] LIKE N'%refer%';
- -- 1,213,488 rows on SQL Server 2014
- SELECT COUNT(*) FROM #ShortStrings;
- */
- /* Any NULLs mean there is not a "longest common string" */
- IF (EXISTS(SELECT 1 FROM #ShortStrings WHERE [String] IS NULL))
- BEGIN
- SELECT NULL AS [Longest Common Substrings];
- RETURN;
- END;
- /* We need to know number of rows in the sample and the length of the shortest string
- as the longest common substring cannot be longer than the shortest string in the set */
- DECLARE @TotalRows INT,
- @MinLen TINYINT,
- @MatchesFound INT = 0;
- SELECT @MinLen = MIN(LEN([String])),
- @TotalRows = COUNT(DISTINCT [String])
- FROM #ShortStrings;
- RAISERROR(N'Maximum Possible Length: %d; Total Distinct Rows: %d', 0, 0, @MinLen, @TotalRows) WITH NOWAIT;
- DECLARE @Matches TABLE
- (
- [String] sysname COLLATE DATABASE_DEFAULT NULL
- );
- /* Check backwards from the longest possible string to the shortest and break when we find a match */
- /* You might want to check the air conditioner is switched on here */
- WHILE (@MinLen > 0 AND @MatchesFound = 0)
- BEGIN
- RAISERROR(N'Processing strings of length: %d', 0, 0, @MinLen) WITH NOWAIT;
- /* this method is "brute force"
- 1. find all substrings for each input string
- 2. pick the first substring that appears in every input string
- we find this by grouping, counting and comparing to the number of input strings
- */
- INSERT INTO @Matches ([String])
- SELECT [Match]
- FROM (
- SELECT [String], SUBSTRING([String], T.[N], @MinLen) AS [Match]
- FROM #ShortStrings
- CROSS APPLY ( SELECT LEN([String]) - @MinLen + 1 ) a(L)
- CROSS APPLY (
- SELECT TOP(a.[L]) V.[N]
- FROM (
- VALUES(1),(2),(3),(4),(5),(6),(7),(8),(9),(10),(11),(12),(13),(14),(15),(16),(17),
- (18),(19),(20),(21),(22),(23),(24),(25),(26),(27),(28),(29),(30),(31),(32),(33),(34),
- (35),(36),(37),(38),(39),(40),(41),(42),(43),(44),(45),(46),(47),(48),(49),(50)
- ) V(N)
- ) T(N)
- ) Matches(String, Match)
- GROUP BY [Match]
- HAVING COUNT(DISTINCT [String]) = @TotalRows;
- SET @MatchesFound = @@ROWCOUNT;
- /* Decrement so next time we check for a shorter match */
- SET @MinLen = (@MinLen - 1);
- END;
- /* Display the results */
- SELECT [String] AS [Longest Common Substrings]
- FROM @Matches;
- -- Longest Common Substring: referen
- -- 1:13 (across 3 successive executions AFTER the initial execution)
- -----------------
- SELECT dbo.LongestCommonSubstring([String], 0)
- FROM #ShortStrings;
- -- <Items Merged="False"><Item>referen</Item></Items>
- -- 0:12 (across 3 successive executions AFTER the initial execution)
- SELECT dbo.LongestCommonSubstring([String], 1)
- FROM #ShortStrings;
- -- 0:12 (across 3 successive executions AFTER the initial execution)
- /*
- <Items Merged="False">
- <Item IsLongest="True">referen</Item>
- <Item IsLongest="False">refere</Item>
- <Item IsLongest="False">eferen</Item>
- <Item IsLongest="False">refer</Item>
- <Item IsLongest="False">efere</Item>
- <Item IsLongest="False">feren</Item>
- <Item IsLongest="False">refe</Item>
- <Item IsLongest="False">efer</Item>
- <Item IsLongest="False">fere</Item>
- <Item IsLongest="False">eren</Item>
- <Item IsLongest="False">ref</Item>
- <Item IsLongest="False">efe</Item>
- <Item IsLongest="False">fer</Item>
- <Item IsLongest="False">ere</Item>
- <Item IsLongest="False">ren</Item>
- <Item IsLongest="False">re</Item>
- <Item IsLongest="False">ef</Item>
- <Item IsLongest="False">fe</Item>
- <Item IsLongest="False">er</Item>
- <Item IsLongest="False">en</Item>
- <Item IsLongest="False">r</Item>
- <Item IsLongest="False">e</Item>
- <Item IsLongest="False">f</Item>
- <Item IsLongest="False">n</Item>
- <Item IsLongest="False">c</Item>
- <Item IsLongest="False">_</Item>
- </Items>
- */
- GO
- -------------------------------------------------------------------------------------
- -- Modified @MisterMagoo Test:
- -- Same code and base data, but add to each row: a tie-for-first-place
- -- substring, a 4 character common substring, and 3 random characters.
- -------------------------------------------------------------------------------------
- /* -- Highlight and execute the following once
- IF (OBJECT_ID('tempdb..#MultipleMatches') IS NOT NULL)
- BEGIN
- PRINT N'Dropping #MultipleMatches table...';
- DROP TABLE #MultipleMatches;
- END;
- PRINT N'Creating and Populating #MultipleMatches table...';
- CREATE TABLE #MultipleMatches
- (
- [MultipleMatchesID] INT NOT NULL IDENTITY(1, 1) PRIMARY KEY,
- [String] sysname COLLATE DATABASE_DEFAULT NULL
- );
- INSERT INTO #MultipleMatches ([String])
- SELECT ss.[String] + '1234567' + CONVERT(VARCHAR(5), CRYPT_GEN_RANDOM(3)) + '_Yo!' AS [String]
- FROM #ShortStrings ss
- -- 1,213,488 rows on SQL Server 2014
- SELECT COUNT(*) FROM #MultipleMatches;
- */
- /* Any NULLs mean there is not a "longest common string" */
- IF (EXISTS(SELECT 1 FROM #MultipleMatches WHERE [String] IS NULL))
- BEGIN
- SELECT NULL AS [Longest Common Substrings];
- RETURN;
- END;
- /* We need to know number of rows in the sample and the length of the shortest string
- as the longest common substring cannot be longer than the shortest string in the set */
- DECLARE @TotalRows INT,
- @MinLen TINYINT,
- @MatchesFound INT = 0;
- SELECT @MinLen = MIN(LEN([String])),
- @TotalRows = COUNT(DISTINCT [String])
- FROM #MultipleMatches;
- RAISERROR(N'Maximum Possible Length: %d; Total Distinct Rows: %d', 0, 0, @MinLen, @TotalRows) WITH NOWAIT;
- DECLARE @Matches TABLE
- (
- [String] sysname COLLATE DATABASE_DEFAULT NULL
- );
- /* Check backwards from the longest possible string to the shortest and break when we find a match */
- /* You might want to check the air conditioner is switched on here */
- WHILE (@MinLen > 0 AND @MatchesFound = 0)
- BEGIN
- RAISERROR(N'Processing strings of length: %d', 0, 0, @MinLen) WITH NOWAIT;
- /* this method is "brute force"
- 1. find all substrings for each input string
- 2. pick the first substring that appears in every input string
- we find this by grouping, counting and comparing to the number of input strings
- */
- INSERT INTO @Matches ([String])
- SELECT [Match]
- FROM (
- SELECT [String], SUBSTRING([String], T.[N], @MinLen) AS [Match]
- FROM #MultipleMatches
- CROSS APPLY ( SELECT LEN([String]) - @MinLen + 1 ) a(L)
- CROSS APPLY (
- SELECT TOP(a.[L]) V.[N]
- FROM (
- VALUES(1),(2),(3),(4),(5),(6),(7),(8),(9),(10),(11),(12),(13),(14),(15),(16),(17),
- (18),(19),(20),(21),(22),(23),(24),(25),(26),(27),(28),(29),(30),(31),(32),(33),(34),
- (35),(36),(37),(38),(39),(40),(41),(42),(43),(44),(45),(46),(47),(48),(49),(50)
- ) V(N)
- ) T(N)
- ) Matches(String, Match)
- GROUP BY [Match]
- HAVING COUNT(DISTINCT [String]) = @TotalRows;
- SET @MatchesFound = @@ROWCOUNT;
- /* Decrement so next time we check for a shorter match */
- SET @MinLen = (@MinLen - 1);
- END;
- /* Display the results */
- SELECT [String] AS [Longest Common Substrings]
- FROM @Matches;
- -- ?? never returned
- -- ?? killed after 23 minutes. Only process strings of lengths: 27, 26, and 25
- -----------------
- SELECT dbo.LongestCommonSubstring([String], 0)
- FROM #MultipleMatches;
- -- <Items Merged="False"><Item>referen</Item><Item>1234567</Item></Items>
- -- 1:09 (across 3 successive executions AFTER the initial execution)
- SELECT dbo.LongestCommonSubstring([String], 1)
- FROM #MultipleMatches;
- -- 1:10 (across 3 successive executions AFTER the initial execution)
- /*
- <Items Merged="False">
- <Item IsLongest="True">referen</Item>
- <Item IsLongest="True">1234567</Item>
- <Item IsLongest="False">refere</Item>
- <Item IsLongest="False">eferen</Item>
- <Item IsLongest="False">123456</Item>
- <Item IsLongest="False">234567</Item>
- <Item IsLongest="False">refer</Item>
- <Item IsLongest="False">efere</Item>
- <Item IsLongest="False">feren</Item>
- <Item IsLongest="False">12345</Item>
- <Item IsLongest="False">23456</Item>
- <Item IsLongest="False">34567</Item>
- <Item IsLongest="False">refe</Item>
- <Item IsLongest="False">efer</Item>
- <Item IsLongest="False">fere</Item>
- <Item IsLongest="False">eren</Item>
- <Item IsLongest="False">1234</Item>
- <Item IsLongest="False">2345</Item>
- <Item IsLongest="False">3456</Item>
- <Item IsLongest="False">4567</Item>
- <Item IsLongest="False">_Yo!</Item>
- <Item IsLongest="False">ref</Item>
- <Item IsLongest="False">efe</Item>
- <Item IsLongest="False">fer</Item>
- <Item IsLongest="False">ere</Item>
- <Item IsLongest="False">ren</Item>
- <Item IsLongest="False">123</Item>
- <Item IsLongest="False">234</Item>
- <Item IsLongest="False">345</Item>
- <Item IsLongest="False">456</Item>
- <Item IsLongest="False">567</Item>
- <Item IsLongest="False">_Yo</Item>
- <Item IsLongest="False">Yo!</Item>
- <Item IsLongest="False">re</Item>
- <Item IsLongest="False">ef</Item>
- <Item IsLongest="False">fe</Item>
- <Item IsLongest="False">er</Item>
- <Item IsLongest="False">en</Item>
- <Item IsLongest="False">12</Item>
- <Item IsLongest="False">23</Item>
- <Item IsLongest="False">34</Item>
- <Item IsLongest="False">45</Item>
- <Item IsLongest="False">56</Item>
- <Item IsLongest="False">67</Item>
- <Item IsLongest="False">_Y</Item>
- <Item IsLongest="False">Yo</Item>
- <Item IsLongest="False">o!</Item>
- <Item IsLongest="False">r</Item>
- <Item IsLongest="False">e</Item>
- <Item IsLongest="False">f</Item>
- <Item IsLongest="False">n</Item>
- <Item IsLongest="False">c</Item>
- <Item IsLongest="False">_</Item>
- <Item IsLongest="False">1</Item>
- <Item IsLongest="False">2</Item>
- <Item IsLongest="False">3</Item>
- <Item IsLongest="False">4</Item>
- <Item IsLongest="False">5</Item>
- <Item IsLongest="False">6</Item>
- <Item IsLongest="False">7</Item>
- <Item IsLongest="False">Y</Item>
- <Item IsLongest="False">o</Item>
- <Item IsLongest="False">!</Item>
- </Items>
- */
- GO
- -------------------------------------------------------------------------------------
- -- Solomonified @MisterMagoo Technique:
- -- Same data as "Modifed @MisterMagoo Test", but attempt to extract
- -- substrings from 2 entries into a temp table, then find which ones
- -- are present across all rows, then return the longest one(s).
- -------------------------------------------------------------------------------------
- DECLARE @ReturnAllSubstrings BIT = 0;
- IF (OBJECT_ID('tempdb..#Substrings') IS NOT NULL)
- BEGIN
- PRINT N'Dropping #Substrings table...';
- DROP TABLE #Substrings;
- END;
- PRINT N'Creating #Substrings table...';
- CREATE TABLE #Substrings
- (
- [SubstringsID] INT NOT NULL IDENTITY(1, 1) PRIMARY KEY,
- [String] sysname COLLATE DATABASE_DEFAULT NULL
- );
- /* Any NULLs mean there is not a "longest common string" */
- IF (EXISTS(SELECT 1 FROM #MultipleMatches WHERE [String] IS NULL))
- BEGIN
- RETURN;
- END;
- /* We need to know number of rows in the sample and the length of the shortest string
- as the longest common substring cannot be longer than the shortest string in the set */
- DECLARE @TotalRows INT,
- @MinLen TINYINT,
- @MatchesFound INT = 0;
- SELECT @MinLen = MIN(LEN([String])),
- @TotalRows = COUNT(*)
- FROM #MultipleMatches;
- RAISERROR(N'Maximum Possible Length: %d; Total Distinct Rows: %d', 0, 0, @MinLen, @TotalRows) WITH NOWAIT;
- /* Check backwards from the longest possible string to the shortest and break when we find a match */
- DECLARE @SampleSet TABLE
- (
- [String] sysname COLLATE DATABASE_DEFAULT NULL
- );
- INSERT INTO @SampleSet ([String])
- SELECT TOP (2) [String]
- FROM #MultipleMatches
- GROUP BY [String]
- ORDER BY LEN([String]) ASC;
- WHILE (@MinLen > 0)
- BEGIN
- RAISERROR(N'Processing strings of length: %d', 0, 0, @MinLen) WITH NOWAIT;
- /* this method is "brute force"
- 1. find all substrings for each input string
- 2. pick the first substring that appears in every input string
- we find this by grouping, counting and comparing to the number of input strings
- */
- INSERT INTO #Substrings ([String])
- SELECT [Match]
- FROM (
- SELECT [String], SUBSTRING([String], T.[N], @MinLen) AS [Match]
- FROM @SampleSet
- CROSS APPLY ( SELECT LEN([String]) - @MinLen + 1 ) a(L)
- CROSS APPLY (
- SELECT TOP(a.[L]) V.[N]
- FROM (
- VALUES(1),(2),(3),(4),(5),(6),(7),(8),(9),(10),(11),(12),(13),(14),(15),(16),(17),
- (18),(19),(20),(21),(22),(23),(24),(25),(26),(27),(28),(29),(30),(31),(32),(33),(34),
- (35),(36),(37),(38),(39),(40),(41),(42),(43),(44),(45),(46),(47),(48),(49),(50)
- ) V(N)
- ) T(N)
- ) Matches(String, Match)
- GROUP BY [Match]
- HAVING COUNT(DISTINCT [String]) = 2;
- SET @MatchesFound = @@ROWCOUNT;
- /* Decrement so next time we check for a shorter match */
- SET @MinLen = (@MinLen - 1);
- END;
- --SELECT * FROM #Substrings ORDER BY LEN([String]) DESC, SubstringsID ASC;
- DECLARE @Matches TABLE
- (
- [String] sysname COLLATE DATABASE_DEFAULT NULL,
- [IsLongest] BIT NOT NULL
- );
- DECLARE @Test sysname,
- @MaxLength INT,
- @CurrentLength INT;
- DECLARE tests CURSOR STATIC LOCAL READ_ONLY FORWARD_ONLY
- FOR SELECT [String]
- FROM #Substrings
- ORDER BY LEN([String]) DESC;--, [SubstringsID] ASC -- ??
- OPEN tests;
- FETCH NEXT
- FROM tests
- INTO @Test;
- WHILE (@@FETCH_STATUS = 0)
- BEGIN
- SET @CurrentLength = LEN(@Test);
- RAISERROR(N'Testing substring: %s', 0, 0, @Test) WITH NOWAIT;
- -- @MaxLength will remain NULL until the first common substring is found
- IF ((@ReturnAllSubstrings = 0) AND (@CurrentLength < @MaxLength))
- BEGIN
- BREAK;
- END;
- IF (NOT EXISTS(
- SELECT *
- FROM #MultipleMatches mm
- WHERE CHARINDEX(@Test, mm.[String]) = 0
- )
- )
- BEGIN
- SET @MaxLength = ISNULL(@MaxLength, @CurrentLength);
- INSERT INTO @Matches ([String], [IsLongest])
- VALUES (@Test, IIF(@CurrentLength = @MaxLength, 1, 0));
- RAISERROR(N'Found match: %s; Length: %d', 0, 0, @Test, @CurrentLength) WITH NOWAIT;
- END;
- FETCH NEXT
- FROM tests
- INTO @Test;
- END;
- CLOSE tests;
- DEALLOCATE tests;
- /* Display the results */
- SELECT [String], [IsLongest]
- FROM @Matches;
- -- 0:22 across several executions IF @ReturnAllSubstrings = 0
- -- 2:41 across several executions IF @ReturnAllSubstrings = 1
- /*
- String IsLongest
- 1234567 1
- referen 1
- 123456 0
- 234567 0
- eferen 0
- refere 0
- 12345 0
- 23456 0
- 34567 0
- efere 0
- feren 0
- refer 0
- _Yo! 0
- 1234 0
- 2345 0
- 3456 0
- 4567 0
- efer 0
- eren 0
- fere 0
- refe 0
- _Yo 0
- 123 0
- 234 0
- 345 0
- 456 0
- 567 0
- efe 0
- ere 0
- fer 0
- ref 0
- ren 0
- Yo! 0
- _Y 0
- 12 0
- 23 0
- 34 0
- 45 0
- 56 0
- 67 0
- ef 0
- en 0
- er 0
- fe 0
- o! 0
- re 0
- Yo 0
- ! 0
- _ 0
- 1 0
- 2 0
- 3 0
- 4 0
- 5 0
- 6 0
- 7 0
- c 0
- e 0
- f 0
- n 0
- o 0
- r 0
- Y 0
- */
- -------------------------------------------------------------------------------------
- /* -- related research queries:
- SELECT MAX(LEN([String])) FROM #ShortStrings;
- SELECT DISTINCT [String] FROM #ShortStrings;
- SELECT * FROM sys.assemblies;
- SELECT * FROM sys.dm_clr_loaded_assemblies;
- SELECT CONVERT(BIT, 'True');
- SELECT '1234567' + CONVERT(VARCHAR(5), CRYPT_GEN_RANDOM(3)) + '_Yo!'
- */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement