Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- -- This SQL pertains to the following DBA.StackExchange answer:
- -- http://dba.stackexchange.com/questions/122612/how-to-find-values-with-multiple-consecutive-upper-case-characters/122625#122625
- SET NOCOUNT ON;
- DECLARE @SampleData TABLE (Name NVARCHAR(100) COLLATE SQL_Latin1_General_CP1_CI_AS NOT NULL);
- INSERT INTO @SampleData (Name) VALUES (N'mr john smith'); -- no match #1
- INSERT INTO @SampleData (Name) VALUES (N'Mr John Smith'); -- no match #2
- INSERT INTO @SampleData (Name) VALUES (N'Mr JOHN Smith');
- INSERT INTO @SampleData (Name) VALUES (N'MR John Smith');
- INSERT INTO @SampleData (Name) VALUES (N'Mr John SMITH');
- INSERT INTO @SampleData (Name) VALUES (N'Mr JOhn Smith');
- INSERT INTO @SampleData (Name) VALUES (N'Mr ÜLala Jones');
- INSERT INTO @SampleData (Name) VALUES (N'Mr üLala Jones'); -- no match #3
- INSERT INTO @SampleData (Name) VALUES (N'Mr U' + NCHAR(0x0308) + N'Lala Jonez');
- INSERT INTO @SampleData (Name) VALUES (N'Mr U' + NCHAR(0x0308) + NCHAR(0x0309) + NCHAR(0x0308) + N'Lala Jonezzz');
- INSERT INTO @SampleData (Name) VALUES (N'Mr ULala Jonah'); -- FULLWIDTH LATIN CAPITAL LETTER U ( http://unicode.org/cldr/utility/character.jsp?a=FF35 )
- INSERT INTO @SampleData (Name) VALUES (N'Mr ÖLala Jonah'); -- U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS
- INSERT INTO @SampleData (Name) VALUES (N'Mr ѮLala Jonah'); -- U+046E CYRILLIC CAPITAL LETTER KSI
- INSERT INTO @SampleData (Name) VALUES (N'Mr ⓊLala Jonah'); -- U+24CA CIRCLED LATIN CAPITAL LETTER U (noteable exception since Unicode category is Other_Symbol instead of Uppercase_Letter) ( http://unicode.org/cldr/utility/character.jsp?a=24CA )
- -- Case-Sensitive collation pulls back everything when using range syntax: [A-Z]
- SELECT Name AS [Case-Sensitive_Range]
- FROM @SampleData
- WHERE Name COLLATE Latin1_General_100_CS_AS LIKE N'%[A-Z][A-Z]%';
- -- Binary collation filters too much:
- -- Ü isn't in A-Z
- -- U + NCHAR(0x0308) puts NCHAR(0x0308) between the "U" and any upper-case letter that follows.
- SELECT Name AS [Binary_Range]
- FROM @SampleData
- WHERE Name COLLATE Latin1_General_100_BIN2 LIKE N'%[A-Z][A-Z]%';
- -- Case-Sensitive collation pulls back just the upper-case when specifying each letter,
- -- but can't see variations with accents or letters not in the US-English alphabet.
- -- However, it does find the wide character "U" that a Binary collation would exclude.
- SELECT Name AS [Case-Sensitive_Individual]
- FROM @SampleData
- WHERE Name COLLATE Latin1_General_100_CS_AS LIKE N'%[ABCDEFGHIJKLMNOPQRSTUVWXYZ][ABCDEFGHIJKLMNOPQRSTUVWXYZ]%';
- -- Case-Sensitive collation with additional non-US-English letters pulls back more than before:
- -- it finds the decomposed character "U + NCHAR(0x0308)" that a Binary collation would exclude.
- -- However, it is not always intuitively obvious since similar looking characters that are different
- -- are treated as being different: the "Ӧ" in the LIKE is U+04E6 CYRILLIC CAPITAL LETTER O WITH DIAERESIS
- -- whereas the "Ö" in the test name is U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS.
- -- It also does not find U + NCHAR(0x0308) + NCHAR(0x0309) + NCHAR(0x0308) since that doesn't match any
- -- precomposed (or composite) character that can be added to the LIKE definition.
- SELECT Name AS [Case-Sensitive_Individual_AdditionalLetters]
- FROM @SampleData
- WHERE Name COLLATE Latin1_General_100_CS_AS LIKE N'%[ABCDEFGHIJKLMNOPQRSTUVWXYZÜӦ][ABCDEFGHIJKLMNOPQRSTUVWXYZÜӦ]%';
- -- Case-Sensitive but Accent-INsensitive collation, even without the additional letters, pulls
- -- back even more, including both "CYRILLIC CAPITAL LETTER O WITH DIAERESIS" and
- -- "LATIN CAPITAL LETTER O WITH DIAERESIS", and now also U + NCHAR(0x0308) + NCHAR(0x0309) + NCHAR(0x0308),
- -- since only the base letter matters now that accent marks are being ignored.
- -- However, it cannot find the "Ѯ", which is U+046E CYRILLIC CAPITAL LETTER KSI, since it
- -- is not in the LIKE definition.
- SELECT Name AS [Case-Sensitive_Accent-INsensitive_Individual_NoAdditionalLetters]
- FROM @SampleData
- WHERE Name COLLATE Latin1_General_100_CS_AI LIKE N'%[ABCDEFGHIJKLMNOPQRSTUVWXYZ][ABCDEFGHIJKLMNOPQRSTUVWXYZ]%'; -- U+04E6 CYRILLIC CAPITAL LETTER O WITH DIAERESIS
- --==================================================================================
- --
- -- The following two examples make use of a SQLCLR Scalar Function available in the
- -- Free version of SQL# ( http://www.SQLsharp.com )
- --
- --=====================================================
- -- Regular Expressions to the rescue:
- SELECT Name AS [RegEx_AllButCircledLetters]
- FROM @SampleData
- WHERE SQL#.RegEx_IsMatch4k(Name, N'(?:\p{Lu}\p{M}*){2}', 1, '') = 1
- -- There are some that are not matched by \p{Lu}, such as: Ⓐ - Ⓩ (U+24B6 - U+24CF).
- -- Not a problem, we will just add in an "or" condition (i.e. "|") and specify that range.
- -- Diacritical marks cannot be added to the "circled letters" so the "or" is between the
- -- range and the "upper-case plus zero-or-more diacritical marks".
- SELECT Name AS [OneRegExToMatchThemAll]
- FROM @SampleData
- WHERE SQL#.RegEx_IsMatch4k(Name, N'(?:[\u24B6-\u24CF]|\p{Lu}\p{M}*){2}', 1, '') = 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement