SqlQuantumLeap

T-SQL Function (iTVF) to List All Unicode Code Points

Mar 26th, 2019
609
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
T-SQL 5.60 KB | None | 0 0
  1. /*
  2.  -------------------- List All Unicode Code Points -------------------
  3.  
  4. Created By: Solomon Rutzky / Sql Quantum Leap ( https://SqlQuantumLeap.com/ )
  5. Created On: 2019-03-25
  6. Updated On: 2019-04-09
  7. Updated On: 2020-01-13 ~ Add UTF-8 support for all SQL Server versions (does not require "_UTF8" collation!!)
  8.                        ~ Clarify that HTML escape sequences are also used in XML
  9.                        ~ Add more app languages: F#, JavaScript, and Julia
  10.                        ~ Correct "\x" to be "\u" for C#, F#, Java, etc app languages
  11.                        ~ Split "\u" and "\u\u" into 2 columns to be more accurate for app languages
  12.                        ~ Add "\U" column to be more accurate for app languages
  13.                        ~ Fix "Location" URL in header comment
  14.  
  15. Location:          https://pastebin.com/0JvkHu2D
  16. Related blog post: https://sqlquantumleap.com/2019/03/25/ssms-tip-3-easily-access-research-all-unicode-characters-yes-including-emojis-%f0%9f%98%b8/
  17.  
  18. Escape Sequences:  For more details and/or additional languages, please see the following post:
  19.                    https://sqlquantumleap.com/2019/06/26/unicode-escape-sequences-across-various-languages-and-platforms-including-supplementary-characters/
  20. ----------------------------------------------------------------------
  21. */
  22.  
  23.  
  24. USE [tempdb];
  25.  
  26.  
  27. IF (OBJECT_ID(N'dbo.ListAllUnicodeCodePoints') IS NULL)
  28. BEGIN
  29.     RAISERROR(N'Creating placeholder function...', 10, 1) WITH NOWAIT;
  30.     EXEC(N'CREATE FUNCTION dbo.ListAllUnicodeCodePoints()
  31.             RETURNS TABLE AS RETURN SELECT 1 AS [a];');
  32. END;
  33.  
  34.  
  35. -------------------------------------------------------------------------------
  36. --===========================================================================--
  37. -------------------------------------------------------------------------------
  38. GO
  39. ALTER FUNCTION dbo.ListAllUnicodeCodePoints(@EncodeSurrogateCodePointsInUTF8 BIT = 0)
  40. RETURNS TABLE
  41. AS RETURN
  42. WITH nums AS
  43. (
  44.   SELECT TOP (1114111) (ROW_NUMBER() OVER (ORDER BY @@MICROSOFTVERSION) - 1) AS [num]
  45.   FROM   [master].[sys].[all_columns] ac1
  46.   CROSS JOIN [master].[sys].[all_columns] ac2
  47. ), chars AS
  48. (
  49.   SELECT CONVERT(INT, n.[num]) AS [num], -- pass-through
  50.          RIGHT(CONVERT(CHAR(6), CONVERT(BINARY(3), n.[num]), 2),
  51.                CASE WHEN n.[num] > 65535 THEN 5 ELSE 4 END) AS [CodePointHex],
  52.          CONVERT(INT, CASE WHEN n.[num] > 65535 THEN 55232 + (n.[num] / 1024) END) AS [HighSurrogateINT],
  53.          CONVERT(INT, CASE WHEN n.[num] > 65535 THEN 56320 + (n.[num] % 1024) END) AS [LowSurrogateINT]
  54.   FROM   nums n
  55.   WHERE  n.[num] BETWEEN 0x000000 AND 0x014700 -- filter out 925,455
  56.   OR     n.[num] BETWEEN 0x016800 AND 0x030000 -- unmapped code
  57.   OR     n.[num] BETWEEN 0x0E0001 AND 0x0E01EF -- points
  58. )
  59. SELECT
  60.        'U+' + c.[CodePointHex] AS [CodePoint],
  61.        c.[num] AS [CdPntINT],
  62.        '0x' + c.[CodePointHex] AS [CdPntBIN],
  63.        CASE
  64.          WHEN c.[num] > 65535 THEN NCHAR(c.[HighSurrogateINT]) + NCHAR(c.[LowSurrogateINT])
  65.          ELSE NCHAR(c.[num])
  66.        END AS [Char],
  67.        CASE
  68.          WHEN c.[num] > 65535 THEN CONVERT(CHAR(10), CONVERT(BINARY(4),
  69.                                            NCHAR(c.[HighSurrogateINT]) + NCHAR(c.[LowSurrogateINT])), 1)
  70.          ELSE CONVERT(CHAR(6), CONVERT(BINARY(2), NCHAR(c.[num])), 1)
  71.        END AS [UTF-16LE       ],
  72.        '0x' + CASE -- https://rosettacode.org/wiki/UTF-8_encode_and_decode#VBA
  73.                 WHEN c.[num] < 128
  74.                   THEN CONVERT(CHAR(4), CONVERT(BINARY(1), c.[num]), 2)
  75.                 WHEN c.[num] BETWEEN 128 AND 2047
  76.                   THEN CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] / 64) + 192)), 2)
  77.                   +    CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] % 64) + 128)), 2)
  78.                 WHEN (@EncodeSurrogateCodePointsInUTF8 = 0) AND (c.[num] BETWEEN 55296 AND 57343)
  79.                   THEN 'EFBFBD' -- Replacement (U+FFFD) Surrogate Code Points are invalid in UTF-8
  80.                 WHEN c.[num] BETWEEN 2048 AND 65535
  81.                   THEN CONVERT(CHAR(2), CONVERT(BINARY(1), (((c.[num] / 64) / 64) + 224)), 2)
  82.                   +    CONVERT(CHAR(2), CONVERT(BINARY(1), (((c.[num] / 64) % 64) + 128)), 2)
  83.                   +    CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] % 64) + 128)), 2)
  84.                 WHEN c.[num] BETWEEN 65536 AND 1114111
  85.                   THEN CONVERT(CHAR(2), CONVERT(BINARY(1), ((((c.[num] / 64) / 64) / 64) + 240)), 2)
  86.                   +    CONVERT(CHAR(2), CONVERT(BINARY(1), ((((c.[num] / 64) / 64) % 64) + 128)), 2)
  87.                   +    CONVERT(CHAR(2), CONVERT(BINARY(1), (((c.[num] / 64) % 64) + 128)), 2)
  88.                   +    CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] % 64) + 128)), 2)
  89.                 ELSE CONVERT(VARCHAR(15), NULL)
  90.               END AS [UTF-8          ],
  91.        c.[HighSurrogateINT] AS [HighSrgtINT],
  92.        c.[LowSurrogateINT] AS [LowSrgtINT],
  93.        CONVERT(BINARY(2), c.[HighSurrogateINT]) AS [HighSrgtBIN],
  94.        CONVERT(BINARY(2), c.[LowSurrogateINT]) AS [LowSrgtBIN],
  95.        'NCHAR(' + CASE
  96.                     WHEN c.[num] > 65535 THEN CONVERT(CHAR(6), CONVERT(BINARY(2), c.[HighSurrogateINT]), 1)
  97.                       + ') + NCHAR(' + CONVERT(CHAR(6), CONVERT(BINARY(2), c.[LowSurrogateINT]), 1)
  98.                     ELSE CONVERT(CHAR(6), CONVERT(BINARY(2), c.[num]), 1)
  99.                   END + ')' AS [T-SQL                                                  ],
  100.        '&#x' + c.[CodePointHex] + ';' AS [HTML/XML    ],
  101.        CASE
  102.          WHEN c.[num] < 65536 THEN '\u' + CONVERT(CHAR(4), CONVERT(BINARY(2), c.[num]), 2)
  103.          ELSE CONVERT(VARCHAR(10), NULL)
  104.        END AS [C#/F#/C++/Java[Script]]/Julia/?],
  105.        CASE
  106.          WHEN c.[num] > 65535 THEN '\u' + CONVERT(CHAR(4), CONVERT(BINARY(2), c.[HighSurrogateINT]), 2)
  107.            + '\u' + CONVERT(CHAR(4), CONVERT(BINARY(2), c.[LowSurrogateINT]), 2)
  108.          ELSE CONVERT(VARCHAR(15), NULL)
  109.        END AS [C#/F#/Java[Script]]/?],
  110.        '\U' + CONVERT(CHAR(8), CONVERT(BINARY(4), c.[num]), 2) AS [C#/F#/C/C++/Julia/?]
  111. FROM   chars c;
  112. GO
  113. -------------------------------------------------------------------------------
  114. --===========================================================================--
  115. -------------------------------------------------------------------------------
  116.  
  117.  
  118. -------- TEST ---------
  119.  
  120.  
  121. -- List all 188,657 code points:
  122. SELECT cp.*
  123. FROM   dbo.ListAllUnicodeCodePoints(DEFAULT) cp; -- DEFAULT is same as 0
  124.  
  125.  
  126. -- List surrogate code points to show difference in UTF-8 encoding options:
  127. -- (Surrogate code points are invalid in UTF-8 and ideally should not be encoded)
  128. SELECT enc.[CodePoint], enc.[CdPntINT], enc.[CdPntBIN], enc.[Char],
  129.        no_enc.[UTF-8          ] AS [UTF-8 conforming], enc.[UTF-8          ] AS [UTF-8 encoded]
  130. FROM   dbo.ListAllUnicodeCodePoints(0) no_enc -- 0 is same as DEFAULT
  131. INNER JOIN dbo.ListAllUnicodeCodePoints(1) enc
  132.         ON enc.[CdPntINT] = no_enc.[CdPntINT]
  133. WHERE  no_enc.[CdPntINT] BETWEEN 0xD800 AND 0xDFFF;
  134.  
  135.  
  136. -- List some emoji:
  137. SELECT cp.*
  138. FROM   dbo.ListAllUnicodeCodePoints(0) cp
  139. WHERE  cp.[CdPntINT] BETWEEN 0x1F000 AND 0x1F9FF;
  140.  
  141.  
  142. -- List the Tibetan characters, sorted naturally for that language:
  143. SELECT cp.*
  144. FROM   dbo.ListAllUnicodeCodePoints(0) cp
  145. WHERE  cp.[CdPntINT] BETWEEN 0x0F00 AND 0x0FFF -- Tibetan
  146. ORDER BY  cp.[Char] COLLATE Nepali_100_CS_AS;
  147.  
  148.  
  149. -- List characters that are considered the same as "E"
  150. -- (when using Latin1_General_100_CI_AI):
  151. SELECT cp.*
  152. FROM   dbo.ListAllUnicodeCodePoints(0) cp
  153. WHERE  cp.[Char] = N'E' COLLATE Latin1_General_100_CI_AI
  154. ORDER BY  cp.[CdPntINT];
  155. -- 94 rows!!
  156.  
  157.  
  158. -- List characters that have a numeric value between 0 and 10,000
  159. -- (for pre-SQL Server 2017, use Latin1_General_100_CI_AI):
  160. SELECT cp.*
  161. FROM   dbo.ListAllUnicodeCodePoints(0) cp
  162. WHERE  cp.[Char] LIKE N'%[0-' + NCHAR(0x2182) + N']%' COLLATE Japanese_XJIS_140_CI_AI--Latin1_General_100_CI_AI
  163. ORDER BY  cp.[Char] COLLATE Japanese_XJIS_140_CI_AI--Latin1_General_100_CI_AI
  164. -- 752 rows!! (for Japanese_XJIS_140_CI_AI)
  165. -- 550 rows!! (for Latin1_General_100_CI_AI)
  166.  
  167.  
  168.  
  169. -- Can be used in SSMS a as a keyboard Query Shortcut (only a single line is allowed):
  170. SELECT cp.* FROM dbo.ListAllUnicodeCodePoints(0) cp; RETURN;
  171. -- See blog post (linked in the header comment bloc) for details.
  172.  
  173.  
  174.  
  175. -- DROP FUNCTION dbo.ListAllUnicodeCodePoints;
Add Comment
Please, Sign In to add comment