Y_Less

y_punycode.inc

Apr 29th, 2013
357
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Pawn 13.46 KB | None | 0 0
  1. /**--------------------------------------------------------------------------**\
  2.                     ===================================
  3.                      y_punycode - Character encodings.
  4.                     ===================================
  5. Description:
  6.     Functions for converting unicode strings to and from punycode, to be
  7.     represented in just ASCII characters.  Based on several public
  8.     implementations and the RFC, adapted for PAWN.  For more information see:
  9.    
  10.     https://en.wikipedia.org/wiki/Punycode
  11.    
  12.     Also includes a function that hooks the "HTTP" function to allow for
  13.     internationalised domain names with that function.
  14. Legal:
  15.     Version: MPL 1.1
  16.    
  17.     The contents of this file are subject to the Mozilla Public License Version
  18.     1.1 (the "License"); you may not use this file except in compliance with
  19.     the License. You may obtain a copy of the License at
  20.     http://www.mozilla.org/MPL/
  21.    
  22.     Software distributed under the License is distributed on an "AS IS" basis,
  23.     WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  24.     for the specific language governing rights and limitations under the
  25.     License.
  26.    
  27.     The Original Code is the YSI punycode include.
  28.    
  29.     The Initial Developer of the Original Code is Alex "Y_Less" Cole.
  30.     Portions created by the Initial Developer are Copyright (C) 2011
  31.     the Initial Developer. All Rights Reserved.
  32.    
  33.     Contributors:
  34.         ZeeX, koolk, JoeBullet/Google63, g_aSlice/Slice
  35.    
  36.     Thanks:
  37.         JoeBullet/Google63 - Handy arbitrary ASM jump code using SCTRL.
  38.         ZeeX - Very productive conversations.
  39.         koolk - IsPlayerinAreaEx code.
  40.         TheAlpha - Danish translation.
  41.         breadfish - German translation.
  42.         Fireburn - Dutch translation.
  43.         yom - French translation.
  44.         50p - Polish translation.
  45.         Zamaroht - Spanish translation.
  46.         Dracoblue, sintax, mabako, Xtreme, other coders - Producing other modes
  47.             for me to strive to better.
  48.         Pixels^ - Running XScripters where the idea was born.
  49.         Matite - Pestering me to release it and using it.
  50.    
  51.     Very special thanks to:
  52.         Thiadmer - PAWN, whose limits continue to amaze me!
  53.         Kye/Kalcor - SA:MP.
  54.         SA:MP Team past, present and future - SA:MP.
  55.    
  56. Version:
  57.     0.1
  58. Changelog:
  59.     29/04/13:
  60.         Added Puny_HTTP.
  61.     26/04/13:
  62.         First version.
  63. Functions:
  64.     Public
  65.         -
  66.     Core:
  67.         -
  68.     Stock:
  69.         Puny_Encode - Convert a Unicode string to Punycode.
  70.         Puny_Decode - Convert a Punycode string to Unicode.
  71.         Puny_HTTP - Wrapper for "HTTP" to encode domain names.
  72.     Static:
  73.         -
  74.     Inline:
  75.         -
  76.     API:
  77.         -
  78. Callbacks:
  79.     -
  80. Definitions:
  81.     -
  82. Enums:
  83.     -
  84. Macros:
  85.     -
  86. Tags:
  87.     -
  88. Variables:
  89.     Global:
  90.         -
  91.     Static:
  92.         -
  93. Commands:
  94.     -
  95. Compile options:
  96.     -
  97. Operators:
  98.     -
  99. \**--------------------------------------------------------------------------**/
  100.  
  101. // Because at this point I don't know where to put this file.
  102. #tryinclude "..\y_debug"
  103. #tryinclude "y_debug"
  104. #tryinclude <YSI\y_debug>
  105.  
  106. #include <a_http>
  107.  
  108. #define string:
  109.  
  110. #define PUNY_BASE (36)
  111. #define PUNY_CHAR ('-')
  112. // Some versions use "-1" or "cellmax", the RFC uses "PUNY_BASE".
  113. #define PUNY_INVL PUNY_BASE
  114.  
  115. static stock const
  116.     PUNY_TMIN = 1,
  117.     PUNY_TMAX = 26,
  118.     PUNY_SKEW = 38,
  119.     PUNY_BIAS = 72,
  120.     PUNY_INIT = 128,
  121.     PUNY_DAMP = 700,
  122.     YSI_gscDecoder[128] =
  123.         {
  124.             PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL,
  125.             PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL,
  126.             PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL,
  127.             PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL,
  128.             PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL,
  129.             PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL,
  130.             // '0' - '9'.
  131.             26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
  132.             PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL,
  133.             // 'A' - 'Z'.
  134.              0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
  135.             16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
  136.             PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL,
  137.             // 'a' - 'z'.
  138.              0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
  139.             16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
  140.             PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL, PUNY_INVL
  141.         };
  142.  
  143. /**--------------------------------------------------------------------------**\
  144. <summary>Puny_Decode</summary>
  145. <param name="dst">Where to store the converted string.</param>
  146. <param name="src">The string to convert.</param>
  147. <param name="wlen">The length of the destination.</param>
  148. <param name="delimiter">What character to place between the parts.</param>
  149. <returns>
  150.     -
  151. </returns>
  152. <remarks>
  153.     Takes a unicode string and converts it to punycode.
  154. </remarks>
  155. \**--------------------------------------------------------------------------**/
  156.  
  157. stock Puny_Decode(string:dst[], string:src[], wlen = sizeof (dst), const delimiter = PUNY_CHAR)
  158. {
  159.     new
  160.         rlen = strlen(src),
  161.         basicEnd = rlen;
  162.     while (basicEnd--)
  163.     {
  164.         if (src[basicEnd] == delimiter) break;
  165.     }
  166.     // Enough space for the string, and not empty.
  167.     if (0 < ++basicEnd < wlen)
  168.     {
  169.         // Enough space to store the basic string (and no punycode string).
  170.         dst[0] = '\0',
  171.         strcat(dst, src, basicEnd);
  172.     }
  173.     else
  174.     {
  175.         return dst[0] = '\0', strcat(dst, src, wlen), 1;
  176.     }
  177.     --wlen;
  178.     for (
  179.         new
  180.             n     = PUNY_INIT,
  181.             bias  = PUNY_BIAS,
  182.             delta = 0,
  183.             codePointsWritten = basicEnd - 1,
  184.             pointsRead = basicEnd;
  185.         pointsRead != rlen && codePointsWritten != wlen;
  186.         )
  187.     {
  188.         new
  189.             oldDelta = delta;
  190.         for (new w = 1, k = PUNY_BASE; pointsRead != rlen; k += PUNY_BASE)
  191.         {
  192.             new
  193.                 digit = YSI_gscDecoder[src[pointsRead++]];
  194.             if (digit == PUNY_BASE || digit > (cellmax - delta) / w) return 0;
  195.             delta += digit * w;
  196.             new
  197.                 t = (k <= bias) ? (PUNY_TMIN) : ((k >= bias + PUNY_TMAX) ? (PUNY_TMAX) : (k - bias));
  198.             // Find the end of the current code.
  199.             if (digit < t) break;
  200.             if (w > cellmax / (PUNY_BASE - t)) return 0;
  201.             w *= PUNY_BASE - t;
  202.         }
  203.         bias = Puny_Adapt(delta - oldDelta, ++codePointsWritten, oldDelta == 0);
  204.         if (delta / codePointsWritten > cellmax - n) return 0;
  205.         static
  206.             sTinyString[2];
  207.         n += delta / codePointsWritten,
  208.         delta %= codePointsWritten,
  209.         sTinyString[0] = n,
  210.         strins(dst, sTinyString, delta++, wlen + 1);
  211.     }
  212.     return 1;
  213. }
  214.  
  215. /**--------------------------------------------------------------------------**\
  216. <summary>Puny_Encode</summary>
  217. <param name="dst">Where to store the converted string.</param>
  218. <param name="src">The string to convert.</param>
  219. <param name="wlen">The length of the destination.</param>
  220. <param name="delimiter">What character is between the parts.</param>
  221. <returns>
  222.     -
  223. </returns>
  224. <remarks>
  225.     Takes a punycode string and converts it to unicode.
  226. </remarks>
  227. \**--------------------------------------------------------------------------**/
  228.  
  229. stock Puny_Encode(string:dst[], string:src[], wlen = sizeof (dst), const delimiter = PUNY_CHAR)
  230. {
  231.     new
  232.         widx,
  233.         rlen = strlen(src);
  234.     --wlen;
  235.     for (new ridx = 0; ridx != rlen; ++ridx)
  236.     {
  237.         if ('\0' < src[ridx] <= '~')
  238.         {
  239.             if (widx == wlen) return (dst[widx] = '\0');
  240.             dst[widx++] = src[ridx];
  241.         }
  242.     }
  243.     // Wrote out all the characters.
  244.     if (widx == rlen) return (dst[widx] = '\0'), -1;
  245.     if (widx < wlen) dst[widx++] = delimiter;
  246.     else return (dst[widx] = '\0');
  247.     // Set up punycode variables.
  248.     for (
  249.         new
  250.             n     = PUNY_INIT,
  251.             bias  = PUNY_BIAS,
  252.             delta = 0,
  253.             codePointsWritten = widx - 1,
  254.             basicPointsWritten = widx;
  255.         codePointsWritten < rlen;
  256.         )
  257.     {
  258.         new
  259.             m = cellmax;
  260.         for (new ridx = 0; ridx != rlen; ++ridx)
  261.         {
  262.             if (n <= src[ridx] < m)
  263.             {
  264.                 // Find the lowest Unicode character.
  265.                 m = src[ridx];
  266.             }
  267.         }
  268.         // Make sure the number isn't too big to encode.
  269.         if ((m - n) > (cellmax - delta) / (codePointsWritten + 1)) return (dst[widx] = '\0');
  270.         // More punycode state machine.
  271.         delta += (m - n) * (codePointsWritten + 1),
  272.         n = m;
  273.         for (new ridx = 0; ridx != rlen; ++ridx)
  274.         {
  275.             if (src[ridx] < n)
  276.             {
  277.                 if (++delta == 0) return (dst[widx] = '\0');
  278.             }
  279.             else if (src[ridx] == n)
  280.             {
  281.                 widx += Puny_EncodeVar(bias, delta, dst[widx], wlen - widx),
  282.                 ++codePointsWritten,
  283.                 bias = Puny_Adapt(delta, codePointsWritten, (codePointsWritten == basicPointsWritten)),
  284.                 delta = 0;
  285.             }
  286.         }
  287.         ++n,
  288.         ++delta;
  289.     }
  290.     return (dst[widx] = '\0'), 1;
  291. }
  292.  
  293. /**--------------------------------------------------------------------------**\
  294. <summary>_Puny_Basic</summary>
  295. <param name="num">The single number to encode.</param>
  296. <returns>
  297.     -
  298. </returns>
  299. <remarks>
  300.     Convert a single digit to base 36.
  301. </remarks>
  302. \**--------------------------------------------------------------------------**/
  303.  
  304. #define _Puny_Basic(%0) (((%0) > 25) ? ((%0) + ('0' - 25)) : ((%0) + 'a'))
  305.  
  306. /**--------------------------------------------------------------------------**\
  307. <summary>Puny_EncodeVar</summary>
  308. <param name="bias">Part of the state machine.</param>
  309. <param name="delta">Part of the state machine.</param>
  310. <param name="dst">Array to write to.</param>
  311. <param name="wlen">Size of the array.</param>
  312. <returns>
  313.     -
  314. </returns>
  315. <remarks>
  316.     This is part of how the punycode algorithm encodes numbers as very clever
  317.     strings, but honestly I don't fully understand it!
  318. </remarks>
  319. \**--------------------------------------------------------------------------**/
  320.  
  321. static stock Puny_EncodeVar(bias, delta, dst[], wlen)
  322. {
  323.     new
  324.         i = 0,
  325.         k = PUNY_BASE,
  326.         t;
  327.     while (i < wlen)
  328.     {
  329.         if (k <= bias) t = PUNY_TMIN;
  330.         else if (k >= bias + PUNY_TMAX) t = PUNY_TMAX;
  331.         else t = k - bias;
  332.         // Find the last digit below the threshold.
  333.         if (delta < t) break;
  334.         new
  335.             c = t + (delta - t) % (PUNY_BASE - t);
  336.         dst[i++] = _Puny_Basic(c),
  337.         delta = (delta - t) / (PUNY_BASE - t),
  338.         k += PUNY_BASE;
  339.     }
  340.     if (i < wlen) dst[i++] = _Puny_Basic(delta);
  341.     return i;
  342. }
  343.  
  344. /**--------------------------------------------------------------------------**\
  345. <summary>Puny_Adapt</summary>
  346. <param name="delta">Part of the state machine.</param>
  347. <param name="length">Written string size.</param>
  348. <param name="firstTime">Have special characters already been written?</param>
  349. <returns>
  350.     -
  351. </returns>
  352. <remarks>
  353.     This is part of how the punycode algorithm encodes numbers as very clever
  354.     strings, but honestly I don't fully understand it!
  355. </remarks>
  356. \**--------------------------------------------------------------------------**/
  357.  
  358. static stock Puny_Adapt(delta, length, bool:firstTime)
  359. {
  360.     if (firstTime) delta /= PUNY_DAMP;
  361.     else delta >>>= 1;
  362.     delta += delta / length;
  363.     new
  364.         k = 0;
  365.     while (delta > (PUNY_BASE - PUNY_TMIN) * PUNY_TMAX >> 1)
  366.     {
  367.         delta /= PUNY_BASE - PUNY_TMIN,
  368.         k += PUNY_BASE;
  369.     }
  370.     return k + (PUNY_BASE - PUNY_TMIN + 1) * delta / (delta + PUNY_SKEW);
  371. }
  372.  
  373.  
  374. /**--------------------------------------------------------------------------**\
  375. <summary>Puny_HTTP</summary>
  376. <param name="index">The HTTP reference index.</param>
  377. <param name="type">How the request should be sent.</param>
  378. <param name="url[]">The (internationalised) URL address.</param>
  379. <param name="data[]">The GET/POST data.</param>
  380. <param name="callback[]">Which function to return the data to.</param>
  381. <returns>
  382.     -
  383. </returns>
  384. <remarks>
  385.     Hooks the "HTTP" function.
  386. </remarks>
  387. \**--------------------------------------------------------------------------**/
  388.  
  389. stock Puny_HTTP(index, type, url[], data[], callback[])
  390. {
  391.     static
  392.         sPart[64], // Maximum legal domain part length.
  393.         sEncoded[256]; // Maximum legal hostname length.
  394.     new
  395.         idx = strfind(url, !"://");
  396.     // Skip any prefix.
  397.     if (idx != -1) idx += 2;
  398.     // Add the protocol.
  399.     sEncoded[0] = '\0',
  400.     strcat(sEncoded, url, idx + 2);
  401.     // Encode all parts.
  402.     new
  403.         prev = idx + 1,
  404.         end = strfind(url, !"/", false, prev);
  405.     if (end == -1) end = strlen(url); // Nothing after the main domain.
  406.     do
  407.     {
  408.         // Find the size of one part.
  409.         idx = strfind(url, !".", false, prev);
  410.         // Only encode the domain part.
  411.         if (!(-1 < idx < end)) idx = end;
  412.         static
  413.             ch;
  414.         // There's no length parameter for "Puny_Encode", so we need a limit.
  415.         ch = url[idx],
  416.         url[idx] = sPart[0] = '\0';
  417.         switch (Puny_Encode(sPart, url[prev]))
  418.         {
  419.             // Encoding error.
  420.             case 0: return 0;
  421.             // Encoded something, add the prefix.
  422.             case 1:
  423.             {
  424.                 // The hyphen at the start is the only one - no latin chars.
  425.                 if (sPart[0] == '-' && strfind(sPart, !"-", false, 1) == -1) format(sEncoded, sizeof (sEncoded), "%sxn-%s%c", sEncoded, sPart, ch);
  426.                 else format(sEncoded, sizeof (sEncoded), "%sxn--%s%c", sEncoded, sPart, ch);
  427.                 #if defined _DEBUG
  428.                     #if _DEBUG >= 1
  429.                         static
  430.                             sDecoded[64];
  431.                         Puny_Decode(sDecoded, sPart);
  432.                         P:5("Puny_HTTP Original: \"%s\", Encoded: \"%s\", Decoded: \"%s\"", url[prev], sPart, sDecoded);
  433.                         if (strcmp(url[prev], sDecoded)) P:E("Puny_Decode did not match Puny_Encode");
  434.                     #endif
  435.                 #endif
  436.             }
  437.             // No special characters.
  438.             case -1: format(sEncoded, sizeof (sEncoded), "%s%s%c", sEncoded, sPart, ch);
  439.         }
  440.         // Restore the data.
  441.         url[idx] = ch,
  442.         prev = idx + 1;
  443.     }
  444.     while (idx < end);
  445.     // Add the remainder of the domain.
  446.     if (url[end]) strcat(sEncoded, url[end + 1]);
  447.     #if defined _DEBUG
  448.         P:2("Puny_HTTP Domain: \"%s\" -> \"%s\"", url, sEncoded);
  449.     #endif
  450.     // Call the original "HTTP".
  451.     return HTTP(index, type, sEncoded, data, callback);
  452. }
  453.  
  454. #if defined _ALS_HTTP
  455.     #undef HTTP
  456. #else
  457.  
  458.     native BAD_HTTP(index, type, url[], data[], callback[]) = HTTP;
  459.  
  460.     #define _ALS_HTTP
  461. #endif
  462. #define HTTP Puny_HTTP
  463.  
  464. #undef _Puny_Basic
  465. #undef PUNY_BASE
  466. #undef PUNY_CHAR
  467. #undef PUNY_INVL
Advertisement
Add Comment
Please, Sign In to add comment